In [1]:
import pandas as pd

In [2]:
with open("../data/conll003-englishversion/train/train.txt", 'r', encoding='utf-8') as file:
    data = file.read()

In [13]:
with open("../data/conll003-englishversion/test/test.txt", 'r', encoding='utf-8') as file:
    data = file.read()

In [3]:
def parse_conll(data_string):
    data = []
    sentence_id = 0
    
    lines = data_string.strip().split('\n')
    
    for line in lines:
        line = line.strip()
        
        if not line:
            sentence_id += 1
            continue
            
        if line.startswith("-DOCSTART-"):
            continue
            
        parts = line.split()
        
        if len(parts) >= 4:
            data.append({
                'sentence_id': sentence_id,
                'word': parts[0],
                'pos_tag': parts[1],   
                'chunk_tag': parts[2], 
                'ner_tag': parts[3]    
            })
            
    return pd.DataFrame(data)


df = parse_conll(data)
print(df.head(10))

   sentence_id     word pos_tag chunk_tag ner_tag
0            1       EU     NNP      B-NP   B-ORG
1            1  rejects     VBZ      B-VP       O
2            1   German      JJ      B-NP  B-MISC
3            1     call      NN      I-NP       O
4            1       to      TO      B-VP       O
5            1  boycott      VB      I-VP       O
6            1  British      JJ      B-NP  B-MISC
7            1     lamb      NN      I-NP       O
8            1        .       .         O       O
9            2    Peter     NNP      B-NP   B-PER


In [4]:
df['ner_tag'].unique()

array(['B-ORG', 'O', 'B-MISC', 'B-PER', 'I-PER', 'B-LOC', 'I-ORG',
       'I-MISC', 'I-LOC'], dtype=object)

In [8]:
df.to_csv('../data/conll003-englishversion/train/train.csv')

In [17]:
df.to_csv('../data/conll003-englishversion/test/test.csv')

In [5]:
def format_data_for_llm(df: pd.DataFrame) -> list[dict]:
    sentences = []
    
    grouped = df.groupby('sentence_id')
    
    for _, group in grouped:
        words = list(group['word'])
        tags = list(group['ner_tag'])
        
        sentences.append({
            'words': words,
            'tags': tags
        })
        
    return sentences

In [8]:

llm_data = format_data_for_llm(df)
llm_data[0]

{'words': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'tags': ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']}

In [20]:
data_for_df = []

for i, sentence_dict in enumerate(llm_data):
    joined_sentence = " ".join(sentence_dict['words'])
    joined_tags = " ".join(sentence_dict['tags'])
    
    data_for_df.append({
        'sentence_id': i + 1,
        'sentence_text': joined_sentence,
        'ner_tags_str': joined_tags,
    })

df_sentences = pd.DataFrame(data_for_df)
df_sentences

Unnamed: 0,sentence_id,sentence_text,ner_tags_str
0,1,"SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRI...",O O B-LOC O O O O B-PER O O O O
1,2,Nadim Ladki,B-PER I-PER
2,3,"AL-AIN , United Arab Emirates 1996-12-06",B-LOC O B-LOC I-LOC I-LOC O
3,4,Japan began the defence of their Asian Cup tit...,B-LOC O O O O O B-MISC I-MISC O O O O O O O B-...
4,5,But China saw their luck desert them in the se...,O B-LOC O O O O O O O O O O O O O O O O O O O ...
...,...,...,...
3448,3449,That is why this is so emotional a night for m...,O O O O O O O O O O O O O B-PER O O
3449,3450,""" It was the joy that we all had over the peri...",O O O O O O O O O O O O O O O O O O O O O O O ...
3450,3451,"Charlton managed Ireland for 93 matches , duri...",B-PER O B-LOC O O O O O O O O O O O O O O O O ...
3451,3452,He guided Ireland to two successive World Cup ...,O O B-LOC O O O B-MISC I-MISC O O O O O O B-MI...


In [12]:
df_sentences.to_csv('train_for_llm.csv')

In [21]:
df_sentences.to_csv('test_for_llm.csv')

In [24]:
df_test = pd.read_csv('../data/conll003-englishversion/test/test_for_llm.csv')
df_train = pd.read_csv('../data/conll003-englishversion/train/train_for_llm.csv')


In [25]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14041 entries, 0 to 14040
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Unnamed: 0     14041 non-null  int64 
 1   sentence_id    14041 non-null  int64 
 2   sentence_text  14041 non-null  object
 3   ner_tags_str   14041 non-null  object
dtypes: int64(2), object(2)
memory usage: 438.9+ KB
