In [4]:
import os
import re

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

### Data preparation for model training

In [62]:
# First look at the raw export, parsed with pandas' default settings.
data = pd.read_csv(filepath_or_buffer='keywords_dataset.csv')
data.head(n=5)

Unnamed: 0,Sentence #,Word,Tag;
0,112,Gebäudevermessung,B-KEY;
1,112,von,O;
2,112,Mehrfamilienhäusern,O;
3,"112,"","",O;",,
4,112,Einkaufszentren,O;


Some rows, like the one at index 3 (_112,",",O;_), weren't parsed correctly: the whole record ended up in the first column. Therefore, we need to fix such rows in the source data first.

In [135]:
# Repair the raw CSV export as plain text so that pandas can parse every row.
with open('keywords_dataset.csv', 'r', encoding='utf8') as fin:
    source_data = fin.read()

# Rows whose word had to be quoted were exported as ONE quoted field, e.g.
#     "112,"","",O";   ->   112,",",O;
# The tag group accepts hyphens so that rows tagged like B-KEY are repaired too.
fixed_data = re.sub(r'"(\d+),""([^"]+)"",([\w-]+)";', r'\1,"\2",\3;', source_data)
# Clean up junk characters (zero-width space, U+0097 control char, bullet).
# This runs before the semicolon step so that a junk character sitting after
# the trailing semicolon cannot hide it from the end-of-line anchor.
fixed_data = re.sub(r'[\u200b\x97•]', '', fixed_data)
# Every record ends with a semicolon that is not part of the label. Strip only
# that trailing one: deleting all semicolons would also wipe out ';' tokens in
# the Word column and leave an empty field behind.
fixed_data = re.sub(r';[ \t]*$', '', fixed_data, flags=re.MULTILINE)
spl_data = fixed_data.split('\n')

with open('keywords_dataset_fixed.csv', 'w', encoding='utf8') as fout:
    fout.writelines(line + '\n' for line in spl_data)

Now let's re-read the data with pandas and prepare it for model training.

In [18]:
# Re-read the repaired file. By default pandas turns strings such as "null",
# "NA", "None" or "nan" into NaN; those can be legitimate word tokens here, so
# only a genuinely empty field is treated as missing.
data = pd.read_csv('keywords_dataset_fixed.csv', keep_default_na=False, na_values=[''])
# Store the sentence ids with object dtype.
data['Sentence #'] = data['Sentence #'].astype('object')
data.head()

Unnamed: 0,Sentence #,Word,Tag
0,112,Gebäudevermessung,B-KEY
1,112,von,O
2,112,Mehrfamilienhäusern,O
3,112,",",O
4,112,Einkaufszentren,O


In [19]:
# Collapse the token-level rows into one row per sentence id; each remaining
# column becomes the list of that sentence's values.
new_df = data.groupby('Sentence #').agg(list)
# Keep only sentences with fewer than 50 words: overly long sequences are
# dropped because BERT can process at most 512 subtokens.
df = new_df.loc[lambda grouped: grouped['Word'].str.len() < 50]
df.head()

Unnamed: 0_level_0,Word,Tag
Sentence #,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"[Die, Referenzen, der, 1000hands, AG, -, Wir, ...","[O, O, O, O, O, O, O, O, O, O, O, B-KEY, B-KEY..."
1,"[Die, folgenden, Referenzen, sind, ein, kleine...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
2,"[Sie, stellen, einen, Querschnitt, unseres, Le...","[O, O, O, O, O, O, O, O]"
3,"[Deutsche, Bank, Aufmaß, von, 1,6, Mio, qm, BG...","[O, O, O, O, O, O, O, O, O]"
4,"[CAD-Bearbeitung, in, AutoCAD, .]","[O, O, O, O]"


In [20]:
# Shuffle once with a fixed seed so the split is reproducible across kernel
# restarts, then cut it 80% / 10% / 10% into train / dev / test.
# Plain .iloc slices are used instead of np.split, which on a DataFrame goes
# through the deprecated DataFrame.swapaxes in recent pandas versions.
SPLIT_SEED = 42
shuffled = df.sample(frac=1, random_state=SPLIT_SEED)
train_end = int(.8 * len(shuffled))
dev_end = int(.9 * len(shuffled))
train = shuffled.iloc[:train_end]
dev = shuffled.iloc[train_end:dev_end]
test = shuffled.iloc[dev_end:]

In [22]:
def write_token_label_file(frame, path):
    """Write one split as a tab-separated token/label file.

    Each token goes on its own line as ``token<TAB>label`` (both stripped of
    surrounding whitespace), and every sentence is followed by a blank line.

    Parameters
    ----------
    frame : pandas.DataFrame
        One row per sentence, with list-valued 'Word' and 'Tag' columns.
    path : str
        Destination file; its parent directory is created if it is missing.
    """
    # Without this, a fresh checkout fails with FileNotFoundError on open().
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
    with open(path, 'w', encoding='utf8') as fout:
        for words, tags in zip(frame['Word'], frame['Tag']):
            for token, label in zip(words, tags):
                fout.write(str(token).strip() + '\t' + str(label).strip() + '\n')
            fout.write('\n')


# One file per split, written by the same routine.
for split_frame, split_path in (
    (train, 'kws_data/train.txt'),
    (dev, 'kws_data/valid.txt'),
    (test, 'kws_data/test.txt'),
):
    write_token_label_file(split_frame, split_path)