In [19]:
import numpy as np
import pandas as pd

Steps
1. import data
2. split the sentences into words
3. Tag each word (with BIO tags; learn how from [here](https://www.depends-on-the-definition.com/guide-sequence-tagging-neural-networks-python/))
4. Create a baseline from [here](https://www.depends-on-the-definition.com/introduction-named-entity-recognition-python/)
5. Create a real model. Try a RandomForest Classifier first, and only move on to Neural Networks if it overfits

In [20]:
# Read the token-level dataset; the preview shows one row per word together
# with its sentence number and tag.
df = pd.read_csv(filepath_or_buffer='../data/final_dataset.csv')
df.head(n=5)

Unnamed: 0,words,sentence #,tag
0,FOB,0,O
1,MULTI,0,O
2,CODES,0,O
3,DENIM,0,O
4,FABRIC,0,O


In [21]:
# (number of distinct words, number of rows)
len(set(df["words"])), df.shape[0]

(8819, 62354)

In [22]:
class SentenceGetter:
    """Hand out the sentences of a token-level DataFrame one at a time.

    ``data`` must provide the columns ``"sentence #"``, ``"words"`` and
    ``"tag"``. Sentences are looked up by number, starting at 0 and counting
    up by one, so the first number that has no rows is treated as the end of
    the data.
    """

    def __init__(self, data):
        self.n_sent = 0      # number of the sentence the next call will return
        self.data = data
        self.empty = False   # set to True once get_next() finds no more rows

    def get_next(self):
        """Return ``(words, tags)`` for the next sentence.

        Returns:
            Two parallel lists (words and their tags) for sentence
            ``self.n_sent``, after which the counter advances by one.
            When no row carries the current sentence number, ``self.empty``
            is set to True and ``(None, None)`` is returned; the counter is
            left unchanged, so further calls keep returning ``(None, None)``.

        Raises:
            KeyError: if ``data`` lacks one of the required columns.
        """
        s = self.data[self.data["sentence #"] == self.n_sent]
        # A sentence number with no rows produces an empty frame rather than
        # an exception, so exhaustion has to be detected explicitly.
        if s.empty:
            self.empty = True
            return None, None
        self.n_sent += 1
        return s["words"].values.tolist(), s["tag"].values.tolist()

In [23]:
# Helper that walks through df one sentence at a time.
getter = SentenceGetter(data=df)

In [24]:
# Fetch one sentence and show its words on one line, its tags on the next.
sent, tag = getter.get_next()
print(sent, tag, sep="\n")

['FOB', 'MULTI', 'CODES', 'DENIM', 'FABRIC', 'AS', 'PER', 'PROFORMA', 'INVOICE', 'NO.', 'HTX0300/2018', 'DATED', '01.12.18']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [25]:
from sklearn.ensemble import RandomForestClassifier

In [81]:
# Vocabulary: every distinct word plus the extra "ENDPAD" token (presumably a
# padding marker -- confirm). The vocabulary is sorted so that the
# word -> index mapping is identical on every run; iterating a bare set of
# strings gives an order that can change between interpreter sessions.
words = sorted(set(df["words"].values))
words.append("ENDPAD")
word2idx = {w: i for i, w in enumerate(words)}
# Vocabulary index of every token, in row order.
w2idx = [word2idx[word] for word in df['words']]

# Hand-crafted per-word features.
def feature_map(word):
    """Encode ``word`` as a 1-D array of five features.

    Feature order: index of the word in ``word2idx``, ``word.isupper()``,
    ``len(word)``, ``word.isdigit()``, ``word.isalpha()``.
    """
    vocab_index = word2idx[word]
    surface_features = [word.isupper(), len(word), word.isdigit(), word.isalpha()]
    return np.array([vocab_index, *surface_features])

In [82]:
# One feature vector per token, in row order; preview the first five.
new_words = list(map(feature_map, df['words'].values.tolist()))
new_words[:5]

[array([4825,    1,    3,    0,    1]),
 array([6441,    1,    5,    0,    1]),
 array([313,   1,   5,   0,   1]),
 array([926,   1,   5,   0,   1]),
 array([4711,    1,    6,    0,    1])]

In [83]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report

# Cross-validated predictions from a 20-tree random forest. Despite its name,
# rf_classifier holds the predicted tag for every token, not a fitted model.
# random_state is fixed so the forest, and therefore the report computed from
# these predictions, is the same on every run.
rf_classifier = cross_val_predict(RandomForestClassifier(n_estimators=20, random_state=42),
                                  X=new_words,
                                  y=df['tag'].values.tolist())

In [84]:
# Per-tag precision / recall / F1 of the cross-validated predictions against
# the true tags.
report = classification_report(
    y_true=df['tag'].values.tolist(),
    y_pred=rf_classifier,
    zero_division=1,
)
print(report)

              precision    recall  f1-score   support

        B-NW       0.62      0.56      0.59      2037
        I-NW       0.81      0.70      0.75      2321
           O       0.97      0.98      0.98     57996

    accuracy                           0.96     62354
   macro avg       0.80      0.75      0.77     62354
weighted avg       0.96      0.96      0.96     62354

