In [118]:
import numpy as np
import pandas as pd

Steps
1. import data
2. split the sentences into words
3. Tag each word (with B0, learn from [here](https://www.depends-on-the-definition.com/guide-sequence-tagging-neural-networks-python/))
4. Create a baseline from [here](https://www.depends-on-the-definition.com/introduction-named-entity-recognition-python/)
5. Create a real model. Try a RandomForest Classifier first, and only move on to Neural Networks if it overfits

In [119]:
df = pd.read_csv('../../data/dataset_from_json.csv')
df.head()

Unnamed: 0,words,sentence #,tag
0,CPT,0,Incoterms
1,LAHORE,0,Incoterms
2,AIRPORT,0,Incoterms
3,PAKISTAN,0,Incoterms
4,QTY,0,O


In [120]:
len(set(df.words)), len(df)

(9359, 60340)

In [121]:
class SentenceGetter(object):

    def __init__(self, data):
        self.n_sent = 0
        self.data = data
        self.empty = False

    def get_next(self):
        try:
            s = self.data[self.data["sentence #"] == self.n_sent]
            self.n_sent += 1
            return s["words"].values.tolist(), s["tag"].values.tolist()
        except:
            self.empty = True
            return None, None

In [122]:
getter = SentenceGetter(df)

In [123]:
sent, tag = getter.get_next()
print(sent)
print(tag)

['CPT', 'LAHORE', 'AIRPORT', 'PAKISTAN', 'QTY', '1', 'PC', 'OF', 'MULTI', 'MODE', 'READER/TRINOCULAR', 'MICROSCOPE', 'SYSTEM', 'MODEL']
['Incoterms', 'Incoterms', 'Incoterms', 'Incoterms', 'O', 'Quantity', 'Quantity', 'O', 'GoodsDescription', 'GoodsDescription', 'GoodsDescription', 'GoodsDescription', 'GoodsDescription', 'O']


In [124]:
from sklearn.ensemble import RandomForestClassifier

In [125]:
words = list(set(df["words"].values))
words.append("ENDPAD")
word2idx = {w: i for i, w in enumerate(words)}
w2idx = [word2idx[word] for word in df['words']]

# create more features
def feature_map(word):
    return np.array([word2idx[word], word.isupper(), len(word), word.isdigit(), word.isalpha()])

In [126]:
new_words = [feature_map(w) for w in df['words'].values.tolist()]
new_words[:5]

[array([1492,    1,    3,    0,    1]),
 array([4003,    1,    6,    0,    1]),
 array([3501,    1,    7,    0,    1]),
 array([6342,    1,    8,    0,    1]),
 array([5242,    1,    3,    0,    1])]

In [127]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report


# clf = RandomForestClassifier(n_estimators=10, max_features=3, random_state=42)
# rf_classifier = cross_val_predict(clf,
#                                   X=new_words,
#                                   y=df['tag'].values.tolist())

In [128]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30, 100], 'max_features': [2, 4, 6]},
    {'bootstrap': [False, True], 'max_features': [2, 3, 4]},
  ]

forest_reg = RandomForestClassifier()

grid_search = GridSearchCV(forest_reg, param_grid, cv=5)

X = np.array(new_words)
y = np.array(df['tag'].values)

grid_search.fit(X, y)

In [136]:
clf = grid_search.best_estimator_
clf.fit(X, y)

In [143]:
report = classification_report(y_pred=clf.predict(X), y_true=y, zero_division=1)
print(report)

                  precision    recall  f1-score   support

GoodsDescription       0.90      0.82      0.86      9463
     GoodsOrigin       0.95      0.80      0.87        46
          HSCode       1.00      0.77      0.87        31
       Incoterms       0.96      0.92      0.94      7633
               O       0.93      0.94      0.93     30686
        Quantity       0.79      0.84      0.82      4524
       Tolerance       0.56      0.41      0.47       177
 UnitPriceAmount       0.87      0.90      0.88      7780

        accuracy                           0.91     60340
       macro avg       0.87      0.80      0.83     60340
    weighted avg       0.91      0.91      0.91     60340



In [144]:
import joblib

joblib.dump(clf, "../../models/random_forest/rf.joblib")

['rf.joblib']