# Random Forest NER

In [1]:
import random
from faker import Faker

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
from sklearn.preprocessing import LabelBinarizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer

## Generate data

In [2]:
LOCS = ['fr_FR', 'fr_FR', 'fr_FR', 'en_US', 'en_GB', 'de_DE', 'fr_CH', 'nl_BE', 'it_IT', 'es_ES']
fake = {loc:Faker(loc) for loc in LOCS}
Faker.seed(1909)

DATASET_SIZE = 10000

In [3]:
adrs = []

for i in range(DATASET_SIZE):
    words, tags = [],[]
    loc = random.sample(LOCS,1)[0]
    
    for f in fake[loc].iban().split():
        if random.random()>0.2:
            if random.random()>0.9:
                words.append(f[:2])
                tags.append('IBAN')
                words.append(f[2:6])
                tags.append('IBAN')
                words.append(f[6:10])
                tags.append('IBAN')
                words.append(f[10:])
                tags.append('IBAN')
            else:
                words.append(f)
                tags.append('IBAN')
        
    if random.random() > 0.5:
        if random.random() > 0.5:
            for f in fake[loc].name().split():
                words.append(f)
                tags.append('NAME')
        else:
            for f in fake[loc].name().split():
                words.append(f.lower())
                tags.append('NAME')
    else:
        for f in fake[loc].company().split():
            words.append(f)
            tags.append('ORG')
        
    for f in fake[loc].address().split():
        if random.random()>0.1:
            words.append(f)
            tags.append('ADDRESS')
    
    if random.random()>0.1:
        words.append(loc[-2:])
        tags.append('COUNTRY')
        
    adrs.append((words,tags))

## Building features

In [4]:
def has_numbers(inputString):
    return any(char.isdigit() for char in inputString)

In [5]:
data = []
for i,adr in enumerate(adrs):
    for ii,(text,tag) in enumerate(zip(adr[0],adr[1])):
        d = {
            "adr_idx":i,
            "tag":tag,
            "text":text,  
            "text_pos":ii,
            "adr_len": len(adr[0]),
            "pos_percent": ii/len(adr[0]),
            "islower":text.islower(),
            "isupper":text.isupper(),
            "istitle":text.istitle(),
            "isdigit":text.isdigit(),
            "isalpha":text.isalpha(),
            "hasDigit":has_numbers(text),
            "len":len(text)
        }
        data.append(d)
df = pd.DataFrame(data)
df

Unnamed: 0,adr_idx,tag,text,text_pos,adr_len,pos_percent,islower,isupper,istitle,isdigit,isalpha,hasDigit,len
0,0,ORG,"Sommaruga,",0,13,0.000000,False,False,True,False,False,False,10
1,0,ORG,Persico,1,13,0.076923,False,False,True,False,True,False,7
2,0,ORG,e,2,13,0.153846,True,False,False,False,True,False,1
3,0,ORG,Turchetta,3,13,0.230769,False,False,True,False,True,False,9
4,0,ORG,s.r.l.,4,13,0.307692,True,False,False,False,False,False,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
97742,9999,ADDRESS,Boucseinweg,4,9,0.444444,False,False,True,False,True,False,11
97743,9999,ADDRESS,24,5,9,0.555556,False,False,False,True,False,True,2
97744,9999,ADDRESS,24947,6,9,0.666667,False,False,False,True,False,True,5
97745,9999,ADDRESS,Uffenheim,7,9,0.777778,False,False,True,False,True,False,9


## Baseline 

In [6]:
X = df[['text_pos', 'adr_len', 'pos_percent',
       'islower', 'isupper', 'istitle', 'isdigit', 
        'hasDigit', 'len']]
y = df['tag']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False, random_state=42)

In [8]:
lb = LabelBinarizer()
y_lb = lb.fit_transform(y)

clf = MultiOutputClassifier(linear_model.LogisticRegression(max_iter=3000))
scores = cross_val_score(clf, X, y_lb, cv=5, scoring="accuracy")
print(np.mean(scores), " +/- ", np.std(scores))

0.786878412027538  +/-  0.0031573749213205746


In [9]:
clf = RandomForestClassifier(random_state=0)
scores = cross_val_score(clf, X, y, cv=5, scoring="accuracy")
print(np.mean(scores), " +/- ", np.std(scores))

0.9101353678912154  +/-  0.002280274647027719


In [10]:
clf.fit(X_train, y_train)
test_preds = clf.predict(X_test)

In [11]:
df_stats = pd.DataFrame({'preds':test_preds, 'targets':y_test})
df_stats["success"] = pd.to_numeric(df_stats.preds == df_stats.targets)
df_stats.groupby('preds').mean()

  df_stats.groupby('preds').mean()


Unnamed: 0_level_0,success
preds,Unnamed: 1_level_1
ADDRESS,0.95037
COUNTRY,0.997775
IBAN,0.991098
NAME,0.782632
ORG,0.697701


## Improving the perfs

The idea is to take in consideration previous and next tags to give some context by providing these info to a second random forest

In [12]:
train_preds = clf.predict(X_train)
train_preds

array(['ORG', 'ORG', 'ORG', ..., 'IBAN', 'ORG', 'ORG'], dtype=object)

In [13]:
X2_train = X_train.copy()
X2_train['pred_TAG'] = train_preds

X2_train["prev_tag"] = np.concatenate((["START"], train_preds[:-1]))
X2_train.loc[X2_train.text_pos==0,"prev_tag"] = "START"

X2_train["next_tag"] = np.concatenate((train_preds[1:], ["END"]))
X2_train.loc[X2_train.text_pos==X2_train.adr_len,"next_tag"] = "END"

dummies = pd.get_dummies(X2_train[["pred_TAG","prev_tag","next_tag"]])
X2_train = pd.concat((X2_train,dummies), axis=1)
X2_train = X2_train.drop(columns=["pred_TAG","prev_tag","next_tag"])

In [14]:
clf2 = RandomForestClassifier(random_state=0)
scores = cross_val_score(clf2, X2_train, y_train, cv=5, scoring="accuracy")
print(np.mean(scores), " +/- ", np.std(scores))

0.9427343823208062  +/-  0.0018838214630027244


In [15]:
clf3 = RandomForestClassifier(random_state=0)
scores = cross_val_score(clf3, dummies, y_train, cv=5, scoring="accuracy")
print(np.mean(scores), " +/- ", np.std(scores))

0.927081628671452  +/-  0.0022024727551860266


### Validating on test data

In [16]:
test_preds = clf.predict(X_test)

In [17]:
X2_test = X_test.copy()
X2_test['pred_TAG'] = test_preds

X2_test["prev_tag"] = np.concatenate((["START"], test_preds[:-1]))
X2_test.loc[X2_test.text_pos==0,"prev_tag"] = "START"

X2_test["next_tag"] = np.concatenate((test_preds[1:], ["END"]))
X2_test.loc[X2_test.text_pos==X2_test.adr_len,"next_tag"] = "END"

test_dummies = pd.get_dummies(X2_test[["pred_TAG","prev_tag","next_tag"]])
X2_test = pd.concat((X2_test,test_dummies), axis=1)
X2_test = X2_test.drop(columns=["pred_TAG","prev_tag","next_tag"])

In [18]:
clf2.fit(X2_train, y_train)
test_preds = clf2.predict(X2_test)
accuracy_score(y_test, test_preds)

0.9380051150895141

In [19]:
clf3.fit(dummies, y_train)
test_preds = clf3.predict(test_dummies)
accuracy_score(y_test, test_preds)

0.9203580562659847

In [20]:
df_stats = pd.DataFrame({'preds':test_preds, 'targets':y_test})
df_stats["success"] = pd.to_numeric(df_stats.preds == df_stats.targets)
df_stats.groupby('preds').mean()

  df_stats.groupby('preds').mean()


Unnamed: 0_level_0,success
preds,Unnamed: 1_level_1
ADDRESS,0.963224
COUNTRY,0.997775
IBAN,1.0
NAME,0.791279
ORG,0.727169


## advanced: with some context

In [21]:
data = []
for i,adr in enumerate(adrs):
    for ii,(text,tag) in enumerate(zip(adr[0],adr[1])):
        d = {
            "adr_idx":i,
            "tag":tag,
            "text":text,  
            "text_pos":ii,
            "adr_len": len(adr[0]),
            "pos_percent": ii / len(adr[0]),
            "islower":text.islower(),
            "isupper":text.isupper(),
            "istitle":text.istitle(),
            "isdigit":text.isdigit(),
            "isalpha":text.isalpha(),
            "hasDigit":has_numbers(text),
            "len":len(text),
            "text_beg":text[:2],
            "text_end":text[-3:],
        }
        data.append(d)
df = pd.DataFrame(data)
df

Unnamed: 0,adr_idx,tag,text,text_pos,adr_len,pos_percent,islower,isupper,istitle,isdigit,isalpha,hasDigit,len,text_beg,text_end
0,0,ORG,"Sommaruga,",0,13,0.000000,False,False,True,False,False,False,10,So,"ga,"
1,0,ORG,Persico,1,13,0.076923,False,False,True,False,True,False,7,Pe,ico
2,0,ORG,e,2,13,0.153846,True,False,False,False,True,False,1,e,e
3,0,ORG,Turchetta,3,13,0.230769,False,False,True,False,True,False,9,Tu,tta
4,0,ORG,s.r.l.,4,13,0.307692,True,False,False,False,False,False,6,s.,.l.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97742,9999,ADDRESS,Boucseinweg,4,9,0.444444,False,False,True,False,True,False,11,Bo,weg
97743,9999,ADDRESS,24,5,9,0.555556,False,False,False,True,False,True,2,24,24
97744,9999,ADDRESS,24947,6,9,0.666667,False,False,False,True,False,True,5,24,947
97745,9999,ADDRESS,Uffenheim,7,9,0.777778,False,False,True,False,True,False,9,Uf,eim


In [22]:
prev_df = df.shift(1).add_prefix("prev_").iloc[:,2:]
next_df = df.shift(-1).add_prefix("next_").iloc[:,2:]

In [23]:
df2 = pd.concat((df,prev_df,next_df), axis=1)

In [24]:
df2.loc[df2.text_pos==0,prev_df.columns] = ["XXX",-1,-1,-1,False,False,False,False,False,False,-1,"XXX","XXX"]
df2.loc[df2.text_pos==df2.adr_len-1,next_df.columns] = ["XXX",-1,-1,-1,False,False,False,False,False,False,-1,"XXX","XXX"]

In [25]:
df2.head()

Unnamed: 0,adr_idx,tag,text,text_pos,adr_len,pos_percent,islower,isupper,istitle,isdigit,...,next_pos_percent,next_islower,next_isupper,next_istitle,next_isdigit,next_isalpha,next_hasDigit,next_len,next_text_beg,next_text_end
0,0,ORG,"Sommaruga,",0,13,0.0,False,False,True,False,...,0.076923,False,False,True,False,True,False,7.0,Pe,ico
1,0,ORG,Persico,1,13,0.076923,False,False,True,False,...,0.153846,True,False,False,False,True,False,1.0,e,e
2,0,ORG,e,2,13,0.153846,True,False,False,False,...,0.230769,False,False,True,False,True,False,9.0,Tu,tta
3,0,ORG,Turchetta,3,13,0.230769,False,False,True,False,...,0.307692,True,False,False,False,False,False,6.0,s.,.l.
4,0,ORG,s.r.l.,4,13,0.307692,True,False,False,False,...,0.384615,False,False,True,False,False,False,8.0,Ro,"do,"


In [26]:
X = df2.iloc[:,2:]
y = df2['tag']

In [27]:
TEXT_COLS = X.columns[["text" in c for c in X.columns]]
NUM_COLS = X.columns[["text" not in c for c in X.columns]]
ct = ColumnTransformer(
     [("cat_preprocess", OneHotEncoder(max_categories=10), TEXT_COLS),
      ("num_preprocess", MinMaxScaler(), NUM_COLS)])
X_prep = ct.fit_transform(X)

In [28]:
clf = RandomForestClassifier(random_state=0)
scores = cross_val_score(clf, X_prep, y, cv=5, scoring="accuracy")
print(np.mean(scores), " +/- ", np.std(scores))

0.9730221889804346  +/-  0.0015374830497882344


In [29]:
X_train, X_test, y_train, y_test = train_test_split(X_prep, y, test_size=0.2, shuffle=False, random_state=42)

In [30]:
clf.fit(X_train, y_train)
test_preds = clf.predict(X_test)

In [31]:
df_stats = pd.DataFrame({'preds':test_preds, 'targets':y_test})
df_stats["success"] = pd.to_numeric(df_stats.preds == df_stats.targets)
df_stats.groupby('preds').mean()

  df_stats.groupby('preds').mean()


Unnamed: 0_level_0,success
preds,Unnamed: 1_level_1
ADDRESS,0.985895
COUNTRY,0.998329
IBAN,1.0
NAME,0.908323
ORG,0.926388
