# Random Forest NER

In [32]:
import random
from faker import Faker

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
from sklearn.preprocessing import LabelBinarizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score

## Generate data

In [2]:
LOCS = ['fr_FR', 'fr_FR', 'fr_FR', 'en_US', 'en_GB', 'de_DE', 'fr_CH', 'nl_BE', 'it_IT', 'es_ES']
fake = {loc:Faker(loc) for loc in LOCS}
Faker.seed(1909)

DATASET_SIZE = 10000

In [3]:
adrs = []

for i in range(DATASET_SIZE):
    words, tags = [],[]
    loc = random.sample(LOCS,1)[0]
    
    for f in fake[loc].iban().split():
        if random.random()>0.2:
            words.append(f)
            tags.append('IBAN')
        
    if random.random() > 0.5:
        for f in fake[loc].name().split():
            words.append(f)
            tags.append('NAME')
    else:
        for f in fake[loc].company().split():
            words.append(f)
            tags.append('ORG')
        
    for f in fake[loc].address().split():
        if random.random()>0.1:
            words.append(f)
            tags.append('ADDRESS')
    
    if random.random()>0.1:
        words.append(loc[-2:])
        tags.append('COUNTRY')
        
    adrs.append((words,tags))

## Building features

In [4]:
def has_numbers(inputString):
    return any(char.isdigit() for char in inputString)

In [5]:
data = []
for i,adr in enumerate(adrs):
    for ii,(text,tag) in enumerate(zip(adr[0],adr[1])):
        d = {
            "adr_idx":i,
            "tag":tag,
            "text":text,  
            "text_pos":ii,
            "adr_len": len(adr[0]),
            "pos_percent": ii/len(adr[0]),
            "islower":text.islower(),
            "isupper":text.isupper(),
            "istitle":text.istitle(),
            "isdigit":text.isdigit(),
            "hasDigit":has_numbers(text),

            "len":len(text)
        }
        data.append(d)
df = pd.DataFrame(data)
df

Unnamed: 0,adr_idx,tag,text,text_pos,adr_len,pos_percent,islower,isupper,istitle,isdigit,hasDigit,len
0,0,IBAN,FR3242328429469013406958049,0,9,0.000000,False,True,False,False,True,27
1,0,NAME,Valérie-Nathalie,1,9,0.111111,False,False,True,False,False,16
2,0,NAME,Potier,2,9,0.222222,False,False,True,False,False,6
3,0,ADDRESS,37,3,9,0.333333,False,False,False,False,True,3
4,0,ADDRESS,avenue,4,9,0.444444,True,False,False,False,False,6
...,...,...,...,...,...,...,...,...,...,...,...,...
95175,9999,ADDRESS,rue,3,8,0.375000,True,False,False,False,False,3
95176,9999,ADDRESS,Lebrun,4,8,0.500000,False,False,True,False,False,6
95177,9999,ADDRESS,08318,5,8,0.625000,False,False,False,True,True,5
95178,9999,ADDRESS,Pottier,6,8,0.750000,False,False,True,False,False,7


## Baseline 

In [7]:
X = df[['text_pos', 'adr_len', 'pos_percent',
       'islower', 'isupper', 'istitle', 'isdigit', 
        'hasDigit', 'len']]
y = df['tag']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False, random_state=42)

In [37]:
lb = LabelBinarizer()
y_lb = lb.fit_transform(y)

clf = MultiOutputClassifier(linear_model.LogisticRegression(max_iter=3000))
scores = cross_val_score(clf, X, y_lb, cv=5, scoring="accuracy")
print(np.mean(scores), " +/- ", np.std(scores))

0.7637423828535407  +/-  0.0015493302179437984


In [9]:
clf = RandomForestClassifier(random_state=0)
scores = cross_val_score(clf, X, y, cv=5, scoring="accuracy")
print(np.mean(scores), " +/- ", np.std(scores))

0.8942004622819921  +/-  0.002441694855542609


In [22]:
clf.fit(X_train, y_train)
test_preds = clf.predict(X_test)

In [23]:
df_stats = pd.DataFrame({'preds':test_preds, 'targets':y_test})
df_stats["success"] = pd.to_numeric(df_stats.preds == df_stats.targets)
df_stats.groupby('preds').mean()

  df_stats.groupby('preds').mean()


Unnamed: 0_level_0,success
preds,Unnamed: 1_level_1
ADDRESS,0.960694
COUNTRY,1.0
IBAN,1.0
NAME,0.602528
ORG,0.749424


## Improving the perfs

The idea is to take in consideration previous and next tags to give some context by providing these info to a second random forest

In [10]:
train_preds = clf.predict(X_train)
train_preds

array(['IBAN', 'NAME', 'NAME', ..., 'IBAN', 'NAME', 'ORG'], dtype=object)

In [11]:
X2_train = X_train.copy()
X2_train['pred_TAG'] = train_preds

X2_train["prev_tag"] = np.concatenate((["START"], train_preds[:-1]))
X2_train.loc[X2_train.text_pos==0,"prev_tag"] = "START"

X2_train["next_tag"] = np.concatenate((train_preds[1:], ["END"]))
X2_train.loc[X2_train.text_pos==X2_train.adr_len,"next_tag"] = "END"

dummies = pd.get_dummies(X2_train[["pred_TAG","prev_tag","next_tag"]])
X2_train = pd.concat((X2_train,dummies), axis=1)
X2_train = X2_train.drop(columns=["pred_TAG","prev_tag","next_tag"])

In [12]:
clf2 = RandomForestClassifier(random_state=0)
scores = cross_val_score(clf2, X2_train, y_train, cv=5, scoring="accuracy")
print(np.mean(scores), " +/- ", np.std(scores))

0.9356744696667734  +/-  0.0024257988952104004


In [13]:
clf3 = RandomForestClassifier(random_state=0)
scores = cross_val_score(clf3, dummies, y_train, cv=5, scoring="accuracy")
print(np.mean(scores), " +/- ", np.std(scores))

0.9271248761336495  +/-  0.003177057488737036


### Validating on test data

In [14]:
test_preds = clf.predict(X_test)

In [15]:
X2_test = X_test.copy()
X2_test['pred_TAG'] = test_preds

X2_test["prev_tag"] = np.concatenate((["START"], test_preds[:-1]))
X2_test.loc[X2_test.text_pos==0,"prev_tag"] = "START"

X2_test["next_tag"] = np.concatenate((test_preds[1:], ["END"]))
X2_test.loc[X2_test.text_pos==X2_test.adr_len,"next_tag"] = "END"

test_dummies = pd.get_dummies(X2_test[["pred_TAG","prev_tag","next_tag"]])
X2_test = pd.concat((X2_test,test_dummies), axis=1)
X2_test = X2_test.drop(columns=["pred_TAG","prev_tag","next_tag"])

In [16]:
clf2.fit(X2_train, y_train)
test_preds = clf2.predict(X2_test)
accuracy_score(y_test, test_preds)

0.9357007774742593

In [18]:
clf3.fit(dummies, y_train)
test_preds = clf3.predict(test_dummies)
accuracy_score(y_test, test_preds)

0.9244589199411641

In [21]:
df_stats = pd.DataFrame({'preds':test_preds, 'targets':y_test})
df_stats["success"] = pd.to_numeric(df_stats.preds == df_stats.targets)
df_stats.groupby('preds').mean()

  df_stats.groupby('preds').mean()


Unnamed: 0_level_0,success
preds,Unnamed: 1_level_1
ADDRESS,0.964644
COUNTRY,1.0
IBAN,1.0
NAME,0.724097
ORG,0.831514
