# Random Forest NER

In [43]:
import random
from faker import Faker

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier

## Generate data

In [8]:
LOCS = ['fr_FR', 'fr_FR', 'fr_FR', 'en_US', 'en_GB', 'de_DE', 'fr_CH', 'nl_BE', 'it_IT', 'es_ES']
fake = {loc:Faker(loc) for loc in LOCS}
Faker.seed(411)

DATASET_SIZE = 10000

In [9]:
adrs = []

for i in range(DATASET_SIZE):
    words, tags = [],[]
    loc = random.sample(LOCS,1)[0]
    
    for f in fake[loc].iban().split():
        if random.random()>0.2:
            words.append(f)
            tags.append('IBAN')
        
    if random.random() > 0.5:
        for f in fake[loc].name().split():
            words.append(f)
            tags.append('NAME')
    else:
        for f in fake[loc].company().split():
            words.append(f)
            tags.append('ORG')
        
    for f in fake[loc].address().split():
        if random.random()>0.1:
            words.append(f)
            tags.append('ADDRESS')
    
    if random.random()>0.1:
        words.append(loc[-2:])
        tags.append('COUNTRY')
        
    adrs.append((words,tags))

## Building features

In [24]:
def has_numbers(inputString):
    return any(char.isdigit() for char in inputString)

In [29]:
data = []
for i,adr in enumerate(adrs):
    for ii,(text,tag) in enumerate(zip(adr[0],adr[1])):
        d = {
            "adr_idx":i,
            "tag":tag,
            "text":text,  
            "text_pos":ii,
            "adr_len": len(adr[0]),
            "pos_percent": ii/len(adr[0]),
            "islower":text.islower(),
            "isupper":text.isupper(),
            "istitle":text.istitle(),
            "isdigit":text.isdigit(),
            "hasDigit":has_numbers(text),

            "len":len(text)
        }
        data.append(d)
df = pd.DataFrame(data)
df

Unnamed: 0,adr_idx,tag,text,text_pos,adr_len,pos_percent,islower,isupper,istitle,isdigit,hasDigit,len
0,0,IBAN,FR6930539337630881315984590,0,8,0.000,False,True,False,False,True,27
1,0,ORG,Hoarau,1,8,0.125,False,False,True,False,False,6
2,0,ORG,S.A.S.,2,8,0.250,False,True,True,False,False,6
3,0,ADDRESS,rue,3,8,0.375,True,False,False,False,False,3
4,0,ADDRESS,Roussel,4,8,0.500,False,False,True,False,False,7
...,...,...,...,...,...,...,...,...,...,...,...,...
95290,9999,ADDRESS,Scheibestr.,3,8,0.375,False,False,True,False,False,11
95291,9999,ADDRESS,5/2,4,8,0.500,False,False,False,False,True,3
95292,9999,ADDRESS,86834,5,8,0.625,False,False,False,True,True,5
95293,9999,ADDRESS,Bischofswerda,6,8,0.750,False,False,True,False,False,13


## Baseline 

In [37]:
X = df[['text_pos', 'adr_len', 'pos_percent',
       'islower', 'isupper', 'istitle', 'isdigit', 
        'hasDigit', 'len']]
y = df['tag'].

In [61]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False, random_state=42)

In [99]:
clf = RandomForestClassifier(random_state=0)
scores = cross_val_score(clf, X, y, cv=5, scoring="accuracy")
print(np.mean(scores), " +/- ", np.std(scores))

0.8932892596673486  +/-  0.0019878348929571613


## Improving the perfs

The idea is to take in consideration previous and next tags to give some context by providing these info to a second random forest

In [100]:
clf.fit(X_train, y_train)
train_preds = clf.predict(X_train)
train_preds

array(['IBAN', 'NAME', 'ORG', ..., 'ADDRESS', 'ADDRESS', 'COUNTRY'],
      dtype=object)

In [101]:
X2_train = X_train.copy()
X2_train['pred_TAG'] = train_preds

X2_train["prev_tag"] = np.concatenate((["START"], train_preds[:-1]))
X2_train.loc[X2_train.text_pos==0,"prev_tag"] = "START"

X2_train["next_tag"] = np.concatenate((train_preds[1:], ["END"]))
X2_train.loc[X2_train.text_pos==X2_train.adr_len,"next_tag"] = "END"

X2_train = pd.concat((X2_train,pd.get_dummies(X2_train[["pred_TAG","prev_tag","next_tag"]])), axis=1)
X2_train = X2_train.drop(columns=["pred_TAG","prev_tag","next_tag"])

In [102]:
clf2 = RandomForestClassifier(random_state=0)
scores = cross_val_score(clf, X2_train, y_train, cv=5, scoring="accuracy")
print(np.mean(scores), " +/- ", np.std(scores))

0.9347158534825388  +/-  0.001793628855485006
