# Assignment 4: Named entity recognition

Create a model for Named Entity Recognition for dataset CoNLL 2002.  
Your quality metric = f1_macro

In your solution you should use: RandomForest, Gradient Boosting (xgboost, lightgbm, catboost)   
Tutorials:  
1. https://github.com/Microsoft/LightGBM/tree/master/examples/python-guide
1. https://github.com/catboost/tutorials 

More baselines you beat - better your score
 
baseline 1 [3 points]: 0.0604      random labels  
baseline 2 [5 points]: 0.3966      PoS features + logistic regression  
baseline 3 [8 points]: 0.8122      word2vec cbow embedding + baseline 2 + svm    

[1 point] using feature engineering (creating features not presented in the baselines)

! Your results must be reproducible. You should explicitly set all seeds random_states in yout model.  
! Remember to use proper training pipeline.  

bonus, think about:  
1. [1 point] Why did we select f1 score with macro averaging as our classification quality measure? What other metrics are suitable?   

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV as GSCV
import sys

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


SEED=1337

In [2]:
df = pd.read_csv('ner_short.csv', index_col=0)
df.head()

Unnamed: 0,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,sentence_idx,word,tag
0,NNS,demonstrators,IN,of,NNS,__START1__,__START2__,__START2__,__START1__,1.0,Thousands,O
1,VBP,have,NNS,demonstrators,IN,NNS,__START1__,__START1__,Thousands,1.0,of,O
2,VBN,marched,VBP,have,NNS,IN,NNS,Thousands,of,1.0,demonstrators,O
3,IN,through,VBN,marched,VBP,NNS,IN,of,demonstrators,1.0,have,O
4,NNP,London,IN,through,VBN,VBP,NNS,demonstrators,have,1.0,marched,O


In [3]:
# number of sentences
df.sentence_idx.max()

1500.0

In [4]:
# class distribution
df.tag.value_counts(normalize=True )

O        0.852828
B-geo    0.027604
B-gpe    0.020935
B-org    0.020247
I-per    0.017795
B-tim    0.016927
B-per    0.015312
I-org    0.013937
I-geo    0.005383
I-tim    0.004247
B-art    0.001376
I-gpe    0.000837
I-art    0.000748
B-eve    0.000628
I-eve    0.000508
B-nat    0.000449
I-nat    0.000239
Name: tag, dtype: float64

In [5]:
# sentence length
tdf = df.set_index('sentence_idx')
tdf['length'] = df.groupby('sentence_idx').sentence_idx.count()
df = tdf.reset_index(drop=False)

In [6]:
# encode categorial variables

le = LabelEncoder()
df['pos'] = le.fit_transform(df.pos)
df['next-pos'] = le.fit_transform(df['next-pos'])
df['next-next-pos'] = le.fit_transform(df['next-next-pos'])
df['prev-pos'] = le.fit_transform(df['prev-pos'])
df['prev-prev-pos'] = le.fit_transform(df['prev-prev-pos'])

In [7]:
df.head()


Unnamed: 0,sentence_idx,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,word,tag,length
0,1.0,18,demonstrators,9,of,18,39,40,__START2__,__START1__,Thousands,O,48
1,1.0,33,have,18,demonstrators,9,18,39,__START1__,Thousands,of,O,48
2,1.0,32,marched,33,have,18,9,18,Thousands,of,demonstrators,O,48
3,1.0,9,through,32,marched,33,18,9,of,demonstrators,have,O,48
4,1.0,16,London,9,through,32,33,18,demonstrators,have,marched,O,48


In [8]:
# splitting
y = LabelEncoder().fit_transform(df.tag)

df_train, df_test, y_train, y_test = model_selection.train_test_split(df, y, stratify=y, 
                                                                      test_size=0.25, random_state=SEED, shuffle=True)
print('train', df_train.shape[0])
print('test', df_test.shape[0])

train 50155
test 16719


In [9]:
# some wrappers to work with word2vec
from gensim.models.word2vec import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import TransformerMixin
from collections import defaultdict

   
class Word2VecWrapper(TransformerMixin):
    def __init__(self, window=5,negative=5, size=100, iter=100, is_cbow=False, random_state=SEED):
        self.window_ = window
        self.negative_ = negative
        self.size_ = size
        self.iter_ = iter
        self.is_cbow_ = is_cbow
        self.w2v = None
        self.random_state = random_state
        
    def get_size(self):
        return self.size_

    def fit(self, X, y=None):
        """
        X: list of strings
        """
        sentences_list = [x.split() for x in X]
        self.w2v = Word2Vec(sentences_list, 
                            window=self.window_,
                            negative=self.negative_, 
                            size=self.size_, 
                            iter=self.iter_,
                            sg=not self.is_cbow_, seed=self.random_state)

        return self
    
    def has(self, word):
        return word in self.w2v

    def transform(self, X):
        """
        X: a list of words
        """
        if self.w2v is None:
            raise Exception('model not fitted')
        return np.array([self.w2v[w] if w in self.w2v else np.zeros(self.size_) for w in X ])
  

In [10]:
%%time
# here we exploit that word2vec is an unsupervised learning algorithm
# so we can train it on the whole dataset (subject to discussion)

sentences_list = [x.strip() for x in ' '.join(df.word).split('.')]

w2v_cbow = Word2VecWrapper(window=5, negative=5, size=300, iter=300, is_cbow=True, random_state=SEED)
w2v_cbow.fit(sentences_list)

CPU times: user 29.3 s, sys: 470 ms, total: 29.8 s
Wall time: 13.4 s


<__main__.Word2VecWrapper at 0x1a13877ef0>

In [11]:
%%time
# baseline 1 
# random labels
from sklearn.preprocessing import OneHotEncoder
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier


columns = ['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']

model = Pipeline([
    ('enc', OneHotEncoder()),
    ('est',RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = SEED))
])

model.fit(df_train[columns], y_train)

print('train', metrics.f1_score(y_train, model.predict(df_train[columns]), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(df_test[columns]), average='macro'))


train 0.7425836587782345
test 0.5863944082579811
CPU times: user 1.82 s, sys: 44.3 ms, total: 1.87 s
Wall time: 1.9 s


Catboost

In [12]:
!pip install catboost
from catboost import CatBoostClassifier
columns = ['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']
model = CatBoostClassifier()
model.fit(df_train[columns], y_train)
print('train', metrics.f1_score(y_train, model.predict(df_train[columns]), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(df_test[columns]), average='macro'))

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/c8/60/1ce20469e5352979b1c2d772d8da51f9bed326d1b760bc78548cb75ffbc6/catboost-0.20-cp37-none-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (10.4MB)
[K     |████████████████████████████████| 10.4MB 1.1MB/s eta 0:00:01    |████████████████████▋           | 6.7MB 1.1MB/s eta 0:00:04     |███████████████████████████████▉| 10.3MB 1.1MB/s eta 0:00:01
[?25hCollecting plotly (from catboost)
[?25l  Downloading https://files.pythonhosted.org/packages/e3/67/eb2b2be7a63a66548abea92447fc04d9abf363520f1af6145c5f033cd1b3/plotly-4.3.0-py2.py3-none-any.whl (7.3MB)
[K     |████████████████████████████████| 7.3MB 1.1MB/s eta 0:00:01     |███████████████████████████████ | 7.1MB 1.1MB/s eta 0:00:01
Collecting graphviz (from catboost)
  Downloading https://files.pythonhosted.org/packages/f5/74/dbed754c0abd63768d3a7a7b472da35b08ac442cf87d73d5850a6f32391e/graphviz-0.13.2-py2.py

105:	learn: 0.3352247	total: 6.67s	remaining: 56.2s
106:	learn: 0.3339983	total: 6.72s	remaining: 56.1s
107:	learn: 0.3324193	total: 6.77s	remaining: 55.9s
108:	learn: 0.3311886	total: 6.83s	remaining: 55.8s
109:	learn: 0.3297412	total: 6.88s	remaining: 55.7s
110:	learn: 0.3285898	total: 6.93s	remaining: 55.5s
111:	learn: 0.3272883	total: 6.98s	remaining: 55.4s
112:	learn: 0.3260593	total: 7.03s	remaining: 55.2s
113:	learn: 0.3252076	total: 7.08s	remaining: 55s
114:	learn: 0.3239592	total: 7.13s	remaining: 54.9s
115:	learn: 0.3226201	total: 7.18s	remaining: 54.8s
116:	learn: 0.3215783	total: 7.24s	remaining: 54.6s
117:	learn: 0.3207937	total: 7.29s	remaining: 54.5s
118:	learn: 0.3195034	total: 7.34s	remaining: 54.4s
119:	learn: 0.3182213	total: 7.4s	remaining: 54.3s
120:	learn: 0.3174058	total: 7.45s	remaining: 54.1s
121:	learn: 0.3165387	total: 7.5s	remaining: 54s
122:	learn: 0.3156057	total: 7.55s	remaining: 53.8s
123:	learn: 0.3145297	total: 7.61s	remaining: 53.7s
124:	learn: 0.3135

264:	learn: 0.2581214	total: 15.5s	remaining: 43.1s
265:	learn: 0.2579340	total: 15.6s	remaining: 43s
266:	learn: 0.2578052	total: 15.6s	remaining: 42.9s
267:	learn: 0.2577013	total: 15.7s	remaining: 42.8s
268:	learn: 0.2574881	total: 15.7s	remaining: 42.8s
269:	learn: 0.2573723	total: 15.8s	remaining: 42.7s
270:	learn: 0.2572381	total: 15.8s	remaining: 42.6s
271:	learn: 0.2570818	total: 15.9s	remaining: 42.5s
272:	learn: 0.2569402	total: 16s	remaining: 42.5s
273:	learn: 0.2567266	total: 16s	remaining: 42.5s
274:	learn: 0.2565959	total: 16.1s	remaining: 42.4s
275:	learn: 0.2564263	total: 16.1s	remaining: 42.3s
276:	learn: 0.2561981	total: 16.2s	remaining: 42.2s
277:	learn: 0.2560400	total: 16.2s	remaining: 42.1s
278:	learn: 0.2558828	total: 16.3s	remaining: 42.1s
279:	learn: 0.2557577	total: 16.3s	remaining: 42s
280:	learn: 0.2555692	total: 16.4s	remaining: 41.9s
281:	learn: 0.2554719	total: 16.4s	remaining: 41.8s
282:	learn: 0.2553270	total: 16.5s	remaining: 41.8s
283:	learn: 0.255103

423:	learn: 0.2375250	total: 24.3s	remaining: 33s
424:	learn: 0.2374166	total: 24.3s	remaining: 32.9s
425:	learn: 0.2372635	total: 24.4s	remaining: 32.9s
426:	learn: 0.2371447	total: 24.4s	remaining: 32.8s
427:	learn: 0.2370382	total: 24.5s	remaining: 32.7s
428:	learn: 0.2369409	total: 24.5s	remaining: 32.7s
429:	learn: 0.2368662	total: 24.6s	remaining: 32.6s
430:	learn: 0.2367979	total: 24.7s	remaining: 32.6s
431:	learn: 0.2367181	total: 24.7s	remaining: 32.5s
432:	learn: 0.2366376	total: 24.8s	remaining: 32.5s
433:	learn: 0.2365283	total: 24.9s	remaining: 32.4s
434:	learn: 0.2364510	total: 24.9s	remaining: 32.3s
435:	learn: 0.2363525	total: 25s	remaining: 32.3s
436:	learn: 0.2361893	total: 25s	remaining: 32.2s
437:	learn: 0.2360855	total: 25.1s	remaining: 32.2s
438:	learn: 0.2359621	total: 25.1s	remaining: 32.1s
439:	learn: 0.2358553	total: 25.2s	remaining: 32s
440:	learn: 0.2357892	total: 25.2s	remaining: 32s
441:	learn: 0.2356826	total: 25.3s	remaining: 31.9s
442:	learn: 0.2355900	

585:	learn: 0.2246585	total: 33.2s	remaining: 23.4s
586:	learn: 0.2245549	total: 33.3s	remaining: 23.4s
587:	learn: 0.2244236	total: 33.3s	remaining: 23.3s
588:	learn: 0.2243507	total: 33.4s	remaining: 23.3s
589:	learn: 0.2242800	total: 33.4s	remaining: 23.2s
590:	learn: 0.2242375	total: 33.5s	remaining: 23.2s
591:	learn: 0.2241746	total: 33.5s	remaining: 23.1s
592:	learn: 0.2240863	total: 33.6s	remaining: 23.1s
593:	learn: 0.2239987	total: 33.6s	remaining: 23s
594:	learn: 0.2239224	total: 33.7s	remaining: 22.9s
595:	learn: 0.2238543	total: 33.7s	remaining: 22.9s
596:	learn: 0.2237508	total: 33.8s	remaining: 22.8s
597:	learn: 0.2236525	total: 33.9s	remaining: 22.8s
598:	learn: 0.2235811	total: 33.9s	remaining: 22.7s
599:	learn: 0.2235290	total: 34s	remaining: 22.6s
600:	learn: 0.2234396	total: 34s	remaining: 22.6s
601:	learn: 0.2233444	total: 34.1s	remaining: 22.5s
602:	learn: 0.2232954	total: 34.1s	remaining: 22.5s
603:	learn: 0.2232303	total: 34.2s	remaining: 22.4s
604:	learn: 0.2231

744:	learn: 0.2149770	total: 41.3s	remaining: 14.1s
745:	learn: 0.2149524	total: 41.3s	remaining: 14.1s
746:	learn: 0.2148902	total: 41.4s	remaining: 14s
747:	learn: 0.2148346	total: 41.4s	remaining: 14s
748:	learn: 0.2147650	total: 41.5s	remaining: 13.9s
749:	learn: 0.2147333	total: 41.5s	remaining: 13.8s
750:	learn: 0.2147107	total: 41.6s	remaining: 13.8s
751:	learn: 0.2146284	total: 41.6s	remaining: 13.7s
752:	learn: 0.2145886	total: 41.7s	remaining: 13.7s
753:	learn: 0.2145294	total: 41.7s	remaining: 13.6s
754:	learn: 0.2144957	total: 41.8s	remaining: 13.6s
755:	learn: 0.2144284	total: 41.8s	remaining: 13.5s
756:	learn: 0.2143817	total: 41.8s	remaining: 13.4s
757:	learn: 0.2143156	total: 41.9s	remaining: 13.4s
758:	learn: 0.2142682	total: 41.9s	remaining: 13.3s
759:	learn: 0.2142126	total: 42s	remaining: 13.3s
760:	learn: 0.2141478	total: 42s	remaining: 13.2s
761:	learn: 0.2140727	total: 42.1s	remaining: 13.1s
762:	learn: 0.2140363	total: 42.1s	remaining: 13.1s
763:	learn: 0.213983

906:	learn: 0.2067036	total: 48.9s	remaining: 5.01s
907:	learn: 0.2066682	total: 49s	remaining: 4.96s
908:	learn: 0.2066294	total: 49s	remaining: 4.91s
909:	learn: 0.2066020	total: 49s	remaining: 4.85s
910:	learn: 0.2065318	total: 49.1s	remaining: 4.8s
911:	learn: 0.2064983	total: 49.2s	remaining: 4.74s
912:	learn: 0.2064500	total: 49.2s	remaining: 4.69s
913:	learn: 0.2064140	total: 49.3s	remaining: 4.63s
914:	learn: 0.2063856	total: 49.3s	remaining: 4.58s
915:	learn: 0.2063619	total: 49.3s	remaining: 4.53s
916:	learn: 0.2063121	total: 49.4s	remaining: 4.47s
917:	learn: 0.2062756	total: 49.5s	remaining: 4.42s
918:	learn: 0.2062376	total: 49.5s	remaining: 4.37s
919:	learn: 0.2062176	total: 49.6s	remaining: 4.31s
920:	learn: 0.2061523	total: 49.6s	remaining: 4.26s
921:	learn: 0.2061300	total: 49.7s	remaining: 4.2s
922:	learn: 0.2060711	total: 49.8s	remaining: 4.15s
923:	learn: 0.2060288	total: 49.8s	remaining: 4.1s
924:	learn: 0.2059906	total: 49.9s	remaining: 4.04s
925:	learn: 0.2059648

Xgboost

In [27]:
import xgboost
from xgboost import XGBClassifier as xgb

columns = ['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']
model = Pipeline([
    ('enc', OneHotEncoder()),
    ('est',  xgb())
])

model.fit(df_train[columns], y_train)
print('train', metrics.f1_score(y_train, model.predict(df_train[columns]), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(df_test[columns]), average='macro'))

train 0.39326124218078107
test 0.3353187957066505


In [28]:
%%time
# baseline 2 
# pos features + one hot encoding + logistic regression
from sklearn.preprocessing import OneHotEncoder


columns = ['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']

model = Pipeline([
    ('enc', OneHotEncoder()),
    ('est',RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = SEED))
    #('est', LogisticRegressionCV(Cs=5, cv=5, n_jobs=-1, scoring='f1_macro', 
    #                         penalty='l2', solver='newton-cg', multi_class='multinomial', random_state=SEED)),
])

model.fit(df_train[columns], y_train)

print('train', metrics.f1_score(y_train, model.predict(df_train[columns]), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(df_test[columns]), average='macro'))

train 0.7425836587782345
test 0.5863944082579811
CPU times: user 1.9 s, sys: 47.3 ms, total: 1.95 s
Wall time: 1.99 s


In [29]:
%%time
# baseline 3
# use word2vec cbow embedding + baseline 2 + svm
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import LinearSVC
import scipy.sparse as sp

embeding = w2v_cbow
encoder_pos = OneHotEncoder()
X_train = sp.hstack([
    embeding.transform(df_train.word),
    embeding.transform(df_train['next-word']),
    embeding.transform(df_train['next-next-word']),
    embeding.transform(df_train['prev-word']),
    embeding.transform(df_train['prev-prev-word']),
    encoder_pos.fit_transform(df_train[['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']])
])
X_test = sp.hstack([
    embeding.transform(df_test.word),
    embeding.transform(df_test['next-word']),
    embeding.transform(df_test['next-next-word']),
    embeding.transform(df_test['prev-word']),
    embeding.transform(df_test['prev-prev-word']),
    encoder_pos.transform(df_test[['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']])
])


model = Pipeline([
    ('est',RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = SEED))
])
model.fit(X_train, y_train)

print('train', metrics.f1_score(y_train, model.predict(X_train), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(X_test), average='macro'))

train 0.987980880067051
test 0.8242161071155022
CPU times: user 49.5 s, sys: 1.98 s, total: 51.4 s
Wall time: 51.8 s
