<a href="https://colab.research.google.com/github/AnnaZhuravleva/compling/blob/master/assignment_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Assignment 4: Named entity recognition

Create a model for Named Entity Recognition for dataset CoNLL 2002.  
Your quality metric = f1_macro

In your solution you should use: RandomForest, Gradient Boosting (xgboost, lightgbm, catboost)   
Tutorials:  
1. https://github.com/Microsoft/LightGBM/tree/master/examples/python-guide
1. https://github.com/catboost/tutorials 

More baselines you beat - better your score
 
baseline 1 [3 points]: 0.0604      random labels  
baseline 2 [5 points]: 0.3966      PoS features + logistic regression  
baseline 3 [8 points]: 0.8122      word2vec cbow embedding + baseline 2 + svm    

[1 point] using feature engineering (creating features not presented in the baselines)

! Your results must be reproducible. You should explicitly set all seeds random_states in yout model.  
! Remember to use proper training pipeline.  

bonus, think about:  
1. [1 point] Why did we select f1 score with macro averaging as our classification quality measure? What other metrics are suitable?   

In [0]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV as GSCV

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


SEED=1337

In [2]:
from google.colab import drive
import sys
drive.mount('/content/drive')
project_path = '/content/drive/My Drive/Colab Notebooks/compling'
sys.path.append(project_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/compling/ner_short.csv', index_col=0)
df.head()

Unnamed: 0,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,sentence_idx,word,tag
0,NNS,demonstrators,IN,of,NNS,__START1__,__START2__,__START2__,__START1__,1.0,Thousands,O
1,VBP,have,NNS,demonstrators,IN,NNS,__START1__,__START1__,Thousands,1.0,of,O
2,VBN,marched,VBP,have,NNS,IN,NNS,Thousands,of,1.0,demonstrators,O
3,IN,through,VBN,marched,VBP,NNS,IN,of,demonstrators,1.0,have,O
4,NNP,London,IN,through,VBN,VBP,NNS,demonstrators,have,1.0,marched,O


In [4]:
# number of sentences
df.sentence_idx.max()

1500.0

In [5]:
# class distribution
df.tag.value_counts(normalize=True )

O        0.852828
B-geo    0.027604
B-gpe    0.020935
B-org    0.020247
I-per    0.017795
B-tim    0.016927
B-per    0.015312
I-org    0.013937
I-geo    0.005383
I-tim    0.004247
B-art    0.001376
I-gpe    0.000837
I-art    0.000748
B-eve    0.000628
I-eve    0.000508
B-nat    0.000449
I-nat    0.000239
Name: tag, dtype: float64

In [0]:
# sentence length
tdf = df.set_index('sentence_idx')
tdf['length'] = df.groupby('sentence_idx').sentence_idx.count()
df = tdf.reset_index(drop=False)

In [7]:
df

Unnamed: 0,sentence_idx,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,word,tag,length
0,1.0,NNS,demonstrators,IN,of,NNS,__START1__,__START2__,__START2__,__START1__,Thousands,O,48
1,1.0,VBP,have,NNS,demonstrators,IN,NNS,__START1__,__START1__,Thousands,of,O,48
2,1.0,VBN,marched,VBP,have,NNS,IN,NNS,Thousands,of,demonstrators,O,48
3,1.0,IN,through,VBN,marched,VBP,NNS,IN,of,demonstrators,have,O,48
4,1.0,NNP,London,IN,through,VBN,VBP,NNS,demonstrators,have,marched,O,48
...,...,...,...,...,...,...,...,...,...,...,...,...,...
66869,1500.0,NN,back,JJ,serious,DT,IN,VBN,hospitalized,for,a,O,30
66870,1500.0,NN,injury,NN,back,JJ,DT,IN,for,a,serious,O,30
66871,1500.0,.,.,NN,injury,NN,JJ,DT,a,serious,back,O,30
66872,1500.0,__END1__,__END1__,.,.,NN,NN,JJ,serious,back,injury,O,30


In [0]:
# encode categorial variables

le = LabelEncoder()
df['pos'] = le.fit_transform(df.pos)
df['next-pos'] = le.fit_transform(df['next-pos'])
df['next-next-pos'] = le.fit_transform(df['next-next-pos'])
df['prev-pos'] = le.fit_transform(df['prev-pos'])
df['prev-prev-pos'] = le.fit_transform(df['prev-prev-pos'])

In [9]:
df.head(30)

Unnamed: 0,sentence_idx,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,word,tag,length
0,1.0,18,demonstrators,9,of,18,39,40,__START2__,__START1__,Thousands,O,48
1,1.0,33,have,18,demonstrators,9,18,39,__START1__,Thousands,of,O,48
2,1.0,32,marched,33,have,18,9,18,Thousands,of,demonstrators,O,48
3,1.0,9,through,32,marched,33,18,9,of,demonstrators,have,O,48
4,1.0,16,London,9,through,32,33,18,demonstrators,have,marched,O,48
5,1.0,28,to,16,London,9,32,33,have,marched,through,O,48
6,1.0,29,protest,28,to,16,9,32,marched,through,London,B-geo,48
7,1.0,7,the,29,protest,28,16,9,through,London,to,O,48
8,1.0,15,war,7,the,29,28,16,London,to,protest,O,48
9,1.0,9,in,15,war,7,29,28,to,protest,the,O,48


In [10]:
# splitting
y = LabelEncoder().fit_transform(df.tag)

df_train, df_test, y_train, y_test = model_selection.train_test_split(df, y, stratify=y, 
                                                                      test_size=0.25, random_state=SEED, shuffle=True)
print('train', df_train.shape[0])
print('test', df_test.shape[0])

train 50155
test 16719


In [0]:
# some wrappers to work with word2vec
from gensim.models.word2vec import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import TransformerMixin
from collections import defaultdict

   
class Word2VecWrapper(TransformerMixin):
    def __init__(self, window=5,negative=5, size=100, iter=100, is_cbow=False, random_state=SEED):
        self.window_ = window
        self.negative_ = negative
        self.size_ = size
        self.iter_ = iter
        self.is_cbow_ = is_cbow
        self.w2v = None
        self.random_state = random_state
        
    def get_size(self):
        return self.size_

    def fit(self, X, y=None):
        """
        X: list of strings
        """
        sentences_list = [x.split() for x in X]
        self.w2v = Word2Vec(sentences_list, 
                            window=self.window_,
                            negative=self.negative_, 
                            size=self.size_, 
                            iter=self.iter_,
                            sg=not self.is_cbow_, seed=self.random_state)

        return self
    
    def has(self, word):
        return word in self.w2v

    def transform(self, X):
        """
        X: a list of words
        """
        if self.w2v is None:
            raise Exception('model not fitted')
        return np.array([self.w2v[w] if w in self.w2v else np.zeros(self.size_) for w in X ])
  

In [12]:
%%time
# here we exploit that word2vec is an unsupervised learning algorithm
# so we can train it on the whole dataset (subject to discussion)

sentences_list = [x.strip() for x in ' '.join(df.word).split('.')]

w2v_cbow = Word2VecWrapper(window=5, negative=5, size=300, iter=300, is_cbow=True, random_state=SEED)
w2v_cbow.fit(sentences_list)

CPU times: user 48.3 s, sys: 472 ms, total: 48.8 s
Wall time: 27.2 s


In [13]:
%%time
# baseline 1 
# random labels
from sklearn.preprocessing import OneHotEncoder
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier


columns = ['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']

model = Pipeline([
    ('enc', OneHotEncoder()),
    ('est',RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = SEED))
   # ('est', DummyClassifier(random_state=SEED))
])

model.fit(df_train[columns], y_train)

print('train', metrics.f1_score(y_train, model.predict(df_train[columns]), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(df_test[columns]), average='macro'))


train 0.7425836587782345
test 0.5863944082579811
CPU times: user 2.54 s, sys: 16 ms, total: 2.56 s
Wall time: 2.57 s


just try to use catboost

In [14]:
!pip install catboost
from catboost import CatBoostClassifier
columns = ['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']
model = CatBoostClassifier()
model.fit(df_train[columns], y_train)
print('train', metrics.f1_score(y_train, model.predict(df_train[columns]), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(df_test[columns]), average='macro'))

0:	learn: 2.4460573	total: 162ms	remaining: 2m 41s
1:	learn: 2.2030895	total: 269ms	remaining: 2m 14s
2:	learn: 2.0229944	total: 384ms	remaining: 2m 7s
3:	learn: 1.8784822	total: 490ms	remaining: 2m 2s
4:	learn: 1.7702973	total: 601ms	remaining: 1m 59s
5:	learn: 1.6688457	total: 712ms	remaining: 1m 58s
6:	learn: 1.5809830	total: 823ms	remaining: 1m 56s
7:	learn: 1.5035829	total: 932ms	remaining: 1m 55s
8:	learn: 1.4357570	total: 1.04s	remaining: 1m 54s
9:	learn: 1.3714008	total: 1.15s	remaining: 1m 54s
10:	learn: 1.3148094	total: 1.27s	remaining: 1m 54s
11:	learn: 1.2620485	total: 1.38s	remaining: 1m 53s
12:	learn: 1.2137771	total: 1.49s	remaining: 1m 52s
13:	learn: 1.1694000	total: 1.59s	remaining: 1m 52s
14:	learn: 1.1300471	total: 1.7s	remaining: 1m 51s
15:	learn: 1.0951468	total: 1.81s	remaining: 1m 51s
16:	learn: 1.0594521	total: 1.92s	remaining: 1m 51s
17:	learn: 1.0269822	total: 2.03s	remaining: 1m 50s
18:	learn: 0.9960648	total: 2.14s	remaining: 1m 50s
19:	learn: 0.9666449	tota

just try to use XGBClassifier

In [15]:
import xgboost
from xgboost import XGBClassifier as xgb

columns = ['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']
model = Pipeline([
    ('enc', OneHotEncoder()),
    ('est',  xgb())
])

model.fit(df_train[columns], y_train)
print('train', metrics.f1_score(y_train, model.predict(df_train[columns]), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(df_test[columns]), average='macro'))

train 0.39326124218078107
test 0.3353187957066505


In [16]:
%%time
# baseline 2 
# pos features + one hot encoding + logistic regression
from sklearn.preprocessing import OneHotEncoder


columns = ['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']

model = Pipeline([
    ('enc', OneHotEncoder()),
    ('est',RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = SEED))
    #('est', LogisticRegressionCV(Cs=5, cv=5, n_jobs=-1, scoring='f1_macro', 
    #                         penalty='l2', solver='newton-cg', multi_class='multinomial', random_state=SEED)),
])

model.fit(df_train[columns], y_train)

print('score to beat - 0.3966')
print('train', metrics.f1_score(y_train, model.predict(df_train[columns]), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(df_test[columns]), average='macro'))

score to beat - 0.3966
train 0.7425836587782345
test 0.5863944082579811
CPU times: user 2.54 s, sys: 11 ms, total: 2.56 s
Wall time: 2.56 s


In [17]:
%%time
# baseline 3
# use word2vec cbow embedding + baseline 2 + svm
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import LinearSVC
import scipy.sparse as sp

embeding = w2v_cbow
encoder_pos = OneHotEncoder()
X_train = sp.hstack([
    embeding.transform(df_train.word),
    embeding.transform(df_train['next-word']),
    embeding.transform(df_train['next-next-word']),
    embeding.transform(df_train['prev-word']),
    embeding.transform(df_train['prev-prev-word']),
    encoder_pos.fit_transform(df_train[['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']])
])
X_test = sp.hstack([
    embeding.transform(df_test.word),
    embeding.transform(df_test['next-word']),
    embeding.transform(df_test['next-next-word']),
    embeding.transform(df_test['prev-word']),
    embeding.transform(df_test['prev-prev-word']),
    encoder_pos.transform(df_test[['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']])
])

# model = model_selection.GridSearchCV(LinearSVC(penalty='l2', multi_class='ovr', random_state=SEED), 
#                                    {'C': np.logspace(-4, 0, 5)}, 
#                                    cv=3, scoring='f1_macro', n_jobs=-1, verbose=1)

model = Pipeline([
    ('est',RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = SEED))
])
model.fit(X_train, y_train)

print('train', metrics.f1_score(y_train, model.predict(X_train), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(X_test), average='macro'))

train 0.9884754844600314
test 0.8206240392482764
CPU times: user 1min 11s, sys: 656 ms, total: 1min 11s
Wall time: 1min 11s


In [0]:
#columns = ['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']
#model = Pipeline([('est', xgb())])

#model.fit(X_train, y_train)
#print('train', metrics.f1_score(y_train, model.predict(X_train), average='macro'))
#print('test', metrics.f1_score(y_test, model.predict(X_test), average='macro'))

*try to create new feature 'upper_case'*

In [0]:
test_df = df

# add label 'upper_case' wheter word starts with upper case letter

test_df['upper_case'] = [1 if item[0].isupper() else 0 for item in test_df['word']]

In [20]:
# encode categorial variables

le = LabelEncoder()
test_df['pos'] = le.fit_transform(df.pos)
test_df['next-pos'] = le.fit_transform(df['next-pos'])
test_df['next-next-pos'] = le.fit_transform(df['next-next-pos'])
test_df['prev-pos'] = le.fit_transform(df['prev-pos'])
test_df['prev-prev-pos'] = le.fit_transform(df['prev-prev-pos'])
test_df['upper_case'] = le.fit_transform(df['upper_case'])
# splitting
test_y = LabelEncoder().fit_transform(test_df.tag)

test_df_train, test_df_test, test_y_train, test_y_test = model_selection.train_test_split(test_df, test_y, stratify=test_y, 
                                                                      test_size=0.25, random_state=SEED, shuffle=True)
print('train', df_train.shape[0])
print('test', df_test.shape[0])

train 50155
test 16719


In [21]:
test_df

Unnamed: 0,sentence_idx,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,word,tag,length,upper_case
0,1.0,18,demonstrators,9,of,18,39,40,__START2__,__START1__,Thousands,O,48,1
1,1.0,33,have,18,demonstrators,9,18,39,__START1__,Thousands,of,O,48,0
2,1.0,32,marched,33,have,18,9,18,Thousands,of,demonstrators,O,48,0
3,1.0,9,through,32,marched,33,18,9,of,demonstrators,have,O,48,0
4,1.0,16,London,9,through,32,33,18,demonstrators,have,marched,O,48,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66869,1500.0,15,back,10,serious,7,9,32,hospitalized,for,a,O,30,0
66870,1500.0,15,injury,15,back,10,7,9,for,a,serious,O,30,0
66871,1500.0,2,.,15,injury,15,10,7,a,serious,back,O,30,0
66872,1500.0,39,__END1__,2,.,15,15,10,serious,back,injury,O,30,0


In [22]:
%%time

embeding = w2v_cbow
encoder_pos = OneHotEncoder()
test_X_train = sp.hstack([
    embeding.transform(test_df_train.word),
    embeding.transform(test_df_train['next-word']),
    embeding.transform(test_df_train['next-next-word']),
    embeding.transform(test_df_train['prev-word']),
    embeding.transform(test_df_train['prev-prev-word']),
    encoder_pos.fit_transform(test_df_train[['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos', 'upper_case']])
])
test_X_test = sp.hstack([
    embeding.transform(test_df_test.word),
    embeding.transform(test_df_test['next-word']),
    embeding.transform(test_df_test['next-next-word']),
    embeding.transform(test_df_test['prev-word']),
    embeding.transform(test_df_test['prev-prev-word']),
    encoder_pos.transform(test_df_test[['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos', 'upper_case']])
])

model = Pipeline([
    ('est',RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = SEED))
])
model.fit(test_X_train, test_y_train)

print('train', metrics.f1_score(test_y_train, model.predict(test_X_train), average='macro'))
print('test', metrics.f1_score(test_y_test, model.predict(test_X_test), average='macro'))

train 0.9892644329152468
test 0.8404846501913203
CPU times: user 1min 1s, sys: 344 ms, total: 1min 1s
Wall time: 1min 1s
