# Homework: Named entity recognition

Для заданной тестовой выборки построить модель для обнаружения и классификации именованных сущностей (named entities). На базе корпуса CoNLL 2002.  

Чем больше baseline'ов вы превзойдете, тем выше ваша оценка
Метрика качества f1 (f1_macro) (чем выше, тем лучше)
 
baseline 1: 0.0604      random labels  
baseline 2: 0.3966      PoS features + logistic regression  
baseline 3: 0.7559      word2vec cbow embedding + baseline 2 + svm    

Пока мы рассмотрели только линейные модели - поэтому в примерах есть только они. Желательно при решении домашнего задания пользоваться линейными моделями. Таким образом, основные цели задания - feature engineering, hyperparam tuning & model selection.

! Your results must be reproducible. Если ваша модель - стохастическая, то вы явно должны задавать все seed и random_state в параметрах моделей  
! Вы должны использовать df_test только для измерения качества конечной обученной модели. 

bonus, think about:  
1. how can you exploit that words belong to some sentence?
2. why we selected f1 score with macro averaging as our classification quality measure? What other metrics are suitable   

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


SEED=1337

In [2]:
df = pd.read_csv('ner_short.csv', index_col=0)
df.head()

Unnamed: 0,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,sentence_idx,word,tag
0,NNS,demonstrators,IN,of,NNS,__START1__,__START2__,__START2__,__START1__,1.0,Thousands,O
1,VBP,have,NNS,demonstrators,IN,NNS,__START1__,__START1__,Thousands,1.0,of,O
2,VBN,marched,VBP,have,NNS,IN,NNS,Thousands,of,1.0,demonstrators,O
3,IN,through,VBN,marched,VBP,NNS,IN,of,demonstrators,1.0,have,O
4,NNP,London,IN,through,VBN,VBP,NNS,demonstrators,have,1.0,marched,O


In [3]:
# number of sentences
df.sentence_idx.max()

1500.0

In [4]:
# class distribution
df.tag.value_counts(normalize=True )

O        0.852828
B-geo    0.027604
B-gpe    0.020935
B-org    0.020247
I-per    0.017795
B-tim    0.016927
B-per    0.015312
I-org    0.013937
I-geo    0.005383
I-tim    0.004247
B-art    0.001376
I-gpe    0.000837
I-art    0.000748
B-eve    0.000628
I-eve    0.000508
B-nat    0.000449
I-nat    0.000239
Name: tag, dtype: float64

In [5]:
# sentence length
tdf = df.set_index('sentence_idx')
tdf['length'] = df.groupby('sentence_idx').tag.count()
df = tdf.reset_index(drop=False)

In [6]:
# encode categorial variables

le = LabelEncoder()
df['pos'] = le.fit_transform(df.pos)
df['next-pos'] = le.fit_transform(df['next-pos'])
df['next-next-pos'] = le.fit_transform(df['next-next-pos'])
df['prev-pos'] = le.fit_transform(df['prev-pos'])
df['prev-prev-pos'] = le.fit_transform(df['prev-prev-pos'])

In [7]:
df.head()

Unnamed: 0,sentence_idx,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,word,tag,length
0,1.0,18,demonstrators,9,of,18,39,40,__START2__,__START1__,Thousands,O,48
1,1.0,33,have,18,demonstrators,9,18,39,__START1__,Thousands,of,O,48
2,1.0,32,marched,33,have,18,9,18,Thousands,of,demonstrators,O,48
3,1.0,9,through,32,marched,33,18,9,of,demonstrators,have,O,48
4,1.0,16,London,9,through,32,33,18,demonstrators,have,marched,O,48


In [8]:
# splitting
y = LabelEncoder().fit_transform(df.tag)

df_train, df_test, y_train, y_test = model_selection.train_test_split(df, y, stratify=y, 
                                                                      test_size=0.25, random_state=SEED, shuffle=True)
print('train', df_train.shape[0])
print('test', df_test.shape[0])

train 50155
test 16719


### Baseline 4

In [83]:
from sklearn import warnings
from sklearn.preprocessing import OneHotEncoder
OHE = OneHotEncoder(n_values=15000, categorical_features='all')

def OHEncode(df):
    return OHE.fit_transform(np.array(df).reshape(-1, 1))

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    enc_pos = OHEncode(df_train['pos'])
    enc_next_pos = OHEncode(df_train['next-pos'])
    enc_next_next_pos = OHEncode(df_train['next-next-pos'])
    enc_prev_pos = OHEncode(df_train['prev-pos'])
    enc_prev_prev_pos = OHEncode(df_train['prev-prev-pos'])

In [96]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(analyzer='char', ngram_range=(2,5), max_features = 2000)

enc_next_word = tfidf.fit_transform(df_train['next-word'])
enc_next_next_word = tfidf.fit_transform(df_train['next-next-word'])
enc_prev_word = tfidf.fit_transform(df_train['prev-word'])
enc_prev_prev_word = tfidf.fit_transform(df_train['prev-prev-word'])

In [97]:
import scipy.sparse as sp
X_train = sp.hstack((enc_pos, enc_next_pos, enc_next_next_pos,
                     enc_prev_pos, enc_prev_prev_pos, 
                    enc_next_word, enc_next_next_word,
                     enc_prev_word, enc_prev_prev_word))

In [98]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    enc_pos_test = OHEncode(df_test['pos'])
    enc_next_pos_test = OHEncode(df_test['next-pos'])
    enc_next_next_pos_test = OHEncode(df_test['next-next-pos'])
    enc_prev_pos_test = OHEncode(df_test['prev-pos'])
    enc_prev_prev_pos_test = OHEncode(df_test['prev-prev-pos'])

    enc_next_word_test = tfidf.transform(df_test['next-word'])
    enc_next_next_word_test = tfidf.transform(df_test['next-next-word'])
    enc_prev_word_test = tfidf.transform(df_test['prev-word'])
    enc_prev_prev_word_test = tfidf.transform(df_test['prev-prev-word'])
    
X_test = sp.hstack((enc_pos_test, enc_next_pos_test, enc_next_next_pos_test,
                     enc_prev_pos_test, enc_prev_prev_pos_test, 
                    enc_next_word_test, enc_next_next_word_test,
                     enc_prev_word_test, enc_prev_prev_word_test))

In [126]:
%%time
from sklearn.model_selection import GridSearchCV
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    clf = GridSearchCV(SGDClassifier(), 
                       {'loss': ('hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'),
                       'fit_intercept' : (True, False), 
                       'shuffle' : (True, False)}, 
                        verbose=2)
    clf.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] fit_intercept=True, loss=hinge, shuffle=True ....................
[CV] ..... fit_intercept=True, loss=hinge, shuffle=True, total=   0.8s
[CV] fit_intercept=True, loss=hinge, shuffle=True ....................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.9s remaining:    0.0s


[CV] ..... fit_intercept=True, loss=hinge, shuffle=True, total=   1.8s
[CV] fit_intercept=True, loss=hinge, shuffle=True ....................
[CV] ..... fit_intercept=True, loss=hinge, shuffle=True, total=   1.3s
[CV] fit_intercept=True, loss=hinge, shuffle=False ...................
[CV] .... fit_intercept=True, loss=hinge, shuffle=False, total=   1.0s
[CV] fit_intercept=True, loss=hinge, shuffle=False ...................
[CV] .... fit_intercept=True, loss=hinge, shuffle=False, total=   0.4s
[CV] fit_intercept=True, loss=hinge, shuffle=False ...................
[CV] .... fit_intercept=True, loss=hinge, shuffle=False, total=   0.5s
[CV] fit_intercept=True, loss=log, shuffle=True ......................
[CV] ....... fit_intercept=True, loss=log, shuffle=True, total=   3.1s
[CV] fit_intercept=True, loss=log, shuffle=True ......................
[CV] ....... fit_intercept=True, loss=log, shuffle=True, total=   2.6s
[CV] fit_intercept=True, loss=log, shuffle=True ......................
[CV] .

[CV]  fit_intercept=False, loss=perceptron, shuffle=False, total=   1.0s
[CV] fit_intercept=False, loss=perceptron, shuffle=False .............
[CV]  fit_intercept=False, loss=perceptron, shuffle=False, total=   1.0s


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:  1.9min finished


Wall time: 1min 58s


In [127]:
clf.best_params_

{'fit_intercept': False, 'loss': 'modified_huber', 'shuffle': True}

In [128]:
from sklearn.linear_model import SGDClassifier
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    model = SGDClassifier(fit_intercept=False,
                         loss='modified_huber',
                         penalty='l2',
                         shuffle=True,
                         random_state=SEED)
    model.fit(X_train, y_train)
    print('train', metrics.f1_score(y_train, model.predict(X_train), average='macro'))
    print('test', metrics.f1_score(y_test, model.predict(X_test), average='macro'))

train 0.898040560492
test 0.353134454584
