# Distinguishing comparison in sentences

## Data loading

In [1]:
import pandas as pd
from data_extraction import ExtractMiddlePart, ExtractRawSentence
from infersent.infersent_feature import initialize_infersent, InfersentFeature
from elmo.elmo_feature import initialize_elmo, ElmoFeature
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
import pickle

In [2]:
train = pd.read_csv("data/data.csv")
test = pd.read_csv("data/held-out-data.csv")

## BOW + XGBoost

### Full sentences

In [3]:
pl = make_pipeline(ExtractRawSentence(), CountVectorizer(), XGBClassifier(n_jobs=-1, n_estimators=1000))
fitted = pl.fit(train, train['most_frequent_label'].values)
predicted = fitted.predict(test)

In [4]:
print(classification_report(test['most_frequent_label'].values, predicted, labels=['BETTER', 'WORSE', 'NONE'], digits=3))

              precision    recall  f1-score   support

      BETTER      0.643     0.553     0.594       273
       WORSE      0.491     0.235     0.318       119
        NONE      0.839     0.919     0.877      1048

    accuracy                          0.793      1440
   macro avg      0.658     0.569     0.597      1440
weighted avg      0.773     0.793     0.777      1440



### Middle part of the sentence

In [5]:
pl = make_pipeline(ExtractMiddlePart(), CountVectorizer(), XGBClassifier(n_jobs=-1, n_estimators=1000))
fitted = pl.fit(train, train['most_frequent_label'].values)
predicted = fitted.predict(test)

In [6]:
print(classification_report(test['most_frequent_label'].values, predicted, labels=['BETTER', 'WORSE', 'NONE'], digits=3))

              precision    recall  f1-score   support

      BETTER      0.765     0.751     0.758       273
       WORSE      0.542     0.328     0.408       119
        NONE      0.903     0.948     0.925      1048

    accuracy                          0.859      1440
   macro avg      0.736     0.675     0.697      1440
weighted avg      0.847     0.859     0.850      1440



## InferSent + XGBoost

### Full sentences

In [7]:
full_sentences = ExtractRawSentence().transform(train)

In [8]:
infersent = initialize_infersent(full_sentences)



Loaded
Found 14943(/16494) words with glove vectors
Vocab size : 14943


In [9]:
pl = make_pipeline(ExtractRawSentence(), InfersentFeature(infersent), XGBClassifier(n_jobs=-1, n_estimators=1000))
fitted = pl.fit(train, train['most_frequent_label'].values)
predicted = fitted.predict(test)

  sentences[stidx:stidx + bsize]), volatile=True)


In [10]:
print(classification_report(test['most_frequent_label'].values, predicted, labels=['BETTER', 'WORSE', 'NONE'], digits=3))

              precision    recall  f1-score   support

      BETTER      0.708     0.586     0.641       273
       WORSE      0.556     0.210     0.305       119
        NONE      0.849     0.947     0.895      1048

    accuracy                          0.817      1440
   macro avg      0.704     0.581     0.614      1440
weighted avg      0.798     0.817     0.798      1440



### Middle part of the sentence

In [11]:
middle_part = ExtractMiddlePart().transform(train)

In [12]:
infersent = initialize_infersent(middle_part)

Loaded
Found 6021(/6555) words with glove vectors
Vocab size : 6021


In [13]:
pl = make_pipeline(ExtractMiddlePart(), InfersentFeature(infersent), XGBClassifier(n_jobs=-1, n_estimators=1000))
fitted = pl.fit(train, train['most_frequent_label'].values)
predicted = fitted.predict(test)

In [14]:
print(classification_report(test['most_frequent_label'].values, predicted, labels=['BETTER', 'WORSE', 'NONE'], digits=3))

              precision    recall  f1-score   support

      BETTER      0.768     0.751     0.759       273
       WORSE      0.553     0.353     0.431       119
        NONE      0.901     0.943     0.921      1048

    accuracy                          0.858      1440
   macro avg      0.740     0.682     0.704      1440
weighted avg      0.847     0.858     0.850      1440



## ELMo

In [3]:
elmo = initialize_elmo()

Loaded


In [4]:
batch_size = 50

Preliminary test to define which version of ELMo is more suitable for our task (Original or Original5.5B)

### Full sentences

In [6]:
pl = make_pipeline(ExtractRawSentence(), ElmoFeature(elmo, batch_size), XGBClassifier(n_jobs=-1, n_estimators=1000))
fitted = pl.fit(train, train['most_frequent_label'].values)
predicted = fitted.predict(test)

Original (5.5B)

In [7]:
print(classification_report(test['most_frequent_label'].values, predicted, labels=['BETTER', 'WORSE', 'NONE'], digits=3))

              precision    recall  f1-score   support

      BETTER      0.622     0.495     0.551       273
       WORSE      0.609     0.118     0.197       119
        NONE      0.823     0.943     0.879      1048

    accuracy                          0.790      1440
   macro avg      0.685     0.518     0.542      1440
weighted avg      0.767     0.790     0.760      1440



Original

In [6]:
print(classification_report(test['most_frequent_label'].values, predicted, labels=['BETTER', 'WORSE', 'NONE'], digits=3))

              precision    recall  f1-score   support

      BETTER      0.597     0.473     0.528       273
       WORSE      0.400     0.067     0.115       119
        NONE      0.822     0.945     0.879      1048

    accuracy                          0.783      1440
   macro avg      0.606     0.495     0.507      1440
weighted avg      0.745     0.783     0.749      1440



### Middle part of the sentence

In [8]:
pl = make_pipeline(ExtractMiddlePart(), ElmoFeature(elmo, batch_size), XGBClassifier(n_jobs=-1, n_estimators=1000))
fitted = pl.fit(train, train['most_frequent_label'].values)
predicted = fitted.predict(test)

Original (5.5B)

In [9]:
print(classification_report(test['most_frequent_label'].values, predicted, labels=['BETTER', 'WORSE', 'NONE'], digits=3))

              precision    recall  f1-score   support

      BETTER      0.715     0.718     0.717       273
       WORSE      0.578     0.218     0.317       119
        NONE      0.888     0.949     0.917      1048

    accuracy                          0.845      1440
   macro avg      0.727     0.629     0.650      1440
weighted avg      0.829     0.845     0.830      1440



Original

In [6]:
print(classification_report(test['most_frequent_label'].values, predicted, labels=['BETTER', 'WORSE', 'NONE'], digits=3))

              precision    recall  f1-score   support

      BETTER      0.736     0.685     0.710       273
       WORSE      0.558     0.244     0.339       119
        NONE      0.880     0.952     0.915      1048

    accuracy                          0.843      1440
   macro avg      0.725     0.627     0.655      1440
weighted avg      0.826     0.843     0.828      1440



### Creating ELMo embeddings of sentences for testing different classifiers

In [5]:
pl = make_pipeline(ExtractRawSentence(), ElmoFeature(elmo, batch_size))
pl = pl.fit(train, train['most_frequent_label'].values)
full_elmo_embs_train = pl.transform(train)
full_elmo_embs_test = pl.transform(test)

In [6]:
with open("elmo/full_train_features.pkl", "wb") as f:
    pickle.dump(full_elmo_embs_train, f)
with open("elmo/full_test_features.pkl", "wb") as f:
    pickle.dump(full_elmo_embs_test, f)

In [7]:
pl = make_pipeline(ExtractMiddlePart(), ElmoFeature(elmo, batch_size))
pl = pl.fit(train, train['most_frequent_label'].values)
mid_elmo_embs_train = pl.transform(train)
mid_elmo_embs_test = pl.transform(test)

In [8]:
with open("elmo/mid_train_features.pkl", "wb") as f:
    pickle.dump(mid_elmo_embs_train, f)
with open("elmo/mid_test_features.pkl", "wb") as f:
    pickle.dump(mid_elmo_embs_test, f)

### Tests with different classifiers

Results using only the middle part of the sentence are a lot better than the results with the use of the full sentences, so in further tests we will be using them.  
Also, the version of ELMo is Original (5.5B).

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV

LogisticRegression

In [22]:
clf = LogisticRegression(solver='liblinear')
params = {
    'C': [10**i for i in range(-3, 4)],
    'penalty' : ['l1', 'l2']
}
gs = GridSearchCV(clf, params, cv=5, n_jobs=-1, scoring='f1_micro', verbose=1)
gs.fit(mid_elmo_embs_train, train['most_frequent_label'].values)

name = clf.__class__.__name__
f1_CV = gs.best_score_
predicted = gs.predict(mid_elmo_embs_test)
print(f'{name} scored on CV {round(f1_CV, 3)} for {gs.best_params_}')
print('Classification_report')
print(classification_report(test['most_frequent_label'].values, predicted, labels=['BETTER', 'WORSE', 'NONE'], digits=3))

Fitting 5 folds for each of 14 candidates, totalling 70 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  22 tasks      | elapsed:   10.4s
[Parallel(n_jobs=5)]: Done  70 out of  70 | elapsed:  8.6min finished


LogisticRegression scored on CV 0.852 for {'C': 0.1, 'penalty': 'l2'}
Classification_report
              precision    recall  f1-score   support

      BETTER      0.746     0.766     0.756       273
       WORSE      0.600     0.328     0.424       119
        NONE      0.906     0.947     0.926      1048

    accuracy                          0.861      1440
   macro avg      0.751     0.680     0.702      1440
weighted avg      0.850     0.861     0.852      1440



KNeighborsClassifier

In [23]:
clf = KNeighborsClassifier()
params = {
    'n_neighbors': [3, 5, 10, 50, 100],
    'metric' : ['euclidean', 'minkowski']
}
gs = GridSearchCV(clf, params, cv=5, n_jobs=-1, scoring='f1_micro', verbose=1)
gs.fit(mid_elmo_embs_train, train['most_frequent_label'].values)

name = clf.__class__.__name__
f1_CV = gs.best_score_
predicted = gs.predict(mid_elmo_embs_test)
print(f'{name} scored on CV {round(f1_CV, 3)} for {gs.best_params_}')
print('Classification_report')
print(classification_report(test['most_frequent_label'].values, predicted, labels=['BETTER', 'WORSE', 'NONE'], digits=3))

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:  2.1min
[Parallel(n_jobs=5)]: Done  50 out of  50 | elapsed:  2.5min finished


KNeighborsClassifier scored on CV 0.816 for {'metric': 'euclidean', 'n_neighbors': 10}
Classification_report
              precision    recall  f1-score   support

      BETTER      0.617     0.707     0.659       273
       WORSE      0.429     0.076     0.129       119
        NONE      0.886     0.935     0.910      1048

    accuracy                          0.821      1440
   macro avg      0.644     0.573     0.566      1440
weighted avg      0.797     0.821     0.798      1440



RandomForestClassifier

In [25]:
clf = RandomForestClassifier()
params = {
    'n_estimators': [10, 100, 1000],
    'max_depth' : [3, 6, 10, 30]
}
gs = GridSearchCV(clf, params, cv=5, n_jobs=-1, scoring='f1_micro', verbose=1)
gs.fit(mid_elmo_embs_train, train['most_frequent_label'].values)

name = clf.__class__.__name__
f1_CV = gs.best_score_
predicted = gs.predict(mid_elmo_embs_test)
print(f'{name} scored on CV {round(f1_CV, 3)} for {gs.best_params_}')
print('Classification_report')
print(classification_report(test['most_frequent_label'].values, predicted, labels=['BETTER', 'WORSE', 'NONE'], digits=3))

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:  2.6min
[Parallel(n_jobs=5)]: Done  60 out of  60 | elapsed:  8.3min finished


RandomForestClassifier scored on CV 0.829 for {'max_depth': 30, 'n_estimators': 1000}
Classification_report
              precision    recall  f1-score   support

      BETTER      0.719     0.637     0.676       273
       WORSE      1.000     0.008     0.017       119
        NONE      0.852     0.973     0.909      1048

    accuracy                          0.830      1440
   macro avg      0.857     0.540     0.534      1440
weighted avg      0.839     0.830     0.791      1440



SVC

In [28]:
clf = SVC()
params = {
    'C': [10**i for i in range(-3, 4)],
    'kernel' : ['linear', 'rbf']
}
gs = GridSearchCV(clf, params, cv=5, n_jobs=-1, scoring='f1_micro', verbose=1)
gs.fit(mid_elmo_embs_train, train['most_frequent_label'].values)

name = clf.__class__.__name__
f1_CV = gs.best_score_
predicted = gs.predict(mid_elmo_embs_test)
print(f'{name} scored on CV {round(f1_CV, 3)} for {gs.best_params_}')
print('Classification_report')
print(classification_report(test['most_frequent_label'].values, predicted, labels=['BETTER', 'WORSE', 'NONE'], digits=3))

Fitting 5 folds for each of 14 candidates, totalling 70 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:  3.3min
[Parallel(n_jobs=5)]: Done  70 out of  70 | elapsed:  5.5min finished


SVC scored on CV 0.858 for {'C': 10, 'kernel': 'rbf'}
Classification_report
              precision    recall  f1-score   support

      BETTER      0.751     0.740     0.745       273
       WORSE      0.617     0.244     0.349       119
        NONE      0.892     0.957     0.924      1048

    accuracy                          0.857      1440
   macro avg      0.753     0.647     0.673      1440
weighted avg      0.843     0.857     0.842      1440



XGBoost

In [None]:
clf = XGBClassifier()
params = {
    'n_estimators': [10, 100, 1000],
    'max_depth' : [3, 6, 10]
}
gs = GridSearchCV(clf, params, cv=5, n_jobs=-1, scoring='f1_micro', verbose=1)
gs.fit(mid_elmo_embs_train, train['most_frequent_label'].values)

name = clf.__class__.__name__
f1_CV = gs.best_score_
predicted = gs.predict(mid_elmo_embs_test)
print(f'{name} scored on CV {round(f1_CV, 3)} for {gs.best_params_}')
print('Classification_report')
print(classification_report(test['most_frequent_label'].values, predicted, labels=['BETTER', 'WORSE', 'NONE'], digits=3))