## Loading results from each model

In [8]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import sys, os
sys.path.append(os.path.abspath(os.path.join('..'))) 
import helpers
import matplotlib.pyplot as plt 

# Loading FastText predictions
fts0 = pd.read_csv('../stage2data/pred_train_neg_u_supervisedfasttext.csv').drop(columns = ['Unnamed: 0']).rename(columns ={'confidence': 'fts_confidence', 'predicted_label': 'fts_label'})
fts1 = pd.read_csv('../stage2data/pred_train_pos_u_supervisedfasttext.csv').drop(columns = ['Unnamed: 0']).rename(columns = {'confidence': 'fts_confidence', 'predicted_label': 'fts_label'})
fts = pd.concat([fts0, fts1])
fts.reset_index(inplace = True)

# Load RoBERTa predictions
roberta = pd.read_csv('../stage2data/pred_train_full_u_RoBERTa.csv').rename(columns = {'Real': 'y', 'Logit_zero': 'roberta_logit0','Logit_one': 'roberta_logit1' })

# Merge both
scores = pd.merge(roberta, fts, left_index = True, right_index = True).drop(columns = 'index')

# Load TFIDF predictions
tfidf0 = pd.read_csv('../stage2data/pred_train_neg_u_tfidf.csv').drop(columns = ['Unnamed: 0']).rename(columns = {'Decision_function': 'tfidf_decision'})
tfidf1 = pd.read_csv('../stage2data/pred_train_pos_u_tfidf.csv').drop(columns = ['Unnamed: 0']).rename(columns = {'Decision_function': 'tfidf_decision'})
tfidf = pd.concat([tfidf0, tfidf1])
tfidf.reset_index(inplace = True)

# Merge with others
scores = pd.merge(scores, tfidf, left_index = True, right_index = True).drop(columns = 'index')

# Load GloVe predictions
glove = pd.read_csv('../stage2data/pred_train_full_u_GloVe.csv').drop(columns = ['Unnamed: 0']).rename(columns = {'prediction': 'glove_p'})

# Merge with othera
scores = pd.merge(scores, glove, left_index = True, right_index = True)
scores.to_csv('../stage2data/pred_all_models_train.csv')
scores

Unnamed: 0,y,roberta_logit0,roberta_logit1,fts_confidence,fts_label,tfidf_decision,glove_p
0,0,2.024000,-2.216630,0.948165,0,-1.220798,0.051527
1,0,1.847972,-2.043474,0.622469,0,-0.156154,0.032494
2,0,1.366330,-1.485626,0.760661,0,-0.431464,0.183120
3,0,1.409886,-1.536457,0.585111,0,-0.079840,0.254687
4,0,3.839225,-3.837476,0.904661,0,-0.583661,0.091759
...,...,...,...,...,...,...,...
2270477,1,-2.177680,2.324035,0.980886,1,1.084118,0.935704
2270478,1,-0.288089,0.842729,0.615098,1,0.279649,0.296435
2270479,1,-2.889905,3.036457,0.968866,1,0.842481,0.989794
2270480,1,-1.997405,2.185867,0.831153,1,0.335556,0.329838


In [9]:
# Same for test set
fts_t = pd.read_csv('../stage2data/pred_test_supervisedfasttext.csv').drop(columns = ['Unnamed: 0']).rename(columns = {'confidence': 'fts_confidence', 'predicted_label': 'fts_label'})
roberta_t = pd.read_csv('../stage2data/pred_test_RoBERTa.csv').drop(columns = 'Idx').rename(columns = {'Real': 'y', 'Logit_zero': 'roberta_logit0','Logit_one': 'roberta_logit1' })
scores_t = pd.merge(roberta_t, fts_t, left_index = True, right_index = True)
tfidf_t = pd.read_csv('../stage2data/pred_test_tfidf.csv').drop(columns = ['Unnamed: 0']).rename(columns = {'Decision_function': 'tfidf_decision'})
scores_t = pd.merge(scores_t, tfidf_t, left_index = True, right_index = True)
glove_t = pd.read_csv('../stage2data/pred_test_GloVe.csv').drop(columns = 'Unnamed: 0').rename(columns = {'prediction': 'glove_p'})
scores_t = pd.merge(scores_t, glove_t, left_index = True, right_index = True)
scores_t.to_csv('../stage2data/pred_all_models_test.csv')
scores_t.head(3)

Unnamed: 0,roberta_logit0,roberta_logit1,fts_confidence,fts_label,tfidf_decision,glove_p
0,4.06247,-3.951884,0.993522,0,-1.515606,0.000892
1,1.979362,-2.170011,0.585111,0,-0.072648,0.380861
2,0.259373,0.106384,0.822199,0,-0.473114,0.043354


## Learning an optimal voting strategy

In [10]:
msk = np.random.rand(len(scores)) < 0.85

scores.reset_index(inplace = True) # Keep track of indices
train = scores[msk]
val = scores[~msk]

mod = smf.logit(formula = 'y ~ roberta_logit0 * roberta_logit1 * fts_confidence * fts_label * tfidf_decision * glove_p', data=train)
res = mod.fit(maxiter = 150)


y = val.y
predictions_roberta = val[['roberta_logit0', 'roberta_logit1']].values.argmax(axis = 1)
predictions_fts = val.fts_label
predictions_tfidf = (val.tfidf_decision) > 0
predictions_glove = (val.glove_p) > 0.5
predictions_statsmodels = (res.predict(val) > 0.5)

print("Accuracy with each model & voting:")
print("FastText: {:.3f}%".format(100*(predictions_fts == y).mean()))
print("RoBERTa: {:.3f}%".format(100*(predictions_roberta == y).mean()))
print("TFIDF + LinearSVC: {:.3f}%".format(100*(predictions_tfidf == y).mean()))
print("GloVe + LTSM: {:.3f}%".format(100*(predictions_glove == y).mean()))
print("Voting with statsmodels: {:.3f}%".format(100*(predictions_statsmodels == y).mean()))

Optimization terminated successfully.
         Current function value: 0.207841
         Iterations 54
Accuracy with each model & voting:
FastText: 86.438%
RoBERTa: 91.376%
TFIDF + LinearSVC: 86.787%
GloVe + LTSM: 86.256%
Voting with statsmodels: 91.621%


In [11]:
# Redo with full dataset to save
save = True
if save:
    mod = smf.logit(formula = 'y ~ roberta_logit0 * roberta_logit1 * fts_confidence * fts_label * tfidf_decision * glove_p', data=scores)
    res = mod.fit(maxiter = 150)
    logistic_prediction = (res.predict(scores_t) > 0.5).astype(int).values
    helpers.save_pred('../submissions/stage_2_statsmodels.csv', logistic_prediction)

Optimization terminated successfully.
         Current function value: 0.207691
         Iterations 115


## With more elaborate classifiers using scikit-learn

In [12]:
from sklearn.svm import LinearSVC

svc = LinearSVC(random_state=0, tol=1e-9, loss = 'squared_hinge', dual = False, C = 1e9)
svc.fit(train.drop(columns = ['y', 'index']), train.y)
print((svc.predict(train.drop(columns = ['y', 'index']))== train.y).mean())
print((svc.predict(val.drop(columns = ['y', 'index'])) == val.y).mean())

0.9133303005087062
0.9140645191607341


In [13]:
from sklearn import linear_model

sgd_svm = linear_model.SGDClassifier(loss = 'hinge', max_iter=int(1e7), tol=1e-5, verbose = False)

sgd_svm.fit(train.drop(columns = ['y', 'index']), train.y)
print((sgd_svm.predict(train.drop(columns = ['y', 'index']))== train.y).mean())
print((sgd_svm.predict(val.drop(columns = ['y', 'index'])) == val.y).mean())

0.913408539093904
0.9143112238904161


In [14]:
from sklearn.ensemble import GradientBoostingClassifier

GBC = GradientBoostingClassifier(max_depth = 5, n_estimators = 40)

GBC.fit(train.drop(columns = ['y', 'index']), train.y)
print((GBC.predict(train.drop(columns = ['y', 'index']))== train.y).mean())
print((GBC.predict(val.drop(columns = ['y', 'index'])) == val.y).mean())

0.9159230546830716
0.9165668099903668


In [15]:
from sklearn.ensemble import RandomForestClassifier

RFC = RandomForestClassifier(n_estimators = 10)

RFC.fit(train.drop(columns = ['y', 'index']), train.y)
print((RFC.predict(train.drop(columns = ['y', 'index']))== train.y).mean())
print((RFC.predict(val.drop(columns = ['y', 'index'])) == val.y).mean())

0.9924854688667426
0.9068865863114118


In [16]:
from sklearn.neural_network import MLPClassifier

MLP = MLPClassifier(hidden_layer_sizes = (10,))
MLP.fit(train.drop(columns = ['y', 'index']), train.y)
print((MLP.predict(train.drop(columns = ['y', 'index']))== train.y).mean())
print((MLP.predict(val.drop(columns = ['y', 'index'])) == val.y).mean())

0.9155577685733738
0.9164463945865934


### Accuracy of each classifier

In [17]:
clf = GBC

y_val = val.y
predictions_roberta_val = val[['roberta_logit0', 'roberta_logit1']].values.argmax(axis = 1)
predictions_fts_val = val.fts_label
predictions_tfidf_val = (val.tfidf_decision) > 0
predictions_glove_val = (val.glove_p) > 0.5
stage_2_val = clf.predict(val.drop(columns = ['y', 'index']))

val_results = pd.DataFrame(list(zip(y_val, predictions_roberta_val, predictions_fts_val, predictions_tfidf_val, predictions_glove_val, stage_2_val)), 
                           columns = ['Ground truth', 'RoBERTa', 'FastText', 'TFIDF', 'GloVe', 'stage2'])

print('Validation accuracy: ')
for col in val_results.columns:
    if col != 'Ground truth':
        print(col.ljust(25) +'{:.3f}%'.format(100*(val_results[col] == val_results['Ground truth']).mean()))

Validation accuracy: 
RoBERTa                  91.376%
FastText                 86.438%
TFIDF                    86.787%
GloVe                    86.256%
stage2                   91.657%


### How many times does the stage 2 classifier agree with each model ?

In [18]:
gt = y_val == stage_2_val
ft = predictions_fts_val == stage_2_val
rb = predictions_roberta_val == stage_2_val
tf = predictions_tfidf_val == stage_2_val
gl = predictions_glove_val == stage_2_val

agree = pd.DataFrame(list(zip(gt, ft, rb, tf, gl, val.index.values)), columns = ['Correct', 'FastText', 'RoBERTa', 'TFIDF', 'GloVe', 'index'])
print('Agreement with stage2: ')
print(agree.drop(columns = ['Correct', 'index']).mean()*100)
agree

Agreement with stage2: 
FastText    90.030192
RoBERTa     98.778812
TFIDF       90.311847
GloVe       89.650443
dtype: float64


Unnamed: 0,Correct,FastText,RoBERTa,TFIDF,GloVe,index
0,True,True,True,True,True,0
1,True,False,True,False,False,5
2,True,True,True,True,True,12
3,False,True,True,True,False,13
4,True,True,True,True,True,31
...,...,...,...,...,...,...
340483,True,True,True,True,True,2270447
340484,True,True,True,True,True,2270450
340485,True,True,True,True,True,2270461
340486,True,True,True,True,True,2270469


### When the stage 2 classifier is wrong, which models did it agree with ?

In [19]:
wrong = agree.query('Correct == False').drop(columns = 'Correct')
wrong

Unnamed: 0,FastText,RoBERTa,TFIDF,GloVe,index
3,True,True,True,False,13
6,True,True,True,True,41
12,True,True,True,False,77
66,False,True,False,False,383
71,True,True,True,True,395
...,...,...,...,...,...
340462,True,True,True,True,2270312
340465,False,True,True,False,2270344
340467,False,True,False,True,2270370
340469,False,True,False,False,2270382


### How often were all the models wrong ? 

In [20]:
wrong_all = agree.query('Correct == False').drop(columns = 'Correct').query('FastText & RoBERTa & TFIDF & GloVe')
wrong_all_indices = wrong_all['index'].values
print("All models agree on the wrong answer in {:.2f}% of misclassified cases!".format(100*len(wrong_all)/len(wrong)))
wrong_all

All models agree on the wrong answer in 50.67% of misclassified cases!


Unnamed: 0,FastText,RoBERTa,TFIDF,GloVe,index
6,True,True,True,True,41
71,True,True,True,True,395
86,True,True,True,True,497
89,True,True,True,True,515
97,True,True,True,True,565
...,...,...,...,...,...
340150,True,True,True,True,2268184
340232,True,True,True,True,2268769
340412,True,True,True,True,2269999
340433,True,True,True,True,2270154


### What does a tweet that can fool 4 models look like ?

In [21]:
neg = np.array(helpers.txt_to_list('../stage2data/train_neg_full_u.txt'))
pos = np.array(helpers.txt_to_list('../stage2data/train_pos_full_u.txt'))
test_tweets = np.concatenate((neg, pos))
tricky_tweets = pd.DataFrame(list(zip(scores.y.values[wrong_all_indices], test_tweets[wrong_all_indices])), columns = ['Label', 'Tweet'])
tricky_tweets.set_index('Tweet', inplace = True)
tricky_tweets.sample(25)

Unnamed: 0_level_0,Label
Tweet,Unnamed: 1_level_1
<user> hey babe guess what . your commming to michigannn today i never new . fuck you . ohmy . i love you . <3 <3 forever and ever .\n,0
"<user> i guess it's fake ? but i regret nothing actually , because i've met awesome directioners there <3\n",0
when my tl is filled with tweets from myself\n,0
im thaa bhadd guyy\n,0
sorry i had too this is how i feel today ! ! <url>\n,1
<user> lol . maybe . congratulations on making a new friend ! :d\n,0
i don't want a fuck buddy ... i just want a cuddle buddy ! someone to just wrap me in their arms all night #nobutseriously #waytomyheart\n,1
<user> haven't had one of those yet .. not a big fan of doritos .. and already regretting visiting tb ..\n,1
i cant believe i was telling anthony .. i trust him now\n,1
<user> thanks robyn i thought i was your friend too !\n,0


### Incorrect predictions without consensus

In [22]:
wrong_not_all = agree.query('Correct == False').drop(columns = 'Correct').query('not (FastText & RoBERTa & TFIDF & GloVe)')
wrong_not_all

Unnamed: 0,FastText,RoBERTa,TFIDF,GloVe,index
3,True,True,True,False,13
12,True,True,True,False,77
66,False,True,False,False,383
72,True,True,True,False,396
91,False,True,False,True,522
...,...,...,...,...,...
340450,False,True,False,True,2270252
340465,False,True,True,False,2270344
340467,False,True,False,True,2270370
340469,False,True,False,False,2270382


### Predict on test set

In [23]:
scores_t

Unnamed: 0,roberta_logit0,roberta_logit1,fts_confidence,fts_label,tfidf_decision,glove_p
0,4.062470,-3.951884,0.993522,0,-1.515606,0.000892
1,1.979362,-2.170011,0.585111,0,-0.072648,0.380861
2,0.259373,0.106384,0.822199,0,-0.473114,0.043354
3,-1.432103,1.773372,0.843905,1,0.410372,0.634026
4,3.315032,-3.392591,0.965910,0,-0.738332,0.024156
...,...,...,...,...,...,...
9995,-3.274207,3.391324,0.952584,1,0.802763,0.935005
9996,1.178508,-1.217516,0.936295,0,-0.932303,0.172268
9997,4.053185,-3.959083,0.998876,0,-1.631948,0.000059
9998,-2.157137,2.280871,0.979678,1,0.928002,0.950965


In [24]:
GBC = GradientBoostingClassifier(max_depth = 5, n_estimators = 60)

GBC.fit(scores.drop(columns = ['y', 'index']), scores.y)
predictions = GBC.predict(scores_t).astype(int)
helpers.save_pred('../submissions/stage_2_GBC.csv', predictions)

### Trying with XGboost

In [53]:
import xgboost as xgb


X_train, y_train = train.drop(columns = ['y', 'index']), train.y
X_val, y_val = val.drop(columns = ['y', 'index']), val.y

xgbc = xgb.XGBClassifier(objective ='binary:hinge', booster = 'dart', colsample_bytree = 1, learning_rate = 0.3,
                max_depth = 10, alpha = 1, n_estimators = 20, use_label_encoder=False, tree_method = 'exact', 
                         num_parallel_tree = 2)

xgbc.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        eval_metric='error',
        verbose=True)

print(100*(xgbc.predict(X_val) == y_val).mean())

[0]	validation_0-error:0.50309	validation_1-error:0.50482
[1]	validation_0-error:0.15654	validation_1-error:0.15770
[2]	validation_0-error:0.10229	validation_1-error:0.10323
[3]	validation_0-error:0.09411	validation_1-error:0.09562
[4]	validation_0-error:0.08896	validation_1-error:0.09027
[5]	validation_0-error:0.08785	validation_1-error:0.08910
[6]	validation_0-error:0.08631	validation_1-error:0.08746
[7]	validation_0-error:0.08539	validation_1-error:0.08646
[8]	validation_0-error:0.08478	validation_1-error:0.08569
[9]	validation_0-error:0.08438	validation_1-error:0.08530
[10]	validation_0-error:0.08420	validation_1-error:0.08505
[11]	validation_0-error:0.08400	validation_1-error:0.08478
[12]	validation_0-error:0.08390	validation_1-error:0.08465
[13]	validation_0-error:0.08375	validation_1-error:0.08453
[14]	validation_0-error:0.08365	validation_1-error:0.08430
[15]	validation_0-error:0.08357	validation_1-error:0.08419
[16]	validation_0-error:0.08353	validation_1-error:0.08416
[17]	va

In [27]:
X, y = scores.drop(columns = ['y', 'index']), scores.y
random_idxs = np.random.permutation(len(y))
X = X.iloc[random_idxs]
y = y[random_idxs]
X_test = scores_t

In [55]:
xgbc = xgb.XGBClassifier(objective ='binary:hinge', booster = 'dart', colsample_bytree = 1, learning_rate = 0.3,
                max_depth = 10, alpha = 1, n_estimators = 50, use_label_encoder=False, tree_method = 'exact', 
                         num_parallel_tree = 4)

xgbc.fit(X, y,
        eval_set=[(X, y)],
        eval_metric='error',
        verbose=True)

print(100*(xgbc.predict(X) == y).mean())

[0]	validation_0-error:0.50335
[1]	validation_0-error:0.15500
[2]	validation_0-error:0.10298
[3]	validation_0-error:0.09395
[4]	validation_0-error:0.08944
[5]	validation_0-error:0.08734
[6]	validation_0-error:0.08613
[7]	validation_0-error:0.08517
[8]	validation_0-error:0.08471
[9]	validation_0-error:0.08438
[10]	validation_0-error:0.08406
[11]	validation_0-error:0.08388
[12]	validation_0-error:0.08379
[13]	validation_0-error:0.08359
[14]	validation_0-error:0.08346
[15]	validation_0-error:0.08343
[16]	validation_0-error:0.08339
[17]	validation_0-error:0.08335
[18]	validation_0-error:0.08334
[19]	validation_0-error:0.08331
[20]	validation_0-error:0.08327
[21]	validation_0-error:0.08323
[22]	validation_0-error:0.08320
[23]	validation_0-error:0.08317
[24]	validation_0-error:0.08318
[25]	validation_0-error:0.08318
[26]	validation_0-error:0.08314
[27]	validation_0-error:0.08312
[28]	validation_0-error:0.08311
[29]	validation_0-error:0.08309
[30]	validation_0-error:0.08307
[31]	validation_0-

In [57]:
predictions = xgbc.predict(X_test)
helpers.save_pred('../submissions/stage_2_XGBoost2.csv', predictions)