## Loading results from each model

In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import helpers
import matplotlib.pyplot as plt 

# FastText (supervised): predictions come in two files ("neg" and "pos").
# Both are renamed identically and stacked in that order.
fts_columns = {'confidence': 'fts_confidence', 'predicted_label': 'fts_label'}
fts0 = (
    pd.read_csv('stage2data/pred_train_neg_u_supervisedfasttext.csv')
    .drop(columns = ['Unnamed: 0'])
    .rename(columns = fts_columns)
)
fts1 = (
    pd.read_csv('stage2data/pred_train_pos_u_supervisedfasttext.csv')
    .drop(columns = ['Unnamed: 0'])
    .rename(columns = fts_columns)
)
# reset_index() moves the per-file row labels into an 'index' column; that
# column is dropped again right after the merge.
fts = pd.concat([fts0, fts1]).reset_index()

# RoBERTa: a single file that also carries the ground-truth label ('Real' -> 'y').
roberta_columns = {'Real': 'y', 'Logit_zero': 'roberta_logit0', 'Logit_one': 'roberta_logit1'}
roberta = pd.read_csv('stage2data/pred_train_full_u_RoBERTa.csv').rename(columns = roberta_columns)

# Rows are paired by row label (index-on-index merge).
# NOTE(review): this assumes every prediction file lists the tweets in the same order - confirm.
scores = roberta.merge(fts, left_index = True, right_index = True).drop(columns = 'index')

# TF-IDF + linear model: same neg/pos layout as FastText.
tfidf_columns = {'Decision_function': 'tfidf_decision'}
tfidf0 = (
    pd.read_csv('stage2data/pred_train_neg_u_tfidf.csv')
    .drop(columns = ['Unnamed: 0'])
    .rename(columns = tfidf_columns)
)
tfidf1 = (
    pd.read_csv('stage2data/pred_train_pos_u_tfidf.csv')
    .drop(columns = ['Unnamed: 0'])
    .rename(columns = tfidf_columns)
)
tfidf = pd.concat([tfidf0, tfidf1]).reset_index()

# Merge with the others
scores = scores.merge(tfidf, left_index = True, right_index = True).drop(columns = 'index')

# GloVe: single file, one prediction column.
glove = (
    pd.read_csv('stage2data/pred_train_full_u_GloVe.csv')
    .drop(columns = ['Unnamed: 0'])
    .rename(columns = {'prediction': 'glove_p'})
)

# Merge with the others
scores = scores.merge(glove, left_index = True, right_index = True)
scores

Unnamed: 0,y,roberta_logit0,roberta_logit1,fts_confidence,fts_label,tfidf_decision,glove_p
0,0,2.024000,-2.216630,0.948165,0,-0.972410,0.051527
1,0,1.847972,-2.043474,0.622469,0,-0.146072,0.032494
2,0,1.366330,-1.485626,0.760661,0,-0.208908,0.183120
3,0,1.409886,-1.536457,0.585111,0,0.046071,0.254687
4,0,3.839225,-3.837476,0.904661,0,-0.387037,0.091759
...,...,...,...,...,...,...,...
2270477,1,-2.177680,2.324035,0.980886,1,0.961604,0.935704
2270478,1,-0.288089,0.842729,0.615098,1,0.281498,0.296435
2270479,1,-2.889905,3.036457,0.968866,1,0.739848,0.989794
2270480,1,-1.997405,2.185867,0.831153,1,0.249147,0.329838


In [2]:
# Same for the test set: one file per model, renamed to the training column
# names and joined on the row index.
fts_t = (
    pd.read_csv('stage2data/pred_test_supervisedfasttext.csv')
    .drop(columns = ['Unnamed: 0'])
    .rename(columns = {'confidence': 'fts_confidence', 'predicted_label': 'fts_label'})
)
roberta_t = (
    pd.read_csv('stage2data/pred_test_RoBERTa.csv')
    .drop(columns = 'Idx')
    .rename(columns = {'Real': 'y', 'Logit_zero': 'roberta_logit0', 'Logit_one': 'roberta_logit1'})
)
tfidf_t = (
    pd.read_csv('stage2data/pred_test_tfidf.csv')
    .drop(columns = ['Unnamed: 0'])
    .rename(columns = {'Decision_function': 'tfidf_decision'})
)
glove_t = (
    pd.read_csv('stage2data/pred_test_GloVe.csv')
    .drop(columns = 'Unnamed: 0')
    .rename(columns = {'prediction': 'glove_p'})
)

# Join order matches the training table: RoBERTa, FastText, TF-IDF, GloVe.
scores_t = (
    roberta_t
    .merge(fts_t, left_index = True, right_index = True)
    .merge(tfidf_t, left_index = True, right_index = True)
    .merge(glove_t, left_index = True, right_index = True)
)
scores_t.head(3)

Unnamed: 0,roberta_logit0,roberta_logit1,fts_confidence,fts_label,tfidf_decision,glove_p
0,4.06247,-3.951884,0.993522,0,-1.313326,0.000892
1,1.979362,-2.170011,0.585111,0,-0.067237,0.380861
2,0.259373,0.106384,0.822199,0,-0.515623,0.043354


## Learning an optimal voting strategy

In [3]:
# Hold out roughly 15% of the rows for validation. The mask comes from a seeded
# generator so the split, and every accuracy printed below, is repeatable.
SPLIT_SEED = 0
msk = np.random.RandomState(SPLIT_SEED).rand(len(scores)) < 0.85

# Keep track of indices: store the row number in an 'index' column.
# Guarded and non-mutating so the cell can be re-run safely: a second
# reset_index() would insert a 'level_0' column that the later
# drop(columns = ['y', 'index']) calls would leave in the feature matrix.
if 'index' not in scores.columns:
    scores = scores.reset_index()
train = scores[msk]
val = scores[~msk]

# Logistic "voting" model over the six base-model scores with all their interactions.
# NOTE(review): a previously recorded run stopped at the 100-iteration cap - check
# the convergence warning before trusting the coefficients.
mod = smf.logit(formula='y ~ roberta_logit0 * roberta_logit1 * fts_confidence * fts_label * tfidf_decision * glove_p', data=train)
res = mod.fit(maxiter = 100)


# Hard predictions of each base model and of the stacker on the held-out rows.
y = val.y
predictions_roberta = val[['roberta_logit0', 'roberta_logit1']].values.argmax(axis = 1)  # larger logit wins
predictions_fts = val.fts_label
predictions_tfidf = (val.tfidf_decision) > 0      # sign of the decision function
predictions_glove = (val.glove_p) > 0.5           # threshold the predicted score
predictions_statsmodels = (res.predict(val) > 0.5)

print("Accuracy with each model & voting:")
print("FastText: {:.3f}%".format(100*(predictions_fts == y).mean()))
print("RoBERTa: {:.3f}%".format(100*(predictions_roberta == y).mean()))
print("TFIDF + LinearSVC: {:.3f}%".format(100*(predictions_tfidf == y).mean()))
print("GloVe + LTSM: {:.3f}%".format(100*(predictions_glove == y).mean()))
print("Voting with statsmodels: {:.3f}%".format(100*(predictions_statsmodels == y).mean()))

         Current function value: 0.208004
         Iterations: 100




Accuracy with each model & voting:
FastText: 86.393%
RoBERTa: 91.345%
TFIDF + LinearSVC: 84.908%
GloVe + LTSM: 86.224%
Voting with statsmodels: 91.577%


In [5]:
# Refit the logistic stacker on every labelled row and write its test-set
# predictions to disk. Set `save` to False to skip both the refit and the write.
save = True
if save:
    full_formula = 'y ~ roberta_logit0 * roberta_logit1 * fts_confidence * fts_label * tfidf_decision * glove_p'
    mod = smf.logit(formula = full_formula, data = scores)
    res = mod.fit(maxiter = 100)
    # Threshold the predicted probabilities at 0.5 and hand over a plain 0/1 array.
    test_probabilities = res.predict(scores_t)
    logistic_prediction = (test_probabilities > 0.5).astype(int).values
    helpers.save_pred('submissions/stage_2_statsmodels.csv', logistic_prediction)

         Current function value: 0.207934
         Iterations: 100




NameError: name 'opt_t' is not defined

## With more elaborate classifiers using scikit-learn

In [11]:
from sklearn.svm import LinearSVC

# Linear SVM stacker on the base-model scores. The label and the bookkeeping
# 'index' column are removed from the features.
# C = 1e9 is very large; per scikit-learn, larger C means weaker regularisation.
features_train = train.drop(columns = ['y', 'index'])
features_val = val.drop(columns = ['y', 'index'])

svc = LinearSVC(random_state=0, tol=1e-9, loss = 'squared_hinge', dual = False, C = 1e9)
svc.fit(features_train, train.y)

# Train accuracy, then validation accuracy.
svc_train_accuracy = (svc.predict(features_train) == train.y).mean()
svc_val_accuracy = (svc.predict(features_val) == val.y).mean()
print(svc_train_accuracy)
print(svc_val_accuracy)

0.9131779187624015
0.9130924876756678


In [12]:
from sklearn import linear_model

# Linear SVM trained by stochastic gradient descent (hinge loss).
# random_state is fixed so repeated runs give the same accuracies,
# matching the seeded LinearSVC cell.
sgd_svm = linear_model.SGDClassifier(loss = 'hinge', max_iter=int(1e7), tol=1e-5, verbose = False, random_state = 0)

# Features are the base-model scores; the label and the bookkeeping 'index'
# column are removed.
features_train = train.drop(columns = ['y', 'index'])
features_val = val.drop(columns = ['y', 'index'])

sgd_svm.fit(features_train, train.y)
# Train accuracy, then validation accuracy.
print((sgd_svm.predict(features_train) == train.y).mean())
print((sgd_svm.predict(features_val) == val.y).mean())

0.9135373725790604
0.9134898094327129


In [6]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees as the stage 2 classifier: 40 trees of depth 5.
GBC = GradientBoostingClassifier(n_estimators = 40, max_depth = 5)

# Features are the base-model scores; the label and the bookkeeping 'index'
# column are removed.
features_train = train.drop(columns = ['y', 'index'])
features_val = val.drop(columns = ['y', 'index'])
GBC.fit(features_train, train.y)

# Train accuracy, then validation accuracy.
gbc_train_accuracy = (GBC.predict(features_train) == train.y).mean()
gbc_val_accuracy = (GBC.predict(features_val) == val.y).mean()
print(gbc_train_accuracy)
print(gbc_val_accuracy)

0.9158068137876412
0.9159331450204526


In [15]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 10 trees. random_state is fixed so repeated runs give
# the same train/validation accuracies.
RFC = RandomForestClassifier(n_estimators = 10, random_state = 0)

# Features are the base-model scores; the label and the bookkeeping 'index'
# column are removed.
features_train = train.drop(columns = ['y', 'index'])
features_val = val.drop(columns = ['y', 'index'])

RFC.fit(features_train, train.y)
# Train accuracy, then validation accuracy.
print((RFC.predict(features_train) == train.y).mean())
print((RFC.predict(features_val) == val.y).mean())

0.9924110701416631
0.9060525347656537


In [18]:
from sklearn.neural_network import MLPClassifier

# Small neural network with one hidden layer of 10 units. random_state is
# fixed so repeated runs give the same accuracies.
MLP = MLPClassifier(hidden_layer_sizes = (10,), random_state = 0)

# Features are the base-model scores; the label and the bookkeeping 'index'
# column are removed.
features_train = train.drop(columns = ['y', 'index'])
features_val = val.drop(columns = ['y', 'index'])

MLP.fit(features_train, train.y)
# Train accuracy, then validation accuracy.
print((MLP.predict(features_train) == train.y).mean())
print((MLP.predict(features_val) == val.y).mean())

0.9156961672589368
0.9156412331690089


### Accuracy of each classifier

In [11]:
# The stage 2 classifier analysed in the following cells.
clf = GBC

# Hard predictions of every base model and of the stage 2 classifier on the
# validation rows.
y_val = val.y
predictions_roberta_val = np.argmax(val[['roberta_logit0', 'roberta_logit1']].values, axis = 1)
predictions_fts_val = val.fts_label
predictions_tfidf_val = val.tfidf_decision > 0
predictions_glove_val = val.glove_p > 0.5
stage_2_val = clf.predict(val.drop(columns = ['y', 'index']))

# One column per predictor, rows numbered from 0 (values are taken positionally).
val_results = pd.DataFrame({
    'Ground truth': np.asarray(y_val),
    'RoBERTa': np.asarray(predictions_roberta_val),
    'FastText': np.asarray(predictions_fts_val),
    'TFIDF': np.asarray(predictions_tfidf_val),
    'GloVe': np.asarray(predictions_glove_val),
    'stage2': np.asarray(stage_2_val),
})

print('Validation accuracy: ')
ground_truth = val_results['Ground truth']
for col in val_results.columns.drop('Ground truth'):
    accuracy_pct = 100*(val_results[col] == ground_truth).mean()
    print(col.ljust(25) +'{:.3f}%'.format(accuracy_pct))

Validation accuracy: 
RoBERTa                  91.345%
FastText                 86.393%
TFIDF                    84.908%
GloVe                    86.224%
stage2                   91.593%


### How many times does the stage 2 classifier agree with each model ?

In [14]:
# Element-wise agreement between the stage 2 prediction and the ground truth (gt)
# and each base model (ft, rb, tf, gl).
gt = y_val == stage_2_val
ft = predictions_fts_val == stage_2_val
rb = predictions_roberta_val == stage_2_val
tf = predictions_tfidf_val == stage_2_val
gl = predictions_glove_val == stage_2_val

# One row per validation sample; 'index' holds the sample's row label in `val`.
agree = pd.DataFrame({
    'Correct': np.asarray(gt),
    'FastText': np.asarray(ft),
    'RoBERTa': np.asarray(rb),
    'TFIDF': np.asarray(tf),
    'GloVe': np.asarray(gl),
    'index': val.index.values,
})

print('Agreement with stage2: ')
# Share of validation rows (in %) on which each base model matches stage 2.
print(agree[['FastText', 'RoBERTa', 'TFIDF', 'GloVe']].mean()*100)
agree

Agreement with stage2: 
FastText    89.890294
RoBERTa     99.047399
TFIDF       88.172037
GloVe       89.562700
dtype: float64


Unnamed: 0,Correct,FastText,RoBERTa,TFIDF,GloVe,index
0,True,True,True,True,True,0
1,True,True,True,True,True,1
2,True,False,True,False,False,5
3,True,True,True,True,True,6
4,True,False,True,False,False,11
...,...,...,...,...,...,...
341271,True,True,True,True,False,2270457
341272,True,True,True,True,True,2270463
341273,True,True,True,True,False,2270465
341274,True,True,True,True,True,2270479


### When the stage 2 classifier is wrong, which models did it agree with ?

In [15]:
# Keep only the validation rows the stage 2 classifier got wrong; the
# 'Correct' flag is constant on that subset, so it is dropped.
stage2_misclassified = agree.query('Correct == False')
wrong = stage2_misclassified.drop(columns = 'Correct')
wrong

Unnamed: 0,FastText,RoBERTa,TFIDF,GloVe,index
15,False,True,False,True,78
24,True,True,True,True,150
26,True,True,True,True,169
52,True,True,True,True,375
59,True,True,True,True,424
...,...,...,...,...,...
341233,False,True,True,True,2270168
341239,False,True,False,False,2270216
341251,True,True,True,True,2270283
341256,True,True,True,True,2270336


### How often were all the models wrong ? 

In [19]:
# Misclassified rows on which all four base models agreed with stage 2,
# i.e. every model gave the same wrong answer.
stage2_errors = agree.query('Correct == False').drop(columns = 'Correct')
wrong_all = stage2_errors.query('FastText & RoBERTa & TFIDF & GloVe')
wrong_all_indices = wrong_all['index'].values

unanimous_share = 100*len(wrong_all)/len(wrong)
print("All models agree on the wrong answer in {:.2f}% of misclassified cases!".format(unanimous_share))
wrong_all

All models agree on the wrong answer in 50.17% of misclassified cases!


Unnamed: 0,FastText,RoBERTa,TFIDF,GloVe,index
24,True,True,True,True,150
26,True,True,True,True,169
52,True,True,True,True,375
59,True,True,True,True,424
69,True,True,True,True,472
...,...,...,...,...,...
341033,True,True,True,True,2268769
341126,True,True,True,True,2269347
341207,True,True,True,True,2269999
341251,True,True,True,True,2270283


### What does a tweet that can fool 4 models look like ?

In [23]:
# Load the raw tweets (neg file first, then pos) so rows can be looked up by index.
# Note: despite its name, `test_tweets` is built from the two *train* files.
neg = np.array(helpers.txt_to_list('stage2data/train_neg_full_u.txt'))
pos = np.array(helpers.txt_to_list('stage2data/train_pos_full_u.txt'))
test_tweets = np.concatenate((neg, pos))

# Pair the label with the text for every row where all models were wrong together.
tricky_labels = scores.y.values[wrong_all_indices]
tricky_texts = test_tweets[wrong_all_indices]
tricky_tweets = pd.DataFrame(
    list(zip(tricky_labels, tricky_texts)),
    columns = ['Label', 'Tweet'],
).set_index('Tweet')
tricky_tweets.sample(25)

Unnamed: 0_level_0,Label
Tweet,Unnamed: 1_level_1
i wish rt <user> <user> awesome and sexy ?\n,0
<user> yeah i think so\n,0
my bad i'll take it back ! rt <user> <user> why would you steal my cake ! ! ! d:\n,1
<user> lmao yes ! im a fatass & youre a sociaaal freaak ! lol ~ . ~\n,0
<user> i'm sure it is very interesting ... i'll watch it a bit later ... too much noise in my office right now !\n,0
everyday everynight i am missing you :| you pushed me to that edge :| -> you're tattooed on my mind .. #wallpaper ehh ( lol\n,1
i won't b at the game today or travelling to london ... but have an amazing time to all my corrie friends love you lots xxx\n,0
it's better to be feared than to be loved\n,0
back in jefferson city * throws confetti *\n,0
"dear my crush , i still in love with you .\n",0


### Incorrect predictions without consensus

In [21]:
# Misclassified rows where at least one base model disagreed with stage 2.
stage2_mistakes = agree.query('Correct == False').drop(columns = 'Correct')
wrong_not_all = stage2_mistakes.query('not (FastText & RoBERTa & TFIDF & GloVe)')
wrong_not_all

Unnamed: 0,FastText,RoBERTa,TFIDF,GloVe,index
15,False,True,False,True,78
91,False,True,False,False,589
117,True,True,True,False,816
122,True,True,True,False,850
151,False,True,False,True,1059
...,...,...,...,...,...
341220,False,True,False,True,2270087
341225,False,True,False,False,2270106
341233,False,True,True,True,2270168
341239,False,True,False,False,2270216


### Predict on test set

In [24]:
# Display the merged test-set score table (one column per base-model output).
scores_t

Unnamed: 0,roberta_logit0,roberta_logit1,fts_confidence,fts_label,tfidf_decision,glove_p
0,4.062470,-3.951884,0.993522,0,-1.313326,0.000892
1,1.979362,-2.170011,0.585111,0,-0.067237,0.380861
2,0.259373,0.106384,0.822199,0,-0.515623,0.043354
3,-1.432103,1.773372,0.843905,1,0.173155,0.634026
4,3.315032,-3.392591,0.965910,0,-0.550986,0.024156
...,...,...,...,...,...,...
9995,-3.274207,3.391324,0.952584,1,0.692784,0.935005
9996,1.178508,-1.217516,0.936295,0,-0.790469,0.172268
9997,4.053185,-3.959083,0.998876,0,-1.667938,0.000059
9998,-2.157137,2.280871,0.979678,1,0.821969,0.950965


In [26]:
# Final stage 2 model: gradient-boosted trees refit on every labelled row.
# NOTE(review): this uses n_estimators = 60, while the model scored on the
# validation split above used 40 - confirm the larger value was validated.
GBC = GradientBoostingClassifier(max_depth = 5, n_estimators = 60)

# Features are the base-model scores; the label and the bookkeeping 'index'
# column are removed.
train_features = scores.drop(columns = ['y', 'index'])
GBC.fit(train_features, scores.y)

# Select the test columns by name, in the training order, so an extra or
# reordered column in scores_t cannot silently misalign the features.
test_features = scores_t[list(train_features.columns)]
predictions = GBC.predict(test_features).astype(int)
helpers.save_pred('submissions/stage_2_GBC.csv', predictions)

AttributeError: 'numpy.ndarray' object has no attribute 'values'

### Trying with XGBoost

In [159]:
import xgboost as xgb


# Feature matrices / labels for the train and validation splits.
X_train = train.drop(columns = ['y', 'index'])
y_train = train.y
X_val = val.drop(columns = ['y', 'index'])
y_val = val.y

# XGBoost configuration for the validation experiment.
xgb_params = dict(
    objective = 'binary:hinge',
    booster = 'dart',
    colsample_bytree = 1,
    learning_rate = 0.3,
    max_depth = 8,
    alpha = 1,
    n_estimators = 50,
    use_label_encoder = False,
    tree_method = 'exact',
    num_parallel_tree = 3,
)
xgbc = xgb.XGBClassifier(**xgb_params)

# Report the classification error on both splits after every boosting round.
monitored_sets = [(X_train, y_train), (X_val, y_val)]
xgbc.fit(X_train, y_train, eval_set = monitored_sets, eval_metric = 'error', verbose = True)

# Validation accuracy in percent.
print(100*(xgbc.predict(X_val) == y_val).mean())

[0]	validation_0-error:0.50334	validation_1-error:0.50338
[1]	validation_0-error:0.15555	validation_1-error:0.15609
[2]	validation_0-error:0.10338	validation_1-error:0.10382
[3]	validation_0-error:0.09381	validation_1-error:0.09385
[4]	validation_0-error:0.09049	validation_1-error:0.09077
[5]	validation_0-error:0.08871	validation_1-error:0.08908
[6]	validation_0-error:0.08744	validation_1-error:0.08777
[7]	validation_0-error:0.08639	validation_1-error:0.08672
[8]	validation_0-error:0.08591	validation_1-error:0.08633
[9]	validation_0-error:0.08563	validation_1-error:0.08595
[10]	validation_0-error:0.08537	validation_1-error:0.08572
[11]	validation_0-error:0.08521	validation_1-error:0.08551
[12]	validation_0-error:0.08499	validation_1-error:0.08540
[13]	validation_0-error:0.08490	validation_1-error:0.08530
[14]	validation_0-error:0.08478	validation_1-error:0.08512
[15]	validation_0-error:0.08473	validation_1-error:0.08507
[16]	validation_0-error:0.08470	validation_1-error:0.08502
[17]	va

In [165]:
# Full labelled dataset for the final XGBoost fit: features are the base-model
# scores, with the label and the bookkeeping 'index' column removed.
X = scores.drop(columns = ['y', 'index'])
y = scores.y

# Shuffle the rows with a seeded permutation so the run is repeatable.
# Both X and y are indexed positionally (.iloc) so they stay aligned
# whatever the index labels of `scores` are.
random_idxs = np.random.RandomState(0).permutation(len(y))
X = X.iloc[random_idxs]
y = y.iloc[random_idxs]

# The test features are used as loaded.
X_test = scores_t

In [166]:
# XGBoost configuration for the final fit on all labelled rows
# (max_depth = 10 and num_parallel_tree = 4 here).
xgb_full_params = dict(
    objective = 'binary:hinge',
    booster = 'dart',
    colsample_bytree = 1,
    learning_rate = 0.3,
    max_depth = 10,
    alpha = 1,
    n_estimators = 50,
    use_label_encoder = False,
    tree_method = 'exact',
    num_parallel_tree = 4,
)
xgbc = xgb.XGBClassifier(**xgb_full_params)

# Only the training data is monitored here, so the logged error is a training error.
xgbc.fit(X, y, eval_set = [(X, y)], eval_metric = 'error', verbose = True)

# Training accuracy in percent.
train_accuracy_pct = 100*(xgbc.predict(X) == y).mean()
print(train_accuracy_pct)

[0]	validation_0-error:0.50335
[1]	validation_0-error:0.15487
[2]	validation_0-error:0.10253
[3]	validation_0-error:0.09291
[4]	validation_0-error:0.09005
[5]	validation_0-error:0.08732
[6]	validation_0-error:0.08616
[7]	validation_0-error:0.08542
[8]	validation_0-error:0.08502
[9]	validation_0-error:0.08479
[10]	validation_0-error:0.08455
[11]	validation_0-error:0.08429
[12]	validation_0-error:0.08416
[13]	validation_0-error:0.08407
[14]	validation_0-error:0.08398
[15]	validation_0-error:0.08389
[16]	validation_0-error:0.08383
[17]	validation_0-error:0.08375
[18]	validation_0-error:0.08373
[19]	validation_0-error:0.08369
[20]	validation_0-error:0.08367
[21]	validation_0-error:0.08364
[22]	validation_0-error:0.08363
[23]	validation_0-error:0.08359
[24]	validation_0-error:0.08356
[25]	validation_0-error:0.08352
[26]	validation_0-error:0.08351
[27]	validation_0-error:0.08349
[28]	validation_0-error:0.08349
[29]	validation_0-error:0.08345
[30]	validation_0-error:0.08344
[31]	validation_0-

In [167]:
# Predict the test set with the XGBoost model fit on all labelled rows and
# write the submission file.
xgb_submission_path = 'submissions/stage_2_XGBoost.csv'
predictions = xgbc.predict(X_test)
helpers.save_pred(xgb_submission_path, predictions)

## Load simple data representation

In [42]:
import numpy as np

# Sentence-level embedding features, one .npy file per class (neg, then pos).
model = 'skipgram' # or cbow
X1_0 = np.load('stage2data/'+model + '_train_neg_full_u.npy')
X1_1 = np.load('stage2data/'+model + '_train_pos_full_u.npy')
X1 = np.concatenate((X1_0, X1_1))

# Base-model scores (label and bookkeeping 'index' column removed), appended
# column-wise to the embeddings.
X2 = scores.drop(columns = ['y', 'index']).values
X = np.concatenate((X1, X2), axis = 1)
y = scores['y'].values

# Shuffle rows with a seeded permutation so the experiment is repeatable.
# NOTE(review): helpers.split_dataset is not visible here; if it draws its own
# random numbers it needs a seed as well for full reproducibility.
random_idxs = np.random.RandomState(0).permutation(len(y))
X = X[random_idxs]
y = y[random_idxs].astype(int)

X_train, X_val, y_train, y_val = helpers.split_dataset(0.8, X, y)

In [48]:

# Linear SVM (hinge loss, SGD) on the combined embedding + score features.
# random_state is fixed so repeated runs give the same accuracies.
# A LinearSVC with squared hinge loss was the alternative considered here.
clf = linear_model.SGDClassifier(loss = 'hinge', max_iter=int(1e7), tol=1e-7, verbose = False, random_state = 0)

clf.fit(X_train, y_train)
# Train accuracy, then validation accuracy.
print((clf.predict(X_train)== y_train).mean())
print((clf.predict(X_val)== y_val).mean())

0.9136361509261528
0.9129723385091731


In [64]:
# Refit the same classifier on all rows and report its accuracy on that same data.
clf.fit(X, y)
full_fit_accuracy = (clf.predict(X) == y).mean()
print(full_fit_accuracy)

0.9134042903665389


In [65]:
from sklearn.ensemble import RandomForestClassifier

# Random forest (10 trees) on the combined embedding + score features.
# random_state is fixed so repeated runs give the same accuracies.
RFC = RandomForestClassifier(n_estimators = 10, random_state = 0)

RFC.fit(X_train, y_train)
# Train accuracy, then validation accuracy.
print((RFC.predict(X_train)== y_train).mean())
print((RFC.predict(X_val)== y_val).mean())

0.9930807620631089
0.9072202635119809
