In [147]:
from nltk import TweetTokenizer
import re
import pandas as pd
import numpy as np

In [148]:
import sent2vec
model = sent2vec.Sent2vecModel()


In [149]:
model.load_model('embeddings/twitter_unigrams.bin')

In [172]:
train = pd.read_csv('data/train.csv').fillna(' ')
test = pd.read_csv('data/test.csv').fillna(' ')

In [173]:
train_prep = pd.read_csv('data/train_preprocessed.csv').fillna(' ')
test_prep = pd.read_csv('data/test_preprocessed.csv').fillna(' ')

In [174]:
train['comment_text'] = train_prep['comment_text']
test['comment_text'] = test['comment_text']

In [175]:
class_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

In [176]:
train['sent2vec_emb'] = train['comment_text'].map(lambda text: model.embed_sentence(text))

In [177]:
test['sent2vec_emb'] = test['comment_text'].map(lambda text: model.embed_sentence(text))

In [None]:
test.to_csv('data/test_s2v_emb.csv', index=False)
train.to_csv('data/train_s2v_emb.csv', index=False)

In [None]:
import telegram_send
telegram_send.send(['Sen2Vec saved'])

# Train LightGBM

In [178]:
import lightgbm as lgb
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score

In [180]:
lgm_params = {'learning_rate': 0.2,
              'application': 'binary',
              'num_leaves': 31,
#               'verbosity': 1,
              'metric': 'auc',
              'data_random_seed': 2,
              'bagging_fraction': 0.8,
              'feature_fraction': 0.6,
              'nthread': 12,
              'lambda_l1': 1,
              'lambda_l2': 1}

rounds_lookup = {'toxic': 140,
             'severe_toxic': 50,
             'obscene': 80,
             'threat': 80,
             'insult': 70,
             'identity_hate': 80}

In [196]:
X_train, X_test, y_train, y_test = train_test_split(np.stack(train['sent2vec_emb'].values, axis=0), train['threat'], test_size=0.1, random_state=42)

In [197]:
classifier = lgb.LGBMClassifier(**lgm_params, n_estimators=rounds_lookup['threat'])

In [198]:
classifier.fit(X_train, y_train)

LGBMClassifier(application='binary', bagging_fraction=0.8,
        boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        data_random_seed=2, feature_fraction=0.6, lambda_l1=1, lambda_l2=1,
        learning_rate=0.2, max_depth=-1, metric='auc',
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=80, n_jobs=-1, nthread=12, num_leaves=31,
        objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,
        silent=True, subsample=1.0, subsample_for_bin=200000,
        subsample_freq=1)

In [199]:
pred = classifier.predict_proba(X_test)[:,1]

In [200]:
roc_auc_score(y_test, pred)

0.9834147318601812

In [191]:
def training(train_indices, val_indices, class_name, params):   
    csr = np.stack(train['sent2vec_emb'].values, axis=0)
    X_train = csr[train_indices]
    y_train = np.array(train[class_name])[train_indices]
    
    X_test = csr[val_indices]
    y_test = np.array(train[class_name])[val_indices]
    
    classifier = lgb.LGBMClassifier(**lgm_params, n_estimators=params)
    classifier.fit(X_train, y_train)
    
    train_proba = classifier.predict_proba(X_train)[:, 1]
    val_proba = classifier.predict_proba(X_test)[:, 1]
    sub_proba = classifier.predict_proba(np.stack(test['sent2vec_emb'].values, axis=0))[:, 1]
    
    train_score = roc_auc_score(y_train, train_proba)
    val_score = roc_auc_score(y_test, val_proba)
    
    return train_score, val_score, val_proba, sub_proba, val_indices

In [192]:
predictors = 5

In [193]:
submission = pd.DataFrame.from_dict({'id': test['id']})
train_submission = pd.DataFrame.from_dict({'id': train['id']})

In [167]:
from tqdm import tqdm

predictors = 5
scores = []
for i, class_name in enumerate([class_names]):
    print('Class: %s' % class_name)
    
    sub_probas = np.zeros(shape=(len(test), ))
    train_probas = np.zeros(shape=(len(train), ))
    
    kf = KFold(n_splits=predictors, shuffle=True, random_state=42)
    
    train_scores, val_scores = [], []
    for train_indices, val_indices in kf.split(train):
        train_score, val_score, val_proba, sub_proba, val_indices = training(train_indices, val_indices, class_name, rounds_lookup[class_name])

        val_scores.append(val_score)

        train_probas[val_indices] += val_proba
        sub_probas += sub_proba / predictors
    
        scores.append(np.mean(val_scores))
    print('\tVal ROC-AUC: %s' % np.mean(val_scores))
    
    submission[class_name] = sub_probas
    train_submission[class_name] = train_probas
    
print('Total: %s' % np.mean(scores))

Class: toxic
	Val ROC-AUC: 0.9637572004981279
Class: severe_toxic
	Val ROC-AUC: 0.9813609426898129
Class: obscene
	Val ROC-AUC: 0.9734490591721849
Class: threat
	Val ROC-AUC: 0.5952066730214401
Class: insult
	Val ROC-AUC: 0.9703889429008807
Class: identity_hate
	Val ROC-AUC: 0.9744114660420191
Total: 0.9297635813510875


In [201]:
submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999455,0.362346,0.98494,0.414119,0.977943,0.895989
1,0000247867823ef7,0.012253,0.00096,0.010627,0.400042,0.009683,0.00108
2,00013b17ad220c46,0.076606,0.005751,0.033204,0.400328,0.036823,0.008138
3,00017563c3f7919a,0.001581,0.000333,0.002956,0.400005,0.002386,4.1e-05
4,00017695ad8997eb,0.029466,0.001444,0.013087,0.400063,0.035649,0.000388


In [202]:
submission.to_csv('data/submission_sen2vec.csv', index=False)
train_submission.to_csv('data/train__sen2vec.csv', index=False)