In [1]:
from fastText.FastText import train_supervised, fasttext, load_model

In [2]:
import numpy as np
import pandas as pd
import re
import subprocess

# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold
from scipy.sparse import hstack
from sklearn.metrics import roc_auc_score, classification_report
from datetime import datetime

In [13]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

train = pd.read_csv('data/train_preprocessed.csv').fillna(' ')
test = pd.read_csv('data/test_preprocessed.csv').fillna(' ')

tr_ids = train[['id']]
train[class_names] = train[class_names].astype(np.int8)
target = train[class_names]

In [14]:
def get_probs(rez):
    probs = []
    for r, prob in zip(rez[0], rez[1]):
        if r[0][-1] == '1':
            probs.append(prob[0])
        else:
            probs.append(1 - prob[0])
    return probs

In [15]:
def training(train_data, train_indices, val_indices, target, test_data):
    
    df_train = train_data.loc[train_indices]
    df_val = train_data.loc[val_indices]
    
    df_train[target + '_ft'] = df_train[target].apply(lambda x: '__label__1 ' if x == 1 else '__label__0 ')
    df_train[[target + '_ft', 'comment_text']].to_csv('train_fastText.csv', index=False, header=False)
    
    d = subprocess.Popen("/home/ladmin/fastText-0.1.0/fasttext supervised -input /home/ladmin/toxic_comments/train_fastText.csv -output /home/ladmin/toxic_comments/fasttext_model -pretrainedVectors /home/ladmin/toxic_comments/embeddings/crawl-300d-2M.vec -loss hs -minCount 5 -dim 300".split())
    d.communicate()
    classifier = load_model('fasttext_model.bin')

    
    val_proba = np.array(get_probs(classifier.predict(list(df_val['comment_text']))))
    sub_proba = np.array(get_probs(classifier.predict(list(test_data['comment_text']))))
    
#     train_score = roc_auc_score(df_train[target], train_proba)
    val_score = roc_auc_score(df_val[target], val_proba)
    
    return val_score, val_proba, sub_proba, val_indices

In [16]:
submission = pd.DataFrame.from_dict({'id': test['id']})
train_submission = pd.DataFrame.from_dict({'id': train['id']})

In [17]:
predictors = 5
scores = []
for i, class_name in enumerate(class_names):
    print('Class: %s' % class_name)
    
    sub_probas = np.zeros(shape=(len(test), ))
    train_probas = np.zeros(shape=(len(train), ))
    
    kf = KFold(n_splits=predictors, shuffle=True, random_state=42)
    
    train_scores, val_scores = [], []
    for train_indices, val_indices in kf.split(train):
        val_score, val_proba, sub_proba, val_indices = training(train, train_indices, val_indices, class_name, test)

        val_scores.append(val_score)

        train_probas[val_indices] += val_proba
        sub_probas += sub_proba / predictors
    
        scores.append(np.mean(val_scores))
    print('\tVal ROC-AUC: %s' % np.mean(val_scores))
    
    submission[class_name] = sub_probas
    train_submission[class_name] = train_probas
    
print('Total: %s' % np.mean(scores))

Class: toxic
	Val ROC-AUC: 0.9735575055198786
Class: severe_toxic
	Val ROC-AUC: 0.9823391797615162
Class: obscene
	Val ROC-AUC: 0.9807298632133372
Class: threat
	Val ROC-AUC: 0.9839737348846651
Class: insult
	Val ROC-AUC: 0.973695169772277
Class: identity_hate
	Val ROC-AUC: 0.9733915322919167
Total: 0.9781052488012031


In [18]:
submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999887,0.157253,0.998148,0.052779,0.880165,0.152029
1,0000247867823ef7,7e-05,0.000168,0.000101,-6e-06,0.000558,0.000376
2,00013b17ad220c46,2e-06,-7e-06,-6e-06,-1e-05,7e-06,-8e-06
3,00017563c3f7919a,0.000682,0.00042,0.002082,0.000128,0.002113,0.000134
4,00017695ad8997eb,0.012161,0.000313,0.000245,6.7e-05,0.000747,1.2e-05


In [19]:
submission.to_csv('data/submission_fasttext.csv', index=False)
train_submission.to_csv('data/train_fasttext.csv', index=False)