In [48]:
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack

In [2]:
# Target label columns, in the order used for scoring and for the submission file.
class_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [3]:
# Directory holding the input CSVs, relative to the notebook's working directory.
data_path = Path('../data')

# Missing cells are replaced with a single space so that every value in the
# frames is non-null before the text column is vectorised.
train = pd.read_csv(data_path / 'normalized_train.csv').fillna(' ')
test = pd.read_csv(data_path / 'normalized_test.csv').fillna(' ')

In [4]:
# Raw comment strings that feed the TF-IDF vectoriser.
text_column = 'comment_text'
train_text, test_text = train[text_column], test[text_column]

In [5]:
# Word n-gram TF-IDF features, following https://arxiv.org/abs/1509.01626
# Unigrams through 5-grams, capped at the 500,000 most frequent terms, with
# sublinear term-frequency scaling, English stop words removed and accents
# stripped.
word_vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 5),
    max_features=500000,
    stop_words='english',
    strip_accents='unicode',
    sublinear_tf=True,
)

# The vocabulary and IDF weights are learned from the training comments only;
# the test comments are projected onto that same vocabulary.
train_word_features = word_vectorizer.fit_transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

In [None]:
# Label matrix with one row per training comment and one column per entry of
# `class_names`, in that order.
# Selecting the columns by name (instead of iterating rows and slicing
# positionally from the third column) keeps the column order aligned with the
# `class_names` indexing used when scoring and when building the submission,
# and avoids a slow Python-level loop over every row.
y_true_labels = train[class_names].values

In [27]:
# Ensure the label matrix is a NumPy array before it is handed to scikit-learn.
y_true_labels = np.asarray(y_true_labels)

In [29]:
# One independent logistic-regression model is fitted per label column
# (one-vs-rest); n_jobs=-1 lets the per-label fits run in parallel.
base_estimator = LogisticRegression(
    C=0.1,
    solver='saga',
    multi_class='multinomial',
    random_state=31,
)
classifier = OneVsRestClassifier(base_estimator, n_jobs=-1)
classifier.fit(train_word_features, y_true_labels)

OneVsRestClassifier(estimator=LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=31, solver='saga',
          tol=0.0001, verbose=0, warm_start=False),
          n_jobs=-1)

In [34]:
# Predicted probabilities for the test comments; the submission cell reads
# one column per label via test_preds[:, i].
test_preds = classifier.predict_proba(test_word_features)

In [35]:
# Predicted probabilities on the same rows the model was fitted on; used
# below for the per-class training ROC AUC (not a held-out estimate).
train_preds = classifier.predict_proba(train_word_features)

In [49]:
# ROC AUC of each label computed on the training data itself, in
# `class_names` order (column `idx` of train_preds belongs to label `label`).
scores = [
    roc_auc_score(train[label], train_preds[:, idx])
    for idx, label in enumerate(class_names)
]

In [50]:
# Display the per-class training ROC AUC values (same order as class_names).
scores

[0.97465981354744,
 0.9880977317613263,
 0.9889599351903587,
 0.9908111494587243,
 0.9813393374180808,
 0.9819986888859459]

In [57]:
# 5-fold cross-validated ROC AUC of the one-vs-rest classifier on the
# precomputed training TF-IDF features; yields one score per fold.
cv_scores = cross_val_score(
    estimator=classifier,
    X=train_word_features,
    y=y_true_labels,
    scoring='roc_auc',
    cv=5,
)

In [58]:
# Display the ROC AUC obtained on each of the cross-validation folds.
cv_scores

array([0.97566521, 0.97483714, 0.9750551 , 0.97462476, 0.97531918])

In [59]:
# Average ROC AUC across the cross-validation folds.
cv_scores.mean()

0.97510027806059

In [61]:
# Start the submission frame from the test ids; label columns are added next.
submission = test[['id']].copy()

In [63]:
# Add one probability column per label: transposing the prediction matrix
# yields its columns in order, which are paired with `class_names`.
for class_name, label_probs in zip(class_names, test_preds.T):
    submission[class_name] = label_probs

In [65]:
# Write the submission into the data directory, reusing `data_path` rather
# than repeating the directory literal so the two cannot drift apart.
# The row index is omitted; only `id` and the label columns are written.
submission.to_csv(data_path / 'word_ngrams_tfidf_submission.csv', index=False)