In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack

In [2]:
# Label columns of the training frame; the training loop further down fits
# one binary classifier per entry and writes one submission column per entry.
class_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [3]:
# Directory holding the input CSVs, relative to the notebook's working directory.
data_path = Path('../data')


def _read_filled_csv(csv_name):
    """Read `csv_name` from `data_path` and replace missing values with a single space."""
    return pd.read_csv(data_path / csv_name).fillna(' ')


train = _read_filled_csv('normalized_train.csv')
test = _read_filled_csv('normalized_test.csv')

In [4]:
# Comment text columns of both frames; these are the only inputs to the
# TF-IDF vectorizers.
train_text, test_text = (frame['comment_text'] for frame in (train, test))

In [5]:
# Word-level TF-IDF: unigrams only, built-in English stop-word list removed,
# vocabulary limited to 10,000 features. The vocabulary and IDF weights are
# learned from the training comments only and then applied to the test comments.
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=10000)
train_word_features = word_vectorizer.fit_transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

# Character-level TF-IDF over 2- to 6-character n-grams, limited to 50,000
# features. No stop_words argument here: scikit-learn applies stop words only
# when analyzer='word', so passing it with analyzer='char' has no effect.
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(2, 6),
    max_features=50000)
train_char_features = char_vectorizer.fit_transform(train_text)
test_char_features = char_vectorizer.transform(test_text)

# The combined train/test feature matrices are assembled in the next cell.

In [6]:
# Concatenate the TF-IDF blocks column-wise: character n-gram columns first,
# word columns second (same order for train and test so columns line up).
# CSR is requested explicitly so the result format does not depend on the
# SciPy version and the matrices are row-sliceable for cross-validation
# without a further format conversion.
train_features = hstack([train_char_features, train_word_features], format='csr')
test_features = hstack([test_char_features, test_word_features], format='csr')

In [7]:
# The 'sag' solver shuffles the data while optimizing; a fixed seed makes the
# CV scores and the submitted probabilities repeatable across runs.
RANDOM_SEED = 42

scores = []
# The submission starts with the test ids; one probability column per label
# is added inside the loop.
submission = pd.DataFrame.from_dict({'id': test['id']})
for class_name in class_names:
    train_target = train[class_name]
    classifier = LogisticRegression(C=0.1, solver='sag', random_state=RANDOM_SEED)

    # Mean ROC AUC over 3 cross-validation folds on the training set.
    cv_score = np.mean(cross_val_score(classifier, train_features, train_target, cv=3, scoring='roc_auc'))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    # Refit on the full training set, then keep the predicted probability of
    # the second class column ([:, 1]) for every test row.
    classifier.fit(train_features, train_target)
    submission[class_name] = classifier.predict_proba(test_features)[:, 1]

# Unweighted average of the per-label CV scores.
print('Total CV score is {}'.format(np.mean(scores)))

CV score for class toxic is 0.9703597080619248
CV score for class severe_toxic is 0.9876229933552323
CV score for class obscene is 0.9851192550435761
CV score for class threat is 0.9836483392224564
CV score for class insult is 0.9781945624763257
CV score for class identity_hate is 0.9747493675200651
Total CV score is 0.9799490376132635


In [8]:
# Preview the first five rows of the per-label predicted probabilities.
submission.head(n=5)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.985955,0.095065,0.966426,0.013071,0.882001,0.092371
1,0000247867823ef7,0.028643,0.005366,0.017578,0.002451,0.015573,0.006656
2,00013b17ad220c46,0.028094,0.005302,0.016329,0.002365,0.013184,0.005456
3,00017563c3f7919a,0.013536,0.003698,0.009811,0.002234,0.010473,0.0032
4,00017695ad8997eb,0.059908,0.004112,0.02097,0.002326,0.020582,0.004693


In [9]:
# Write the logistic-regression predictions into the data directory; the path
# is built from data_path so the directory is configured in a single place.
submission.to_csv(Path(data_path, 'lr_final_submission.csv'), index=False)

In [10]:
# Load a second set of predictions from the data directory. This file is not
# produced anywhere in this notebook (its name suggests an AutoML run - confirm
# its provenance). The path is built from data_path like the other inputs.
automl_submission = pd.read_csv(Path(data_path, 'final_automl_submission.csv'))

In [21]:
# Summary statistics of the 'toxic' column in the loaded AutoML predictions.
automl_submission.loc[:, 'toxic'].describe()

count    153164.000000
mean          0.254687
std           0.215708
min           0.016145
25%           0.124368
50%           0.175262
75%           0.283230
max           1.000000
Name: toxic, dtype: float64

In [12]:
# Start the blended submission from an independent (deep) copy of the
# logistic-regression predictions so `submission` itself is left untouched.
automl_lr_blend = submission.copy(deep=True)

In [22]:
# Summary statistics of the 'toxic' column in the blended frame.
automl_lr_blend.loc[:, 'toxic'].describe()

count    153164.000000
mean          0.167680
std           0.262946
min           0.000722
25%           0.018651
50%           0.043655
75%           0.163487
max           0.999999
Name: toxic, dtype: float64

In [19]:
# Replace two label columns wholesale with the AutoML predictions (no
# averaging); every other column keeps the logistic-regression values.
# NOTE(review): the columns are matched by row index, not by 'id' - this
# assumes both frames list the test rows in the same order; confirm.
automl_lr_blend = automl_lr_blend.assign(
    severe_toxic=automl_submission['severe_toxic'],
    threat=automl_submission['threat'],
)

In [23]:
# Write the blended predictions. The target directory is created first if it
# is missing, so the write cannot fail on an absent folder after training.
blend_submission_file = Path('../submission', 'automl_lr_blend_submission.csv')
blend_submission_file.parent.mkdir(parents=True, exist_ok=True)
automl_lr_blend.to_csv(blend_submission_file, index=False)

In [24]:
# Preview the first five rows of the training frame (ids, comment text, labels).
train.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
