In [15]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from scipy.sparse import csr_matrix, hstack
from scipy.special import logit, expit
from sklearn.model_selection import StratifiedKFold, KFold
import re, gc

In [2]:
# The six binary label columns of the training set.
class_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Read both CSV files the same way: missing values become a single space.
train, test = (
    pd.read_csv(csv_path).fillna(' ')
    for csv_path in ('datasets/train.csv', 'datasets/test.csv')
)

# Raw comment text of each split.
list_sentences_train = train['comment_text']
list_sentences_test = test['comment_text']

In [3]:
# Summary statistics of the numeric columns (the six binary label columns).
train.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [4]:
# Number of rows in the training set.
len(train)

159571

In [5]:
# Print the 0/1 frequency of every label column to show the class imbalance.
for class_name in class_names:
    label_counts = train[class_name].value_counts()
    print("{}\t{}\n".format(class_name, label_counts))

toxic	0    144277
1     15294
Name: toxic, dtype: int64

severe_toxic	0    157976
1      1595
Name: severe_toxic, dtype: int64

obscene	0    151122
1      8449
Name: obscene, dtype: int64

threat	0    159093
1       478
Name: threat, dtype: int64

insult	0    151694
1      7877
Name: insult, dtype: int64

identity_hate	0    158166
1      1405
Name: identity_hate, dtype: int64



In [6]:
def clean_word(text):
    """Normalize a raw comment string before TF-IDF vectorization.

    Steps, in order: lower-case the text, drop http/https URLs, expand common
    English contractions, replace or pad selected punctuation with spaces,
    re-join a few spaced-out abbreviations, collapse whitespace runs and
    finally delete every run of digits.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        The cleaned text. Digits are removed after the whitespace collapse,
        so the result can still contain doubled, leading or trailing spaces.
    """
    text = text.lower()
    # Url cleaner
    text = re.sub(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", "", text)

    # English & punctuation regularization
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    # Typographic apostrophe variant; trailing space kept consistent with the
    # ASCII rule above so the following word is never glued to "am".
    text = re.sub(r"i’m", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    # "5k" -> "5000"
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    # "1990s" -> "1990". The pattern must be a literal "0s": writing it as
    # backslash-zero would be an octal escape that only matches a NUL byte.
    text = re.sub(r"0s", "0", text)
    # Keep the surrounding spaces so the neighbouring words are not fused
    # together once the digits are stripped below.
    text = re.sub(r" 9 11 ", " 911 ", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    # Remove every run of digits.
    text = re.sub(r"\d+", "", text)
    return text

In [7]:
# Clean every comment of both splits.
train_text = [clean_word(text) for text in list_sentences_train]
test_text = [clean_word(text) for text in list_sentences_test]

# Combined corpus used to fit the vectorizers. Plain list concatenation keeps
# the documents as ordinary Python strings; np.concatenate on string lists
# would instead allocate a fixed-width unicode array sized by the longest
# document for every row.
all_text = train_text + test_text
print("MSG : clean text process Done!")

MSG : clean text process Done!


In [8]:
# Drop the names bound to the raw comment columns (the cleaned copies live in
# train_text / test_text) and trigger a garbage-collection pass.
del list_sentences_train, list_sentences_test
gc.collect()

178

In [9]:
# Word-level TF-IDF: unigrams only, vocabulary capped at 20000 terms and
# learned from the combined train + test corpus.
word_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=20000,
    strip_accents='unicode',
    sublinear_tf=True,
).fit(all_text)

# Project each split onto the fitted vocabulary.
train_word_features, test_word_features = (
    word_vectorizer.transform(documents)
    for documents in (train_text, test_text)
)
print("MSG : word vectorization process Done!")

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


MSG : word vectorization process Done!


In [10]:
# Character-level TF-IDF: n-grams of length 1 to 6, vocabulary capped at
# 30000 entries and learned from the combined train + test corpus.
char_vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 6),
    max_features=30000,
    strip_accents='unicode',
    sublinear_tf=True,
).fit(all_text)

# Project each split onto the fitted vocabulary.
train_char_features, test_char_features = (
    char_vectorizer.transform(documents)
    for documents in (train_text, test_text)
)
print("MSG : char vectorization process Done!")

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


MSG : char vectorization process Done!


In [11]:
# release list memory
del train_text, test_text, all_text
gc.collect()

0

In [12]:
# Place the char-level and word-level TF-IDF columns side by side (char
# columns first) and keep the result in CSR form so it can be row-sliced.
train_features = hstack(
    [train_char_features, train_word_features],
    format='csr',
)
test_features = hstack(
    [test_char_features, test_word_features],
    format='csr',
)

In [13]:
# The per-analyzer matrices were merged into train_features / test_features
# in the previous cell, so drop them and trigger a garbage-collection pass.
del train_char_features, train_word_features, test_char_features, test_word_features
gc.collect()

0

In [16]:
# 10-fold cross-validation of one logistic regression per label column.
# NOTE(review): skfold is defined but not used by this cell; it is kept
# because other cells may rely on the name.
skfold = StratifiedKFold(n_splits=10, shuffle=False)
kfold = KFold(n_splits=10, shuffle=False)

# One entry per fold: {class_name: LogisticRegression fitted on that fold}.
models = []
for i, (train_idx, test_idx) in enumerate(kfold.split(train_features)):
    print('{} fold:'.format(i))
    # The row slices do not depend on the label, so build them once per fold
    # instead of once per label.
    fold_train_features = train_features[train_idx]
    fold_val_features = train_features[test_idx]
    fold_models = {}
    for class_name in class_names:
        print('...... Processing {}'.format(class_name))
        # A fresh estimator per label: refitting a single shared instance
        # would leave only the last label's model available after the loop.
        classifier = LogisticRegression(solver='sag', C=12.0)
        # kfold yields positional row numbers, so select labels by position.
        train_target = train[class_name].iloc[train_idx]
        classifier.fit(fold_train_features, train_target)
        y_pred = classifier.predict(fold_train_features)
        print('Training accuracy is {}'.format(accuracy_score(train_target, y_pred)))
        val_target = train[class_name].iloc[test_idx]
        val_pred = classifier.predict(fold_val_features)
        print('Validation accuracy is {}'.format(accuracy_score(val_target, val_pred)))
        fold_models[class_name] = classifier
    models.append(fold_models)
print("MSG : Done for k-fold cross validation!")

0 fold:
...... Processing toxic
Training accuracy is 0.9880999630952629
...... Processing severe_toxic




Training accuracy is 0.9973888157757306
...... Processing obscene
Training accuracy is 0.9951745315535502
...... Processing threat
Training accuracy is 0.9997353999986074
...... Processing insult
Training accuracy is 0.9920202210106327
...... Processing identity_hate
Training accuracy is 0.9984889947288894
1 fold:
...... Processing toxic
Training accuracy is 0.9881557508320916
...... Processing severe_toxic
Training accuracy is 0.9974654281615999
...... Processing obscene
Training accuracy is 0.9952790117955074
...... Processing threat
Training accuracy is 0.9997493280599384
...... Processing insult
Training accuracy is 0.9919088668235687
...... Processing identity_hate
Training accuracy is 0.9986561198768922
2 fold:
...... Processing toxic
Training accuracy is 0.9882741236926762
...... Processing severe_toxic
Training accuracy is 0.9973957970671383
...... Processing obscene
Training accuracy is 0.9952511593577228
...... Processing threat
Training accuracy is 0.9997493280599384
...... 

# Logistic Regression

In [17]:
# Fit one logistic regression per label on the full training set and keep the
# positive-class probability of every test row.
# NOTE(review): this uses the default C, whereas the k-fold cell evaluates
# C=12.0 - confirm which setting is intended for the submission.
losses = []
predictions = {'id': test['id']}
for class_name in class_names:
    train_target = train[class_name]
    classifier = LogisticRegression(solver='sag').fit(train_features, train_target)
    test_proba = classifier.predict_proba(test_features)
    # Column 1 of predict_proba is the probability of label 1.
    predictions[class_name] = test_proba[:, 1]
    print("MSG : Finished predict the proba for {}".format(class_name))

MSG : Finished predict the proba for toxic
MSG : Finished predict the proba for severe_toxic
MSG : Finished predict the proba for obscene
MSG : Finished predict the proba for threat
MSG : Finished predict the proba for insult
MSG : Finished predict the proba for identity_hate


In [18]:
# Assemble the id column and the six probability columns, then write the
# submission file without the row index.
submission = pd.DataFrame(predictions)
submission.to_csv(
    'results/Logistic-Submission.csv',
    index=False,
)

# ExtraTreesClassifier

In [19]:
# For every label: score an ExtraTreesClassifier with 3-fold cross-validation,
# then refit it on the full training set and predict the test probabilities.
losses = []
predictions = {'id': test['id']}
for class_name in class_names:
    train_target = train[class_name]
    classifier = ExtraTreesClassifier(n_estimators=30)

    # Despite the name, cv_loss holds the mean ROC AUC (higher is better).
    fold_scores = cross_val_score(
        classifier,
        train_features,
        train_target,
        cv=3,
        scoring='roc_auc',
    )
    cv_loss = np.mean(fold_scores)
    losses.append(cv_loss)
    print('CV score for class {} is {}'.format(class_name, cv_loss))

    classifier.fit(train_features, train_target)
    test_proba = classifier.predict_proba(test_features)
    predictions[class_name] = test_proba[:, 1]

CV score for class toxic is 0.9570610687741216
CV score for class severe_toxic is 0.9422976332912251
CV score for class obscene is 0.9804917156315561
CV score for class threat is 0.852109145437934
CV score for class insult is 0.9646194357605319
CV score for class identity_hate is 0.9135748399233806
