In [29]:
import warnings
warnings.filterwarnings('ignore')

import re
import pandas as pd
import numpy as np
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from collections import defaultdict
from wordcloud import WordCloud, STOPWORDS
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [3]:
#labels
toxic_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

#train dataset
train_set = pd.read_csv('train.csv')

#train dataset split
train_corpus = train_set.drop( toxic_columns, axis=1)
train_corpus = train_corpus.drop( ['id'], axis=1)
train_labels = train_set.drop( ['id','comment_text'], axis=1)

In [4]:
train_corpus['comment_text'].fillna(value='none', inplace=True)

In [5]:
train_corpus.head()

Unnamed: 0,comment_text
0,Explanation\nWhy the edits made under my usern...
1,D'aww! He matches this background colour I'm s...
2,"Hey man, I'm really not trying to edit war. It..."
3,"""\nMore\nI can't make any real suggestions on ..."
4,"You, sir, are my hero. Any chance you remember..."


In [6]:
stop_words = set(stopwords.words('english'))
def normalize_text(comment):
    comment = comment.lower() 
    comment = re.sub(r"i'm", "i am ", comment)
    comment = re.sub(r"\'s", " ", comment)
    comment = re.sub(r"\'ve", " have ", comment)
    comment = re.sub(r"can't", "can not ", comment)
    comment = re.sub(r"n't", " not ", comment)
    comment = re.sub(r"\'ll", " will ", comment)
    comment = re.sub(r"\'re", " are ", comment)
    comment = re.sub(r"\'d", " would ", comment)
    comment = re.sub(r"what's", "what is ", comment)
    comment = re.sub(r"\'scuse", " excuse ", comment)
    comment = re.sub("[^a-z]", " ", comment)
    comment = comment.strip(' ')
    for word in stop_words: #removing stopwords
        token = " " + word + " "
        comment = comment.replace(token, " ")
        comment = comment.replace("  ", " ")
    return comment

In [7]:
train_corpus['comment_text'] = train_corpus['comment_text'].map(lambda el : normalize_text(el))

In [8]:
train_corpus['comment_text'][6]

'cocksucker piss around work'

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer( analyzer='word', ngram_range=(1, 1))
corpus = vectorizer.fit_transform(train_corpus['comment_text'])

In [10]:
corpus.toarray()[3]

array([0., 0., 0., ..., 0., 0., 0.])

In [11]:
vectorizer.inverse_transform(corpus.toarray()[3])

[array(['accidents', 'appears', 'article', 'articles', 'backlog', 'date',
        'delay', 'eg', 'else', 'etc', 'exact', 'first', 'form', 'format',
        'formatting', 'good', 'guess', 'ie', 'improvement', 'know',
        'later', 'let', 'listed', 'make', 'may', 'more', 'need',
        'nominations', 'one', 'please', 'preferences', 'real',
        'references', 'relevant', 'review', 'reviewer', 'section',
        'statistics', 'style', 'subsection', 'suggestions', 'think',
        'tidying', 'transport', 'turns', 'types', 'want', 'wikipedia',
        'wondered'], dtype='<U4955')]

In [12]:
train_corpus['comment_text'][3]

'more make real suggestions improvement wondered section statistics later subsection types accidents think references may need tidying exact format ie date format etc later one else first preferences formatting style references want please let know appears backlog articles review guess may delay reviewer turns listed relevant form eg wikipedia good article nominations transport'

In [15]:
#test dataset 
sub_corpus = pd.read_csv('test.csv')
sub_corpus['comment_text'] = sub_corpus['comment_text'].map(lambda el : normalize_text(el))

In [16]:
sub_vect=vectorizer.transform(sub_corpus['comment_text'])

In [22]:
models_bench = { 'NaiveBayes': MultinomialNB(),
                 'LogisticRegression': LogisticRegression(solver='sag'),
                 'RandomForest': RandomForestClassifier(n_estimators=100, max_depth=3)
               }

submission_df_NB = sub_corpus.drop(['comment_text'], axis=1)
submission_df_LR = sub_corpus.drop(['comment_text'], axis=1)
submission_df_RF = sub_corpus.drop(['comment_text'], axis=1)

df_x = train_corpus.comment_text
for category in toxic_columns:
    print('Category : {}'.format(category))
    df_y = train_labels[category]
    x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=0)
    x_train_tf=vectorizer.transform(x_train)
    x_test_tf=vectorizer.transform(x_test)
    for key in models_bench:
        clf=models_bench[key]
        clf.fit(x_train_tf,y_train)
        predictions = clf.predict(x_test_tf)
        print('--- {} model accuracy is {}'.format(key,accuracy_score(y_test, predictions)))
        if key == 'NaiveBayes': 
            submission_df_NB.loc[:,category] = clf.predict_proba(sub_vect)[:,1]
        if key == 'LogisticRegression': 
            submission_df_LR.loc[:,category] = clf.predict_proba(sub_vect)[:,1]
        if key == 'RandomForest': 
            submission_df_RF.loc[:,category] = clf.predict_proba(sub_vect)[:,1]
            
submission_df_NB.to_csv('submission_NB.csv', index=False)
submission_df_LR.to_csv('submission_LR.csv', index=False)
submission_df_RF.to_csv('submission_RF.csv', index=False)

Category : toxic
--- NaiveBayes model accuracy is 0.920037599874667
--- LogisticRegression model accuracy is 0.955068149772834
--- RandomForest model accuracy is 0.902835657214476
Category : severe_toxic
--- NaiveBayes model accuracy is 0.9896287012376626
--- LogisticRegression model accuracy is 0.9901613661287796
--- RandomForest model accuracy is 0.989691367695441
Category : obscene
--- NaiveBayes model accuracy is 0.9523734920883598
--- LogisticRegression model accuracy is 0.9766567444775184
--- RandomForest model accuracy is 0.9467961773460755
Category : threat
--- NaiveBayes model accuracy is 0.997086009713301
--- LogisticRegression model accuracy is 0.9973053423155256
--- RandomForest model accuracy is 0.9971486761710794
Category : insult
--- NaiveBayes model accuracy is 0.9516841610527965
--- LogisticRegression model accuracy is 0.9691054363152123
--- RandomForest model accuracy is 0.9500548331505562
Category : identity_hate
--- NaiveBayes model accuracy is 0.9906000313332289
--

In [23]:
print('Kaggle NB score -> 0.85244')
print('Kaggle LR score -> 0.97424')
print('Kaggle RF score -> 0.82695')

Kaggle NB score -> 0.85244
Kaggle LR score -> 0.97424
Kaggle RF score -> 0.82695


In [30]:
metrics = defaultdict()
models_bench = { 'NaiveBayes': MultinomialNB(),
                 'LogisticRegression': LogisticRegression(solver='sag'),
                 'RandomForest': RandomForestClassifier(n_estimators=100, max_depth=3)
               }

submission_df_NB = sub_corpus.drop(['comment_text'], axis=1)
submission_df_LR = sub_corpus.drop(['comment_text'], axis=1)
submission_df_RF = sub_corpus.drop(['comment_text'], axis=1)

df_x = train_corpus.comment_text

for category in toxic_columns:
    print('Category : {}'.format(category))
    metrics[category] = {}
    df_y = train_labels[category]
    x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=0)
    x_train_tf=vectorizer.transform(x_train)
    x_test_tf=vectorizer.transform(x_test)
    for key in models_bench:
        metrics[category][key] = {}
        clf=models_bench[key]
        clf.fit(x_train_tf,y_train)
        predictions = clf.predict(x_test_tf)
        predictions_proba = clf.predict_proba(x_test_tf)
        metrics[category][key]['accuracy'] = accuracy_score(y_test, predictions)
        metrics[category][key]['roc-auc'] = roc_auc_score(y_test, predictions_proba[:,1])
        print('--- {} model'.format(key))
        print('--- --- accuracy is {}'.format(metrics[category][key]['accuracy']))
        print('--- --- roc-auc  is {}'.format(metrics[category][key]['roc-auc']))
        if key == 'NaiveBayes': 
            submission_df_NB.loc[:,category] = clf.predict_proba(sub_vect)[:,1]
        if key == 'LogisticRegression': 
            submission_df_LR.loc[:,category] = clf.predict_proba(sub_vect)[:,1]
        if key == 'RandomForest': 
            submission_df_RF.loc[:,category] = clf.predict_proba(sub_vect)[:,1]
            
submission_df_NB.to_csv('submission_NB.csv', index=False)
submission_df_LR.to_csv('submission_LR.csv', index=False)
submission_df_RF.to_csv('submission_RF.csv', index=False)

Category : toxic
--- NaiveBayes model
--- --- accuracy is 0.920037599874667
--- --- roc-auc  is 0.8791334650084888
--- LogisticRegression model
--- --- accuracy is 0.955068149772834
--- --- roc-auc  is 0.9709632768584783
--- RandomForest model
--- --- accuracy is 0.902835657214476
--- --- roc-auc  is 0.8782164703831514
Category : severe_toxic
--- NaiveBayes model
--- --- accuracy is 0.9896287012376626
--- --- roc-auc  is 0.8874575458289492
--- LogisticRegression model
--- --- accuracy is 0.9901613661287796
--- --- roc-auc  is 0.9855080845520996
--- RandomForest model
--- --- accuracy is 0.989691367695441
--- --- roc-auc  is 0.8968909988015543
Category : obscene
--- NaiveBayes model
--- --- accuracy is 0.9523734920883598
--- --- roc-auc  is 0.8812767565492993
--- LogisticRegression model
--- --- accuracy is 0.9766567444775184
--- --- roc-auc  is 0.9837537239955682
--- RandomForest model
--- --- accuracy is 0.9467961773460755
--- --- roc-auc  is 0.8711489347586419
Category : threat
--- N

In [31]:
metrics

defaultdict(None,
            {'toxic': {'NaiveBayes': {'accuracy': 0.920037599874667,
               'roc-auc': 0.8791334650084888},
              'LogisticRegression': {'accuracy': 0.955068149772834,
               'roc-auc': 0.9709632768584783},
              'RandomForest': {'accuracy': 0.902835657214476,
               'roc-auc': 0.8782164703831514}},
             'severe_toxic': {'NaiveBayes': {'accuracy': 0.9896287012376626,
               'roc-auc': 0.8874575458289492},
              'LogisticRegression': {'accuracy': 0.9901613661287796,
               'roc-auc': 0.9855080845520996},
              'RandomForest': {'accuracy': 0.989691367695441,
               'roc-auc': 0.8968909988015543}},
             'obscene': {'NaiveBayes': {'accuracy': 0.9523734920883598,
               'roc-auc': 0.8812767565492993},
              'LogisticRegression': {'accuracy': 0.9766567444775184,
               'roc-auc': 0.9837537239955682},
              'RandomForest': {'accuracy': 0.94679617734