### Toxic Comment Classification Challenge

https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge

In [1]:
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import numpy as np 
import pandas as pd
import string
import re
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer 
from wordcloud import WordCloud
from matplotlib import pyplot as plt
from textblob import TextBlob
from nltk.stem.wordnet import WordNetLemmatizer 
import gensim

In [3]:
df = pd.read_csv('train-2.csv')

### Data Overview

In [4]:
df.head(5)
df.describe()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


### Add Variable

In [5]:
rowsums=df.iloc[:,2:].sum(axis=1)
df['clean']=(rowsums==0) *1

In [6]:
#Create indirect features to help compensate for the loss of information when cleaning the dataset:

#Sentense count in each comment:
    #  '\n' can be used to count the number of sentences in each comment
df['count_sent']=df["comment_text"].apply(lambda x: len(re.findall("\n",str(x)))+1)
#Word count in each comment:
df['count_word']=df["comment_text"].apply(lambda x: len(str(x).split()))
#Unique word count
df['count_unique_word']=df["comment_text"].apply(lambda x: len(set(str(x).split())))
#Letter count
df['count_letters']=df["comment_text"].apply(lambda x: len(str(x)))
#upper case words count
df["count_words_upper"] = df["comment_text"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
#Average length of the words
df["mean_word_len"] = df["comment_text"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

In [7]:
#Add 3 "emotional" puncs
#count of !
df['count_!']=df["comment_text"].apply(lambda x: len([w for w in x if w == "!"]))
#count of ?
df['count_?']=df["comment_text"].apply(lambda x: len([w for w in x if w == "?"]))
#count of ^
df['count_^']=df["comment_text"].apply(lambda x: len([w for w in x if w == "^"]))

In [8]:
#derived features
#Word count percent in each comment:
df['word_unique_percent']=df['count_unique_word']*100/df['count_word']
#derived features
# Cap word percent in each comment
df['cap_percent']=df["count_words_upper"]*100/df['count_word']
#remove cap count after calculate the percengtage
df.drop('count_words_upper', axis=1, inplace=True)

In [9]:
df['sentiment'] = df['comment_text'].apply(lambda x : TextBlob(x).sentiment)

UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2 in position 56: ordinal not in range(128)

In [None]:
df['polarity'] = df['sentiment'].apply(lambda x : x[0])
df['subjective'] = df['sentiment'].apply(lambda x : x[1])

In [10]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean,count_sent,count_word,count_unique_word,count_letters,mean_word_len,count_!,count_?,count_^,word_unique_percent,cap_percent
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,1,2,43,41,264,5.162791,0,1,0,95.348837,4.651163
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,1,1,17,17,112,5.588235,1,0,0,100.0,5.882353
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,1,1,42,39,233,4.571429,0,0,0,92.857143,0.0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,1,5,113,82,622,4.486726,0,0,0,72.566372,4.424779
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,1,1,13,13,67,4.230769,0,1,0,100.0,0.0


### Data Cleaning<br>


In [6]:
corpus = df.comment_text

In [7]:
corpus.head()

0    Explanation\nWhy the edits made under my usern...
1    D'aww! He matches this background colour I'm s...
2    Hey man, I'm really not trying to edit war. It...
3    "\nMore\nI can't make any real suggestions on ...
4    You, sir, are my hero. Any chance you remember...
Name: comment_text, dtype: object

In [8]:
tokenizer=TweetTokenizer()
lem = WordNetLemmatizer()
eng_stopwords = set(stopwords.words("english"))
pun = set(string.punctuation)

In [9]:
def clean(comment):
    """
    This function receives comments and returns clean word-list
    """
    # Convert to lower case , so that Hi and hi are the same
    comment=comment.lower()
    # remove \n
    comment=re.sub("\\n","",comment)
    # remove leaky elements like ip,user
    comment=re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}","",comment)
    # remove usernames
    comment=re.sub("\[\[.*\]","",comment)
 
    
    # Split the sentences into words
    words=tokenizer.tokenize(comment)
    
    # (')aphostophe  replacement (ie)   you're --> you are  
    # ( basic dictionary lookup : master dictionary present in a hidden block of code)
    # remove stopwords and punctuation
    words = [lem.lemmatize(word, "v") for word in words]
    words = [w for w in words if not w in eng_stopwords]
    words = [w for w in words if not w in pun]
    
    clean_sent=" ".join(words)
    
    return(clean_sent)

In [10]:
clean_corpus = corpus.apply(lambda x :clean(x))

In [11]:
df['comment'] = clean_corpus

In [12]:
df.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean,comment
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,1,explanationwhy edit make username hardcore met...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,1,d'aww match background colour i'm seemingly st...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,1,hey man i'm really try edit war guy constantly...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,1,morei can't make real suggestions improvement ...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,1,sir hero chance remember page that's


In [18]:
pd.set_option('display.max_colwidth', -1)
df.comment[0:2]

0    explanationwhy edit make username hardcore metallica fan revert vandalisms closure gas vote new york dolls fac please remove template talk page since i'm retire
1    d'aww match background colour i'm seemingly stick thank talk 21:51 january 11 2016 utc                                                                          
Name: comment, dtype: object

### Crerate subset for text analysis

In [13]:
df_word = pd.concat([df.iloc[:,2:9],df['comment']], axis = 1)

In [14]:
df_word.head(5)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean,comment
0,0,0,0,0,0,0,1,explanationwhy edit make username hardcore met...
1,0,0,0,0,0,0,1,d'aww match background colour i'm seemingly st...
2,0,0,0,0,0,0,1,hey man i'm really try edit war guy constantly...
3,0,0,0,0,0,0,1,morei can't make real suggestions improvement ...
4,0,0,0,0,0,0,1,sir hero chance remember page that's


In [15]:
msk = np.random.rand(len(df_word)) < 0.7
train = df_word[msk]
hold = df_word[~msk]

In [16]:
train.shape
hold.shape

(111800, 8)

(47771, 8)

### DTM and ML

In [27]:
# Extracting features from text files
#from sklearn.feature_extraction.text import CountVectorizer
#count_vect = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000) 
#X_train_counts = count_vect.fit_transform(df.comment)

In [67]:
# Shortcut by pipeline
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

text_transformation = Pipeline([('vect', CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000)),
                      ('tfidf', TfidfTransformer())])

train_transformed = text_transformation.fit_transform(train['comment'])
hold_transformed = text_transformation.fit_transform(hold['comment'])

In [68]:
train_transformed_features = train_transformed.toarray()

In [70]:
hold_transformed_features = hold_transformed.toarray()

In [71]:
# features
train_transformed_features.shape
hold_transformed_features.shape

(111945, 5000)

(47626, 5000)

In [24]:
# target
train_target = train.iloc[:,0:6]
hold_target = hold.iloc[:,0:6]
train_target.shape

(111800, 6)

#### NB

In [32]:
from sklearn.metrics import accuracy_score

In [74]:
# using Label Powerset
from skmultilearn.problem_transform import LabelPowerset
from sklearn.naive_bayes import GaussianNB

# initialize Label Powerset multi-label classifier
# with a gaussian naive bayes base classifier
classifier = LabelPowerset(GaussianNB())

# train
classifier.fit(train_transformed_features, train_target)

LabelPowerset(classifier=GaussianNB(priors=None), require_dense=[True, True])

In [75]:
# predict
predictions = classifier.predict(hold_transformed_features)

In [76]:
accuracy_score(hold_target,predictions)

0.74331247637844877

#### DT

In [20]:
# Shortcut by pipeline revise
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

text_transformation = Pipeline([('vect', CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 500)),
                      ('tfidf', TfidfTransformer())])

train_transformed_revised = text_transformation.fit_transform(train['comment'])
hold_transformed_revised = text_transformation.fit_transform(hold['comment'])

In [21]:
# to array
train_transformed_rev = train_transformed_revised.toarray()
hold_transformed_rev =hold_transformed_revised.toarray()

In [80]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

In [81]:
dt = DecisionTreeClassifier(random_state=0)

In [82]:
dt.fit(train_transformed_rev, train_target)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [83]:
predictions = dt.predict(hold_transformed_rev)

In [84]:
accuracy_score(hold_target,predictions)

0.81302229874438336

#### KNN

In [85]:
from sklearn.neighbors import KNeighborsClassifier

In [87]:
knn = KNeighborsClassifier(n_neighbors=3)

In [None]:
knn.fit(train_transformed_rev, train_target)

In [None]:
pred = knn.predict(hold_transformed_rev)

In [None]:
accuracy_score(hold_target,predictions)

#### RF

In [28]:
print "Training the random forest..."
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators = 100) 

Training the random forest...


In [29]:
forest = forest.fit(train_transformed_rev, train_target )

In [30]:
predicitons = forest.predict(hold_transformed_rev)

In [34]:
accuracy_score(hold_target,predicitons)

0.83094345942098757

## Word2vec model

In [56]:
all_sentences = filter(None, df['comment'])
word2vec_model = gensim.models.Word2Vec(all_sentences, 
                                        size=100, 
                                        window=5, 
                                        min_count=5, 
                                        workers=4)

In [17]:
word2vec_model.init_sims(replace=True)
model_name = 'word2vec_model'
word2vec_model.save(model_name)

In [18]:
def sentence_to_avg(words, embedding):
    vector_size = word2vec_model.vector_size
    mapping = word2vec_model
    avg = np.zeros((vector_size,))
    count = 0
    for w in words:
        try:
            avg += mapping[w]
            count += 1
        except:
            pass
    if count > 0: avg = avg / count
    return avg

In [19]:
df_new = df[["comment", "toxic","severe_toxic","obscene","threat","insult","identity_hate"]].copy()

In [20]:
df_new["comment"] = df_new["comment"].apply([lambda x : sentence_to_avg(x, "word2vec")])

  


In [21]:
df['vector'] = df_new['comment']

In [38]:
df.to_csv('datadata.csv',sep=',')

### Using variables only
### Split dataset

In [None]:
msk = np.random.rand(len(df)) < 0.7
train = df[msk]
test = df[~msk]

In [None]:
x_train = pd.concat([train.ix[:,9:19],train.ix[:,20:22]], axis=1)
x_test = pd.concat([test.ix[:,9:19],test.ix[:,20:22]], axis=1)
y_train = train.iloc[:,2:8]
y_test= test.iloc[:,2:8]

In [None]:
x_train.head()
y_train.head()

### Naive bayes  (multi-lable)

In [24]:
from sklearn.naive_bayes import GaussianNB
from skmultilearn.problem_transform import LabelPowerset
from sklearn.metrics import accuracy_score

In [25]:
classifier = LabelPowerset(GaussianNB())

In [77]:
len(y_train)

111532

In [62]:
classifier.fit(x_train, y_train)

LabelPowerset(classifier=GaussianNB(priors=None), require_dense=[True, True])

In [70]:
predictions = classifier.predict(x_test)

In [73]:
accuracy_score(y_test,predictions)

0.17897957909198775

### Decision Tree

In [92]:
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

In [136]:
dt = DecisionTreeClassifier(random_state=0)

In [137]:
dt.fit(x_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [138]:
predictions = dt.predict(x_test)

In [139]:
cross_val_score(dt,y_test,predictions,cv = 10)

array([0.89009159, 0.87905912, 0.88530391, 0.87364696, 0.89279767,
       0.89092423, 0.87989176, 0.88634471, 0.89300583, 0.88861128])

In [140]:
accuracy_score(y_test,predictions)

0.8299506650846188

### KNN

In [109]:
from sklearn.neighbors import KNeighborsClassifier

In [110]:
knn = KNeighborsClassifier(n_neighbors=3)

In [112]:
knn.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [114]:
pred = knn.predict(x_test)

In [115]:
accuracy_score(y_test, pred)

0.8760174025271134

### MLPClassifier Netural network

In [121]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report,confusion_matrix

In [141]:
mlp = MLPClassifier(hidden_layer_sizes=(30,30,30))

In [142]:
mlp.fit(x_train,y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(30, 30, 30), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [143]:
predictions = mlp.predict(x_test)

In [125]:
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

          0       0.66      0.21      0.32      4716
          1       0.53      0.02      0.03       498
          2       0.58      0.08      0.14      2614
          3       0.00      0.00      0.00       151
          4       0.53      0.05      0.09      2436
          5       0.00      0.00      0.00       459

avg / total       0.57      0.12      0.19     10874



  'precision', 'predicted', average, warn_for)


In [144]:
accuracy_score(y_test,predictions)

0.8856137721434668

### Random Forest

In [126]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [127]:
rd = RandomForestClassifier(n_jobs=2, random_state=0)

In [128]:
rd.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [129]:
predictions = rd.predict(x_test)

In [134]:
accuracy_score(y_test,predictions)

0.8881533753825017