In [1]:
import re
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy as sp
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# !unzip train.csv.zip

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
data = pd.read_csv('train.csv')

In [3]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)      ### conversion of contraction words to expanded words
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub('\W', ' ', text)                                                 ### removing non-word characters

    text = re.sub(r'fck', 'fuck', text)
    text = re.sub(r'a$$', 'ass', text)
    text = re.sub(r'@', 'at', text)
    text = re.sub(r'wikipedia:[^\s]+',' ', text)
    text = re.sub(r"https?://[A-Za-z0-9./]+", ' ', text)
    text = re.sub('(utc)', ' ' ,text)
    text = re.sub(' u ', ' you ' ,text)


    text = re.sub('[^A-Za-z\' ]+', '',text)                                        ### removing all non-letter values(Except single quotes)
    text = re.sub('\s+', ' ', text)

    text = text.strip(' ')
    text = ' '.join([word for word in text.split() if word not in (stop_words)])    ### Stopwords removal
    return text

data["comment_text"] = data["comment_text"].apply(clean_text)
train_df, test_df = train_test_split( data, train_size=.85)

In [4]:
train_data = train_df["comment_text"]
test_data = test_df["comment_text"]
train_label=train_df[['toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']]
test_label=test_df[['toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']]

In [5]:
all_data = pd.concat([train_data, test_data])

word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    analyzer='word',
    ngram_range=(1, 1),
    norm='l2',
    max_features=15000)
word_vectorizer.fit(all_data)
train_word_features = word_vectorizer.transform(train_data)
test_word_features = word_vectorizer.transform(test_data)

char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    analyzer='char',
    ngram_range=(1, 5),
    norm='l2',
    max_features=50000)
char_vectorizer.fit(all_data)
train_char_features = char_vectorizer.transform(train_data)
test_char_features = char_vectorizer.transform(test_data)

train_full_feature = sp.sparse.hstack([train_word_features,train_char_features])
test_full_feature = sp.sparse.hstack([test_word_features,test_char_features])

In [None]:
target_list = test_label.columns
acc_list = np.zeros((2,6))
for i in range(len(target_list)):
  target = target_list[i]
  print(target)
  model=XGBClassifier(learning_rate=0.01,max_depth= 5, subsample= 0.8,n_estimators=50)
  model.fit(train_full_feature,train_label[target])
  predict_train = model.predict(train_full_feature)
  acc_temp = accuracy_score(predict_train,train_label[target])
  print("     Train accuracy = ",acc_temp)
  acc_list[0,i] = acc_temp
  predict_test = model.predict(test_full_feature)
  acc_temp = accuracy_score(predict_test,test_label[target])
  print("     Test accuracy = ",acc_temp)
  acc_list[1,i] = acc_temp

print(acc_list)

toxic
     Train accuracy =  0.9405155072036899
     Test accuracy =  0.9239894963893839
severe_toxic
     Train accuracy =  0.9913580788489137
     Test accuracy =  0.9940291975366532
obscene
     Train accuracy =  0.9772953732194446
     Test accuracy =  0.9568601706836725
threat
     Train accuracy =  0.9977439509685344
     Test accuracy =  0.9965925786989278
insult
     Train accuracy =  0.9693804012007194
     Test accuracy =  0.954812591828441
identity_hate
     Train accuracy =  0.9930814496368388
     Test accuracy =  0.9914970771202601
[[0.94051551 0.99135808 0.97729537 0.99774395 0.9693804  0.99308145]
 [0.9239895  0.9940292  0.95686017 0.99659258 0.95481259 0.99149708]]


In [None]:
print("XGBoost performance \n","performance on each label :\n",acc_list,"\n train mean accuracy :",np.mean(acc_list,axis=1)[0],"\n test mean accuracy :"
,np.mean(acc_list,axis=1)[1])

XGBoost performance 
 performance on each label :
 [[0.94051551 0.99135808 0.97729537 0.99774395 0.9693804  0.99308145]
 [0.9239895  0.9940292  0.95686017 0.99659258 0.95481259 0.99149708]] 
 train mean accuracy : 0.9782291266666667 
 test mean accuracy : 0.9696301866666666


In [None]:
target_list = test_label.columns
acc_list = np.zeros((2,6))
for i in range(len(target_list)):
  target = target_list[i]
  print(target)
  model= LogisticRegression(solver='liblinear',dual=True) 
  model.fit(train_full_feature,train_label[target])
  predict_train = model.predict(train_full_feature)
  acc_temp = accuracy_score(predict_train,train_label[target])
  print("     Train accuracy = ",acc_temp)
  acc_list[0,i] = acc_temp
  predict_test = model.predict(test_full_feature)
  acc_temp = accuracy_score(predict_test,test_label[target])
  print("     Test accuracy = ",acc_temp)
  acc_list[1,i] = acc_temp

print(acc_list)

toxic
     Train accuracy =  0.9691451321561544
     Test accuracy =  0.9612717245989305
severe_toxic
     Train accuracy =  0.9919268625354812
     Test accuracy =  0.9903492647058824
obscene
     Train accuracy =  0.9841043978324179
     Test accuracy =  0.9799465240641712
threat
     Train accuracy =  0.9975080178420024
     Test accuracy =  0.9974515374331551
insult
     Train accuracy =  0.977653260589081
     Test accuracy =  0.9728442513368984
identity_hate
     Train accuracy =  0.993615217311166
     Test accuracy =  0.9922292780748663
[[0.96914513 0.99192686 0.9841044  0.99750802 0.97765326 0.99361522]
 [0.96127172 0.99034926 0.97994652 0.99745154 0.97284425 0.99222928]]


In [None]:
print("Logistic regression performance \n","performance on each label :\n",acc_list,"\n train mean accuracy :",np.mean(acc_list,axis=1)[0],"\n test mean accuracy :"
,np.mean(acc_list,axis=1)[1])

Logistic regression performance 
 performance on each label :
 [[0.96914513 0.99192686 0.9841044  0.99750802 0.97765326 0.99361522]
 [0.96127172 0.99034926 0.97994652 0.99745154 0.97284425 0.99222928]] 
 train mean accuracy : 0.985658815 
 test mean accuracy : 0.9823487616666666
