In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.model_selection import StratifiedKFold,train_test_split
from sklearn.feature_extraction.text  import CountVectorizer,TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import log_loss, confusion_matrix, roc_curve, roc_auc_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm

import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from scipy import sparse
import re

seed = 42

In [2]:
# gensim modules
from gensim import utils
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec

In [3]:
class LabeledLineSentence(object):
    def __init__(self, sources):
        self.sources = sources
        
        flipped = {}
        
        # make sure that keys are unique
        for key, value in sources.items():
            if value not in flipped:
                flipped[value] = [key]
            else:
                raise Exception('Non-unique prefix encountered')
    
    def __iter__(self):
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])
    
    def to_array(self):
        self.sentences = []
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    self.sentences.append(LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no]))
        return self.sentences
    
    def sentences_perm(self):
        shuffled = list(self.sentences)
        random.shuffle(shuffled)
        return shuffled

In [4]:
train = pd.read_csv('train.csv').fillna(' ')
test  = pd.read_csv('test.csv').fillna(' ')

In [5]:
train.shape

(159571, 8)

In [6]:
test.shape

(153164, 2)

In [7]:
train.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [8]:
train_text = train['comment_text']
test_text  = test['comment_text']
all_text = pd.concat([train_text,test_text])

In [9]:
import nltk
#nltk.download()

In [10]:
from nltk.corpus import stopwords
stemmer = SnowballStemmer('english')

In [11]:
def clean(single_comment):
    letters_only = re.sub("[^a-zA-Z]", " ", single_comment) 
    words = letters_only.lower().split() 
    stops = set(stopwords.words("english"))
    meaningful_words = [w for w in words if not w in stops] 
    stemmed_words = [stemmer.stem(w) for w in meaningful_words]
    return( " ".join( stemmed_words ))

In [12]:
clean_train_comments = []
num_train = len(train_text)
for i in range( 0, num_train ):
    # If the index is evenly divisible by 1000, print a message
    if( (i+1)%1000 == 0 ):
        print("Review %d of %d\n" % ( i+1, num_train ))                                                                    
    clean_train_comments.append( clean( train_text[i] ))

Review 1000 of 159571

Review 2000 of 159571

Review 3000 of 159571

Review 4000 of 159571

Review 5000 of 159571

Review 6000 of 159571

Review 7000 of 159571

Review 8000 of 159571

Review 9000 of 159571

Review 10000 of 159571

Review 11000 of 159571

Review 12000 of 159571

Review 13000 of 159571

Review 14000 of 159571

Review 15000 of 159571

Review 16000 of 159571

Review 17000 of 159571

Review 18000 of 159571

Review 19000 of 159571

Review 20000 of 159571

Review 21000 of 159571

Review 22000 of 159571

Review 23000 of 159571

Review 24000 of 159571

Review 25000 of 159571

Review 26000 of 159571

Review 27000 of 159571

Review 28000 of 159571

Review 29000 of 159571

Review 30000 of 159571

Review 31000 of 159571

Review 32000 of 159571

Review 33000 of 159571

Review 34000 of 159571

Review 35000 of 159571

Review 36000 of 159571

Review 37000 of 159571

Review 38000 of 159571

Review 39000 of 159571

Review 40000 of 159571

Review 41000 of 159571

Review 42000 of 159571

R

In [13]:
clean_test_comments = []
num_test = len(test_text)
for i in range( 0, num_test ):
    # If the index is evenly divisible by 1000, print a message
    if( (i+1)%1000 == 0 ):
        print("Review %d of %d\n" % ( i+1, num_test ))                                                                    
    clean_test_comments.append( clean( test_text[i] ))

Review 1000 of 153164

Review 2000 of 153164

Review 3000 of 153164

Review 4000 of 153164

Review 5000 of 153164

Review 6000 of 153164

Review 7000 of 153164

Review 8000 of 153164

Review 9000 of 153164

Review 10000 of 153164

Review 11000 of 153164

Review 12000 of 153164

Review 13000 of 153164

Review 14000 of 153164

Review 15000 of 153164

Review 16000 of 153164

Review 17000 of 153164

Review 18000 of 153164

Review 19000 of 153164

Review 20000 of 153164

Review 21000 of 153164

Review 22000 of 153164

Review 23000 of 153164

Review 24000 of 153164

Review 25000 of 153164

Review 26000 of 153164

Review 27000 of 153164

Review 28000 of 153164

Review 29000 of 153164

Review 30000 of 153164

Review 31000 of 153164

Review 32000 of 153164

Review 33000 of 153164

Review 34000 of 153164

Review 35000 of 153164

Review 36000 of 153164

Review 37000 of 153164

Review 38000 of 153164

Review 39000 of 153164

Review 40000 of 153164

Review 41000 of 153164

Review 42000 of 153164

R

In [14]:
train_text[7]

"Your vandalism to the Matt Shirvington article has been reverted.  Please don't do it again, or you will be banned."

In [15]:
clean_train_comments[7]

u'vandal matt shirvington articl revert pleas ban'

In [16]:
test_text[7]

':Dear god this site is horrible.'

In [17]:
clean_test_comments[:7]

[u'yo bitch ja rule succes ever what hate sad mofucka bitch slap ur pethed white face get kiss ass guy sicken ja rule pride da music man dont diss shit nothin wrong bein like tupac brother fuckin white boy get thing right next time',
 u'rfc titl fine imo',
 u'sourc zaw ashton lapland',
 u'look back sourc inform updat correct form guess sourc updat shall updat inform thank messag',
 u'anonym edit articl',
 u'thank understand think high would revert without discuss',
 u'pleas add nonsens wikipedia edit consid vandal quick undon would like experi pleas use sandbox instead thank']

In [18]:
Y = train[['toxic','severe_toxic','obscene','threat','insult','identity_hate']]
X_train, X_test, Y_train, Y_test= train_test_split(clean_train_comments, Y, test_size = 0.3, random_state =42)

In [19]:
np.array(X_train).shape

(111699,)

In [20]:
np.array(X_test).shape

(47872,)

In [42]:
Y_train.head(15)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
54568,0,0,0,0,0,0
3786,0,0,0,0,0,0
22938,0,0,0,0,0,0
137856,0,0,0,0,0,0
143038,0,0,0,0,0,0
101451,0,0,0,0,0,0
58349,0,0,0,0,0,0
63154,1,1,1,1,1,0
83999,0,0,0,0,0,0
156706,0,0,0,0,0,0


In [22]:
np.savetxt("train_text.csv", X_train, delimiter=",", fmt='%s')

In [23]:
np.savetxt("valid_text.csv", X_test, delimiter=",", fmt='%s')

In [24]:
np.savetxt("test_text.csv", clean_test_comments, delimiter=",", fmt='%s')

In [25]:
sources = {'train_text.csv': 'train_comments', 'valid_text.csv': 'test_comments'}

sentences = LabeledLineSentence(sources)

In [26]:
from gensim.models.deprecated.doc2vec import LabeledSentence
model = Doc2Vec(min_count=1, window=10, vector_size=100, sample=1e-4, negative=5, workers=7)

model.build_vocab(sentences.to_array())



In [27]:
import random
for epoch in range(8):
    print epoch
    model.train(sentences.sentences_perm(), total_examples=model.corpus_count, epochs=8)

0
1
2
3
4
5
6
7


In [44]:
model.most_similar('bad')

  """Entry point for launching an IPython kernel.


[(u'good', 0.7787403464317322),
 (u'make', 0.706427276134491),
 (u'faith', 0.7051371335983276),
 (u'assum', 0.6972181797027588),
 (u'obvious', 0.691033124923706),
 (u'honest', 0.6859908103942871),
 (u'realli', 0.6718940138816833),
 (u'tri', 0.6717443466186523),
 (u'thing', 0.6661306023597717),
 (u'even', 0.6651477813720703)]

In [29]:
model.docvecs['train_comments_2']

array([-0.1116396 , -0.42715713,  0.10256509,  0.09701165, -0.01993297,
       -0.44071347,  0.55885506,  0.8807789 ,  0.03191496, -0.2157298 ,
        0.07807338,  0.0978106 ,  0.05575764,  0.3793846 ,  0.45955893,
       -0.6863066 , -0.5870503 ,  0.382502  ,  0.720011  , -0.10512022,
        0.6894382 ,  0.40016967,  0.3779451 ,  0.15415677, -0.75600237,
       -0.06678248, -0.6987697 ,  0.40393335,  0.8957669 , -0.0610558 ,
        0.19810373,  0.86743224, -0.51474756,  0.27295715, -0.37790024,
        0.29549372, -0.40055388,  0.08288393, -0.28746146,  0.60121924,
       -0.12465143, -0.00802697,  0.36222064, -0.09083948, -0.32818004,
        0.09085173, -0.1549118 ,  0.5189156 , -0.54169405,  0.8759436 ,
        0.5299364 ,  0.26710758,  0.9790635 , -0.00432301, -0.15775971,
        0.10560318,  0.42548305,  0.03232861,  0.1627206 , -1.1066611 ,
       -0.44751596,  0.226549  , -0.7585777 , -0.63186055,  0.13616973,
        0.16330022, -0.24685936, -0.65648806, -0.41068423, -1.01

In [30]:
model.save('./imdb.d2v')

In [31]:
model = Doc2Vec.load('./imdb.d2v')

In [32]:
train_arrays = np.zeros((111699, 100))
train_labels = np.zeros(111699)

for i in range(111699):
    prefix_train = 'train_comments_' + str(i)
    train_arrays[i] = model.docvecs[prefix_train]
    train_labels[i] = Y_train.iloc[i,1]

In [33]:
print train_arrays

[[ 0.36475316 -0.74786586  0.37947309 ...  0.19903496 -0.30466014
  -0.76155365]
 [ 0.15169713 -0.41767457 -0.17864588 ...  0.06461179 -0.37247977
   0.00474855]
 [-0.1116396  -0.42715713  0.10256509 ...  0.20717216  0.15268491
   0.37840417]
 ...
 [-0.31792933 -0.43039265 -0.06008903 ... -0.20367613  0.06838287
   0.3465136 ]
 [-0.47342074 -0.87840748  0.35920382 ... -0.22462685 -0.19586937
   0.16168058]
 [-0.0624507  -0.60673672 -0.03310078 ...  0.55265623 -0.23879986
  -0.15922475]]


In [34]:
print train_labels

[0. 0. 0. ... 0. 0. 0.]


In [35]:
test_arrays = np.zeros((47872, 100))
test_labels = np.zeros(47872)

for i in range(47872):
    prefix_test = 'test_comments_' + str(i)
    test_arrays[i] = model.docvecs[prefix_test]
    test_labels[i] = Y_test.iloc[i,0]

In [36]:
classifier = LogisticRegression()
classifier.fit(train_arrays, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [37]:
classifier.score(test_arrays, test_labels)

0.905038435828877

In [38]:
pd.DataFrame(classifier.predict_proba(test_arrays))

Unnamed: 0,0,1
0,0.990081,0.009919
1,0.992904,0.007096
2,0.989702,0.010298
3,0.995468,0.004532
4,0.988361,0.011639
5,0.997191,0.002809
6,0.995712,0.004288
7,0.994497,0.005503
8,0.983900,0.016100
9,0.992266,0.007734


In [41]:
roc_auc_score(test_labels, classifier.predict_proba(test_arrays)[:,1])

0.7922795104811692