In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.model_selection import StratifiedKFold,train_test_split
from sklearn.feature_extraction.text  import CountVectorizer,TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import log_loss, confusion_matrix, roc_curve, roc_auc_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm

import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from scipy import sparse
import re

seed = 42

In [3]:
# gensim modules
from gensim import utils
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec

In [4]:
class LabeledLineSentence(object):
    """Turn line-oriented text files into tagged documents for Doc2Vec.

    Every line of every source file becomes one ``TaggedDocument`` whose
    words are the whitespace-separated tokens of the line and whose single
    tag is ``'<prefix>_<line number>'``.

    Parameters
    ----------
    sources : dict
        Maps a file path to the tag prefix used for the lines of that file.

    Raises
    ------
    Exception
        If two sources share the same prefix.
    """

    def __init__(self, sources):
        self.sources = sources

        # Prefixes must be unique, otherwise lines coming from different
        # files would end up with identical tags.
        seen_prefixes = set()
        for prefix in sources.values():
            if prefix in seen_prefixes:
                raise Exception('Non-unique prefix encountered')
            seen_prefixes.add(prefix)

    def __iter__(self):
        """Yield one ``TaggedDocument`` per line, file by file."""
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    # TaggedDocument is the class this notebook imports from
                    # gensim; LabeledSentence was never imported.
                    yield TaggedDocument(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])

    def to_array(self):
        """Read every source, store the documents on ``self.sentences`` and return them."""
        self.sentences = list(self)
        return self.sentences

    def sentences_perm(self):
        """Return a shuffled copy of ``self.sentences`` (call ``to_array`` first).

        The stored list itself is left in its original order.
        """
        # Imported locally because the notebook's import cell does not
        # import the random module.
        import random

        shuffled = list(self.sentences)
        random.shuffle(shuffled)
        return shuffled

In [5]:
# Read both competition CSV files and replace every missing value with a
# single space.
train, test = [pd.read_csv(csv_name).fillna(' ') for csv_name in ('train.csv', 'test.csv')]

In [6]:
train.shape

(159571, 8)

In [7]:
test.shape

(153164, 2)

In [8]:
train.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [9]:
# Pull the raw comment column out of each frame, then stack the training
# comments on top of the test comments.
train_text, test_text = train['comment_text'], test['comment_text']
all_text = pd.concat([train_text, test_text])

In [10]:
import nltk
#nltk.download()

In [11]:

def review_to_words(raw_review, stops=None):
    """Normalize one raw comment into a space-separated string of kept words.

    Parameters
    ----------
    raw_review : str
        The raw comment text.
    stops : collection of str, optional
        Words to drop after lower-casing. Defaults to NLTK's English stop
        words, which are loaded once and reused across calls.

    Returns
    -------
    str
        The lower-cased, letters-only words that are not stop words, joined
        by single spaces (an empty string if nothing is left).
    """
    if stops is None:
        stops = _english_stopwords()

    # 1. Replace every character that is not an ASCII letter with a space.
    letters_only = re.sub("[^a-zA-Z]", " ", raw_review)

    # 2. Lower-case and split into individual words.
    words = letters_only.lower().split()

    # 3. Drop the stop words and join what remains with single spaces.
    return " ".join(w for w in words if w not in stops)


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, building it only once."""
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        # Set membership tests are much faster than list lookups, and this
        # function is called once per comment, so build the set a single time.
        cached = frozenset(stopwords.words("english"))
        _english_stopwords._cache = cached
    return cached

In [12]:
# Clean every training comment in order, announcing progress each time
# another 1000 comments have been reached.
num_train = len(train_text)
clean_train_comments = []
for i in range(num_train):
    done = i + 1
    if done % 1000 == 0:
        print("Review %d of %d\n" % (done, num_train))
    # train_text[i] is a label lookup on the Series.
    cleaned = review_to_words(train_text[i])
    clean_train_comments.append(cleaned)

Review 1000 of 159571

Review 2000 of 159571

Review 3000 of 159571

Review 4000 of 159571

Review 5000 of 159571

Review 6000 of 159571

Review 7000 of 159571

Review 8000 of 159571

Review 9000 of 159571

Review 10000 of 159571

Review 11000 of 159571

Review 12000 of 159571

Review 13000 of 159571

Review 14000 of 159571

Review 15000 of 159571

Review 16000 of 159571

Review 17000 of 159571

Review 18000 of 159571

Review 19000 of 159571

Review 20000 of 159571

Review 21000 of 159571

Review 22000 of 159571

Review 23000 of 159571

Review 24000 of 159571

Review 25000 of 159571

Review 26000 of 159571

Review 27000 of 159571

Review 28000 of 159571

Review 29000 of 159571

Review 30000 of 159571

Review 31000 of 159571

Review 32000 of 159571

Review 33000 of 159571

Review 34000 of 159571

Review 35000 of 159571

Review 36000 of 159571

Review 37000 of 159571

Review 38000 of 159571

Review 39000 of 159571

Review 40000 of 159571

Review 41000 of 159571

Review 42000 of 159571

R

In [13]:
# Clean every test comment in order, announcing progress each time another
# 1000 comments have been reached.
num_test = len(test_text)
clean_test_comments = []
for i in range(num_test):
    done = i + 1
    if done % 1000 == 0:
        print("Review %d of %d\n" % (done, num_test))
    # test_text[i] is a label lookup on the Series.
    cleaned = review_to_words(test_text[i])
    clean_test_comments.append(cleaned)

Review 1000 of 153164

Review 2000 of 153164

Review 3000 of 153164

Review 4000 of 153164

Review 5000 of 153164

Review 6000 of 153164

Review 7000 of 153164

Review 8000 of 153164

Review 9000 of 153164

Review 10000 of 153164

Review 11000 of 153164

Review 12000 of 153164

Review 13000 of 153164

Review 14000 of 153164

Review 15000 of 153164

Review 16000 of 153164

Review 17000 of 153164

Review 18000 of 153164

Review 19000 of 153164

Review 20000 of 153164

Review 21000 of 153164

Review 22000 of 153164

Review 23000 of 153164

Review 24000 of 153164

Review 25000 of 153164

Review 26000 of 153164

Review 27000 of 153164

Review 28000 of 153164

Review 29000 of 153164

Review 30000 of 153164

Review 31000 of 153164

Review 32000 of 153164

Review 33000 of 153164

Review 34000 of 153164

Review 35000 of 153164

Review 36000 of 153164

Review 37000 of 153164

Review 38000 of 153164

Review 39000 of 153164

Review 40000 of 153164

Review 41000 of 153164

Review 42000 of 153164

R

In [13]:
train_text[:2]

0    Explanation\nWhy the edits made under my usern...
1    D'aww! He matches this background colour I'm s...
Name: comment_text, dtype: object

In [14]:
clean_train_comments[:2]

['explanation edits made username hardcore metallica fan reverted vandalisms closure gas voted new york dolls fac please remove template talk page since retired',
 'aww matches background colour seemingly stuck thanks talk january utc']

In [15]:
test_text[7]

':Dear god this site is horrible.'

In [16]:
clean_test_comments[:7]

['yo bitch ja rule succesful ever whats hating sad mofuckas bitch slap ur pethedic white faces get kiss ass guys sicken ja rule pride da music man dont diss shit nothin wrong bein like tupac brother fuckin white boys get things right next time',
 'rfc title fine imo',
 'sources zawe ashton lapland',
 'look back source information updated correct form guess source updated shall update information thank message',
 'anonymously edit articles',
 'thank understanding think highly would revert without discussion',
 'please add nonsense wikipedia edits considered vandalism quickly undone would like experiment please use sandbox instead thank']

In [17]:
# print() call form, matching the progress messages in the cleaning cells;
# the output is the same text as the old print statement produced.
print("Creating the bag of words...\n")
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool. The comments were already cleaned above, so no extra
# tokenizer, preprocessor or stop-word list is configured here;
# max_features caps the vocabulary at 5000 terms.
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=5000)
# Note that CountVectorizer comes with its own options to automatically do
# preprocessing, tokenization, and stop word removal -- for each of these,
# instead of specifying "None", we could have used a built-in method or
# specified our own function to use.

# fit_transform() does two things: first it learns the vocabulary from the
# training comments, then it turns those comments into count vectors.
# The input is a list of strings.
train_data_features = vectorizer.fit_transform(clean_train_comments)

# Convert the sparse result to a dense NumPy array because later cells print
# it and pass it to np.savetxt.
# NOTE(review): a dense (159571, 5000) integer array is several gigabytes;
# keep the sparse matrix if those later cells can be adapted.
train_data_features = train_data_features.toarray()

Creating the bag of words...



In [18]:
# print() call form so the cell also runs on Python 3; the output is unchanged.
print(train_data_features.shape)

(159571, 5000)


In [65]:
# Label frame: one column per toxicity label.
Y = train[['toxic','severe_toxic','obscene','threat','insult','identity_hate']]

# Hold out 30% of the training rows for validation. The split is seeded with
# the notebook-level `seed` instead of a second hard-coded 42.
X_train, X_test, Y_train, Y_test = train_test_split(
    train_data_features, Y, test_size=0.3, random_state=seed)

In [67]:
X_train.shape

(111699, 5000)

In [68]:
X_test.shape

(47872, 5000)

In [69]:
# print() call form so the cell also runs on Python 3; NumPy abbreviates the
# large array in the printed output.
print(train_data_features)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [70]:
Y_train.head(15)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
54568,0,0,0,0,0,0
3786,0,0,0,0,0,0
22938,0,0,0,0,0,0
137856,0,0,0,0,0,0
143038,0,0,0,0,0,0
101451,0,0,0,0,0,0
58349,0,0,0,0,0,0
63154,1,1,1,1,1,0
83999,0,0,0,0,0,0
156706,0,0,0,0,0,0


In [75]:
# Write the training split of the count features to a comma-separated text file.
# NOTE(review): X_train is the dense (111699, 5000) array shown above, so this
# formats roughly 558 million values as text, and the recorded run was
# interrupted. A binary or sparse format may be more practical -- confirm
# what reads this file before changing it.
np.savetxt("train_text_nostem.csv", X_train, delimiter=",", fmt='%s')

KeyboardInterrupt: 

In [None]:
# Write the held-out validation split of the count features to a
# comma-separated text file.
# NOTE(review): X_test is the dense (47872, 5000) array shown above, so this
# is also a very large text dump -- confirm what reads this file.
np.savetxt("valid_text_nostem.csv", X_test, delimiter=",", fmt='%s')

In [None]:
# Write the cleaned test comments, one string per line.
# NOTE(review): unlike the train/valid files this saves cleaned text, not
# count features; the test comments are never passed through `vectorizer`
# in the visible cells -- confirm this is intended.
np.savetxt("test_text_nostem.csv", clean_test_comments, delimiter=",", fmt='%s')

In [27]:
# Load Google's pre-trained Word2Vec model.
import gensim

# Location of the pre-trained GoogleNews word2vec vectors (binary format).
word2vec_path = '~/Downloads/GoogleNews-vectors-negative300.bin'
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [28]:
model.most_similar('damn')

[(u'darn', 0.7856366038322449),
 (u'goddamn', 0.7490436434745789),
 (u'freakin', 0.7200237512588501),
 (u'fucking', 0.7125218510627747),
 (u'dang', 0.711518406867981),
 (u'darned', 0.7037780284881592),
 (u'damned', 0.7014271020889282),
 (u'friggin', 0.699498176574707),
 (u'shit', 0.6993836760520935),
 (u'dammit', 0.6927375197410583)]

In [29]:
model['bitch']

array([-0.0078125 , -0.07080078,  0.03320312, -0.00714111,  0.04370117,
        0.27929688, -0.08837891, -0.05395508,  0.0625    ,  0.08398438,
       -0.12011719, -0.15722656, -0.03515625, -0.01342773,  0.07324219,
        0.21875   , -0.0300293 , -0.07617188, -0.17871094, -0.35351562,
       -0.03686523,  0.06787109,  0.46875   , -0.06884766,  0.10888672,
        0.20996094, -0.22265625, -0.04663086,  0.11279297, -0.09375   ,
        0.06079102,  0.05761719, -0.05151367,  0.08789062, -0.00238037,
        0.02258301,  0.34960938,  0.0050354 ,  0.01599121,  0.04833984,
        0.11132812, -0.16308594,  0.34179688,  0.02746582, -0.1875    ,
       -0.15625   , -0.0045166 , -0.00601196, -0.24902344,  0.26171875,
       -0.328125  , -0.01556396, -0.20117188,  0.24511719, -0.13964844,
        0.0859375 , -0.17871094, -0.20507812,  0.1484375 , -0.00830078,
        0.06347656,  0.21484375, -0.06396484, -0.03173828, -0.06933594,
       -0.03063965,  0.08398438, -0.03149414, -0.234375  ,  0.17

In [1]:
## Training the model
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with three hidden layers of 30 units each, fitted on
# all six label columns at once. It is seeded with the notebook-level `seed`
# so repeated runs start from the same initial weights.
# This cell needs X_train / Y_train from the train/validation split cell; the
# recorded NameError came from running it before that cell.
mlp = MLPClassifier(hidden_layer_sizes=(30,30,30), random_state=seed)
mlp.fit(X_train,Y_train)

NameError: name 'X_train' is not defined

In [None]:
# Submission frame: the test ids plus one predicted-probability column per label.
submission = pd.DataFrame(test['id'])

# Predictions must be made on the real test comments. X_test is the
# validation split taken from the training data and has a different number
# of rows than `test`, so vectorize the cleaned test comments with the
# vocabulary already learned from the training comments.
test_data_features = vectorizer.transform(clean_test_comments)

# One independent binary classifier per label column.
target_col = list(Y_train.columns)
for target in target_col:
    Y_col = Y_train[target]
    lr = LogisticRegression(random_state=seed)
    # Fit on this label only, not on the whole multi-label frame.
    lr.fit(X_train, Y_col)
    # Column 1 of predict_proba is the probability of the positive class.
    submission[target] = lr.predict_proba(test_data_features)[:,1]

In [30]:
import xgboost as xgb

In [31]:
# XGBoost classifier built with no explicit arguments; it is the estimator
# handed to the GridSearchCV objects created in the following cells.
xgb_model = xgb.XGBClassifier()

In [None]:
# Full hyper-parameter grid for the XGBoost search: 2 learning rates x
# 8 depths x 3 child weights x 3 subsample ratios x 11 class weights
# = 1584 candidate settings.
parameters = dict(
    objective=['binary:logistic'],
    learning_rate=[0.05, 0.005],
    max_depth=list(range(3, 11)),
    min_child_weight=[1, 5, 10],
    silent=[1],
    subsample=[0.6, 0.8, 1.0],
    scale_pos_weight=list(range(18, 30, 2)) + list(range(30, 51, 5)),
)

In [None]:
# GridSearchCV is otherwise only imported in a later cell, so import it here
# to let this cell run in top-to-bottom order.
from sklearn.model_selection import GridSearchCV

# Exhaustive grid search (despite the variable name) over the full
# `parameters` grid, scored by F1 and run on all available cores.
random_search = GridSearchCV(xgb_model, param_grid=parameters, scoring='f1', n_jobs=-1, verbose=3)

In [76]:
# Reduced grid for a quick search: everything is fixed except the
# positive-class weight, which takes two values.
param = {
    'objective': ['binary:logistic'],
    'max_depth': [4],
    'silent': [1],
    'subsample': [0.8],
    'scale_pos_weight': [10, 20],
}

In [33]:
# NOTE(review): num_round is not referenced by any other visible cell;
# presumably meant as a boosting-round count -- confirm or remove.
num_round = 2

In [77]:
from sklearn.model_selection import GridSearchCV

# Grid search over the reduced `param` grid, scored by F1, using every
# available core and printing per-fit progress.
clf = GridSearchCV(
    estimator=xgb_model,
    param_grid=param,
    scoring='f1',
    n_jobs=-1,
    verbose=3,
)

In [78]:
# Run the grid search on the training split, using only the 'toxic' label
# column as the target. The recorded run (3 folds x 2 candidates = 6 fits)
# was interrupted before it finished.
clf.fit(X_train,Y_train['toxic'])

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] objective=binary:logistic, scale_pos_weight=10, silent=1, max_depth=4, subsample=0.8 
[CV] objective=binary:logistic, scale_pos_weight=10, silent=1, max_depth=4, subsample=0.8 
[CV] objective=binary:logistic, scale_pos_weight=10, silent=1, max_depth=4, subsample=0.8 
[CV] objective=binary:logistic, scale_pos_weight=20, silent=1, max_depth=4, subsample=0.8 
[CV] objective=binary:logistic, scale_pos_weight=20, silent=1, max_depth=4, subsample=0.8 
[CV] objective=binary:logistic, scale_pos_weight=20, silent=1, max_depth=4, subsample=0.8 


Process PoolWorker-63:
Traceback (most recent call last):
Process PoolWorker-64:
Traceback (most recent call last):
  File "/Users/athena/anaconda2/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
  File "/Users/athena/anaconda2/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
    self.run()
  File "/Users/athena/anaconda2/lib/python2.7/multiprocessing/process.py", line 114, in run
  File "/Users/athena/anaconda2/lib/python2.7/multiprocessing/process.py", line 114, in run
    self._target(*self._args, **self._kwargs)
    self._target(*self._args, **self._kwargs)
  File "/Users/athena/anaconda2/lib/python2.7/multiprocessing/pool.py", line 102, in worker
  File "/Users/athena/anaconda2/lib/python2.7/multiprocessing/pool.py", line 102, in worker
    task = get()
  File "/Users/athena/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/pool.py", line 360, in get
    task = get()
  File "/Users/athena/anaconda2/lib/python2.7/site

KeyboardInterrupt: 

In [None]:
X_train