In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# from tqdm import tqdm
from sklearn.svm import SVC
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    :param actual: Array containing the actual target classes
    :param predicted: Matrix with class predictions, one probability per class
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

In [5]:
y = train["toxic"]

In [6]:
xtrain, xvalid, ytrain, yvalid = train_test_split(train.comment_text.values, y, 
                                                  stratify=y, 
                                                  random_state=42, 
                                                  test_size=0.1, shuffle=True)

# TF-IDF model

In [7]:
# Always start with these features. They work (almost) everytime!
tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')

# Fitting TF-IDF to both training and test sets (semi-supervised learning)
tfv.fit(list(xtrain) + list(xvalid))
xtrain_tfv =  tfv.transform(xtrain) 
xvalid_tfv = tfv.transform(xvalid)

# Logistic regression

In [8]:
# Fitting a simple Logistic Regression on TFIDF
clf = LogisticRegression(C=1.0)
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.123 


## precision

In [9]:
precision = sum(predictions == yvalid)/len(yvalid)
precision


TypeError: invalid type comparison

# Naive bayes on tfidf

In [10]:
# Fitting a simple Naive Bayes on TFIDF
clf = MultinomialNB()
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.268 


# word count model

In [11]:
ctv = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), stop_words = 'english')

# Fitting Count Vectorizer to both training and test sets (semi-supervised learning)
ctv.fit(list(xtrain) + list(xvalid))
xtrain_ctv =  ctv.transform(xtrain) 
xvalid_ctv = ctv.transform(xvalid)

In [33]:
type(xvalid_ctv)
xtrain_ctv.shape

(143613, 6940810)

## logistic on word count

In [12]:
# Fitting a simple Logistic Regression on word Counts
clf = LogisticRegression(C=1.0)
clf.fit(xtrain_ctv, ytrain)
predictions = clf.predict_proba(xvalid_ctv)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.136 


## naive bayes on word count

In [13]:
# Fitting a simple Naive Bayes on wordcount
clf = MultinomialNB()
clf.fit(xtrain_ctv, ytrain)
predictions = clf.predict_proba(xvalid_ctv)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.773 


## glove model

In [84]:
import os
import zipfile
import pandas as pd
import numpy as np
import csv
import gensim

In [86]:
pre_trained=True
GLOVE_PATH = os.path.dirname(os.getcwd()) + '/models/glove.840B.300d.zip'
GLOVE_FILE = 'glove.840B.300d.txt'  # can change it to glove.6B.50d.txt, glove.6B.100d.txt, glove.6B.200d.txt
# load the file
if pre_trained:
    glove = zipfile.ZipFile(GLOVE_PATH, 'r')
    words = pd.read_table(glove.open(GLOVE_FILE), sep=" ", index_col=0, header=None, quoting=csv.QUOTE_NONE)
    model = words.as_matrix()
else:
    model = None
# build dictionary
if model is not None:
    dict1 = {word: i for i, word in enumerate(words.index)}
else:
    dict1 = {}

In [90]:
def sentence_vectorize(s):
    words = str(s).lower()
#     words = word_tokenize(words)
#     words = [w for w in words if not w in stop_words]
#     words = [w for w in words if w.isalpha()]
    M = []
    for w in s:
        try:
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(300)
    return v / np.sqrt((v ** 2).sum())

In [91]:
xtrain_glv = [sentence_vectorize(sent) for sent in xtrain]

In [94]:
xvalid_glv = [sentence_vectorize(sent) for sent in xvalid]

In [95]:
xtrain_glv = np.array(xtrain_glv)
xvalid_glv = np.array(xvalid_glv)

In [80]:
import word_embed_f as we
from tqdm import tqdm

In [81]:
glove = we.Glove_x()

In [54]:
xvalid_glv = []
for word in xvalid:
    xvalid_glv.append(glove.sentence_vectorize(word))

In [55]:
xtrain_glv = np.array(xtrain_glv)
xvalid_glv = np.array(xvalid_glv)

## Glove on Logistic regression

In [96]:
clf = LogisticRegression(C=1.0)
clf.fit(xtrain_glv, ytrain)
predictions = clf.predict_proba(xvalid_glv)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.244 


In [98]:
predictions

array([[ 0.96917803,  0.03082197],
       [ 0.89498116,  0.10501884],
       [ 0.93664108,  0.06335892],
       ..., 
       [ 0.9673008 ,  0.0326992 ],
       [ 0.94989033,  0.05010967],
       [ 0.9230071 ,  0.0769929 ]])

## word2vec model

## Glove on multinimial NB

In [None]:
clf = MultinomialNB()
clf.fit(xtrain_glv, ytrain)
predictions = clf.predict_proba(xvalid_glv)
print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

In [99]:
xtrain_glv

array([[-0.0223094 ,  0.04725993, -0.08643805, ..., -0.02666451,
         0.02380785,  0.0406745 ],
       [-0.03126372,  0.05135206, -0.08620077, ..., -0.0322899 ,
         0.02342631,  0.04898935],
       [-0.02778894,  0.05132752, -0.08328366, ..., -0.01854216,
         0.02830789,  0.05048427],
       ..., 
       [-0.02505256,  0.05282044, -0.08457234, ..., -0.02770276,
         0.03121814,  0.04294974],
       [-0.02544791,  0.04986931, -0.08906916, ..., -0.01282757,
         0.01520749,  0.03127778],
       [-0.03230448,  0.06307249, -0.08154946, ..., -0.03008283,
         0.02679808,  0.06045382]], dtype=float32)

## xgboost on tfidf

In [None]:
# Fitting a simple xgboost on glove features
import xgboost as xgb
clf = xgb.XGBClassifier(nthread=10, silent=False)
clf.fit(xtrain_glove, ytrain)
predictions = clf.predict_proba(xvalid_glove)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))