In [30]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
%matplotlib inline


In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


# First model: simple TF_IDF

In [16]:
train["comment_text"].values

array([ "Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",
       "D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)",
       "Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",
       ...,
       'Spitzer \n\nUmm, theres no actual article for prostitution ring.  - Crunch Captain.',
       'And it looks like it was actually you who put on the speedy to have the first version deleted now that I look at it.',
       '"\nAnd ... I really don\'t think you understand.  I came here and my idea was bad right away.  What kind of communi

In [4]:
X = train['comment_text'].values
Y = train[['toxic', 'severe_toxic', 'obscene','threat','insult','identity_hate']]

In [5]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=42)

In [38]:
y_test_np = np.array(y_test)
y_test_np[:,0].shape
y_train.shape
y_test.shape
x_train.shape

(95742,)

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


In [19]:
tfidf_v = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')
tfidf_v.fit(list(x_train)+list(x_test))

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=1,
        stop_words='english', strip_accents='unicode', sublinear_tf=1,
        token_pattern='\\w{1,}', tokenizer=None, use_idf=1,
        vocabulary=None)

In [20]:
xtrain_tfv =  tfidf_v.transform(x_train) 
xvalid_tfv = tfidf_v.transform(x_test)

In [43]:
print (xtrain_tfv.shape)
print (y_train.shape)
y_train.shape

(95742, 360873)
(95742, 6)


(95742, 6)

In [44]:
def multiclass_logloss(y_test, predicted, eps):
    clip = np.clip(predicted, eps, 1 - eps)
    rows = y_test.shape[0]
    vsota = np.sum(y_test * np.log(clip))
    return -1.0 / rows * vsota

# Fitting a simple Logistic Regression on TFIDF

In [20]:
labels = ['toxic',
 'severe_toxic',
 'obscene',
 'threat',
 'insult',
 'identity_hate']


In [34]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(C=1.0)
predictions = []
for label in labels:
    clf.fit(xtrain_tfv, y_train[label])
    prediction = clf.predict(xvalid_tfv)
    print (prediction.shape)
    predictions.append(prediction)



(63829,)
(63829,)
(63829,)
(63829,)
(63829,)
(63829,)


In [43]:
pre = np.array(predictions)
len(pre)
for i in range(len(pre)):
    print (sum(pre[i] == y_test_np[:,0])/len(pre[i]))

0.948910369895
0.906030174372
0.93230349841
0.903570477369
0.927556439863
0.904212818625


In [33]:
pre.shape 

(6, 63829)

## Use word counts as features.

In [8]:
count_v = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), stop_words = 'english')

In [9]:
count_v.fit(list(x_train) + list(x_test))

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='\\w{1,}', tokenizer=None,
        vocabulary=None)

In [10]:
x_train_ctv = count_v.transform(x_train)
x_test_ctv = count_v.transform(x_test)

In [13]:
labels = ['toxic',
 'severe_toxic',
 'obscene',
 'threat',
 'insult',
 'identity_hate']

from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(C=1.0)
predictions = []
for label in labels:
    clf.fit(x_train_ctv, y_train[label])
    prediction = clf.predict(x_test_ctv)
    print (prediction.shape)
    predictions.append(prediction)

(63829,)
(63829,)
(63829,)
(63829,)
(63829,)
(63829,)


In [None]:
pre = np.array(predictions)
len(pre)
for i in range(len(pre)):
    print (sum(pre[i] == y_test_np[:,0])/len(pre[i]))

In [15]:
svd = decomposition.TruncatedSVD(n_components=120)

In [21]:
svd.fit(xtrain_tfv)

TruncatedSVD(algorithm='randomized', n_components=120, n_iter=5,
       random_state=None, tol=0.0)

In [45]:
# apply svd
xtrain_tfv_svd = svd.transform(xtrain_tfv)
xvalid_tfv_svd = svd.transform(xvalid_tfv)
# scale the data
scale = preprocessing.StandardScaler()
scale.fit(xtrain_tfv_svd)
xtrain_tfv_svd_scl = scale.transform(xtrain_tfv_svd)

scale.fit(xvalid_tfv_svd)
xtest_tfv_svd_scl = scale.transform(xvalid_tfv_svd)
# xtrain_ctv_svd = svd.transform(x_train_ctv)
# xtrain_ctv_svd = svd.transform(x_test_ctv)

In [44]:
xtrain_tfv_svd.shape

(95742, 120)

## fitting a simple svm using scaled data from svd

In [46]:
print (xtrain_tfv_svd_scl.shape)
y_train.shape

(95742, 120)


(95742, 6)

In [None]:
clf = SVC(C=1.0, probability=True) # since we need probabilities
predictions = []
for label in labels:
    clf.fit(xtrain_tfv_svd_scl, y_train[label])
    prediction = clf.predict(xtrain_tfv_svd_scl)
    print (prediction.shape)
    predictions.append(prediction)


In [29]:
# apply svd
xtrain_ctv_svd = svd.transform(x_train_ctv)
xvalid_ctv_svd = svd.transform(x_test_ctv)
# scale the data
scale = preprocessing.StandardScaler()
scale.fit(xtrain_ctv_svd)
xtrain_tfv_svd_scl = scale.transform(xtrain_ctv_svd)

scale.fit(xvalid_ctv_svd)
xtrain_tfv_svd_scl = scale.transform(xvalid_ctv_svd)
# xtrain_ctv_svd = svd.transform(x_train_ctv)
# xtrain_ctv_svd = svd.transform(x_test_ctv)

ValueError: dimension mismatch

## fitting with xgboost on tfidf

## fitting with 