In [1]:
# Import libraries.  Not all of these needed for XGBoost, but may be explored later
import pandas as pd
import numpy as np
import xgboost as xgb
from tqdm import tqdm
from sklearn.svm import SVC
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

Using TensorFlow backend.


In [2]:
# Read the labelled training comments and the test comments from disk
train_csv, test_csv = './train.csv/train.csv', './test.csv/test.csv'
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

In [3]:
# Preview the first rows of the training data: id, raw comment text and the six label columns
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
# Preview the first rows of the test data: only id and comment text, no label columns
test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [5]:
# Set y to the six label columns.  They are selected by name rather than by position
# so an extra or reordered column in train.csv cannot silently shift the targets.
y = train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values

In [6]:
# Hold out 10% of the comments for validation (shuffled, fixed seed for repeatability).
# No stratification: passing stratify= raised an error -- open question whether the
# multi-label y needs to be label encoded first.
X_train, X_test, y_train, y_test = train_test_split(
    train.comment_text.values,
    y,
    test_size=0.1,
    shuffle=True,
    random_state=5
)

In [7]:
# Show how many comments landed in each split (training first, then hold-out)
for split in (X_train, X_test):
    print(split.shape)

(143613,)
(15958,)


## Preprocessing

In [8]:
# TF-IDF features over word 1- to 3-grams: accents are stripped, English stop words
# removed, and terms seen in fewer than 3 documents dropped.
# use_idf / smooth_idf / sublinear_tf are boolean options, so pass real booleans;
# scikit-learn releases that validate parameter types reject the integer 1.
tfidf = TfidfVectorizer(min_df=3, max_features=None,
            strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=True,
            stop_words='english')

# Learn the vocabulary / IDF weights from the training split only and vectorize it in
# the same pass (equivalent to fit() followed by transform(), without tokenizing twice)
X_train_vec = tfidf.fit_transform(X_train)

# Vectorize the hold-out split with the vocabulary learned from the training split
X_test_vec = tfidf.transform(X_test)

## Modeling

In [10]:
# For our multilabel classification, we need OneVsRestClassifier
from sklearn.multiclass import OneVsRestClassifier

In [11]:
# Configure the XGBoost gradient-boosted tree classifier used as the base learner
clf = xgb.XGBClassifier(
    learning_rate=0.1,
    n_estimators=200,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.8,
    nthread=10
)

# Wrap it in a one-vs-rest classifier so each label column gets its own binary model
OVRC = OneVsRestClassifier(estimator=clf, n_jobs=-1)

# Fit on the training split.  The TF-IDF output is already a SciPy sparse matrix;
# tocsc() only switches it to compressed-sparse-column layout before fitting.
OVRC.fit(X_train_vec.tocsc(), y_train)



OneVsRestClassifier(estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=7,
       min_child_weight=1, missing=None, n_estimators=200, nthread=10,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.8),
          n_jobs=-1)

In [13]:
# Score the training split.  This is a harsh metric for multilabel targets: a comment
# only counts as correct when all 6 labels are predicted right.
OVRC.score(X_train_vec.tocsc(), y_train)

0.93247129438142784

**Pretty good!**

In [14]:
# Score the held-out split (X_test from train_test_split, not the Kaggle test file).
# Same harsh metric: a comment only counts as correct when all 6 labels are right.
OVRC.score(X_test_vec.tocsc(), y_test)

0.91960145381626768

**Also pretty good!**

## Submissions

In [31]:
# Load the sample submission .csv to see the expected output format, then preview its first rows
submission = pd.read_csv('./sample_submission.csv/sample_submission.csv')
submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.5,0.5,0.5,0.5,0.5,0.5
1,0000247867823ef7,0.5,0.5,0.5,0.5,0.5,0.5
2,00013b17ad220c46,0.5,0.5,0.5,0.5,0.5,0.5
3,00017563c3f7919a,0.5,0.5,0.5,0.5,0.5,0.5
4,00017695ad8997eb,0.5,0.5,0.5,0.5,0.5,0.5


In [18]:
# Make X the comment text of the entire training set (no hold-out), then display it
X = train.comment_text.values
X

array([ "Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",
       "D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)",
       "Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",
       ...,
       'Spitzer \n\nUmm, theres no actual article for prostitution ring.  - Crunch Captain.',
       'And it looks like it was actually you who put on the speedy to have the first version deleted now that I look at it.',
       '"\nAnd ... I really don\'t think you understand.  I came here and my idea was bad right away.  What kind of communi

In [20]:
# Refit the TF-IDF vectorizer on the entire training set and vectorize it in one pass.
# fit_transform() is equivalent to fit() followed by transform() but tokenizes the text once.
# Note: this replaces the vocabulary learned from the 90% split, so X_train_vec / X_test_vec
# no longer match this vectorizer once the cell has run.
train_vec = tfidf.fit_transform(X)


AttributeError: 'TfidfVectorizer' object has no attribute 'transfrom'

**Spelling error (`transfrom` instead of `transform`), since fixed - moved line 6 to a different cell so I don't have to refit the vectorizer**

In [22]:
# Transform the Kaggle test comments with the vectorizer fitted on the full training set
test_vec = tfidf.transform(test.comment_text.values)

In [27]:
# Refit the same one-vs-rest classifier on the entire training set (train_vec, all labels in y)
OVRC.fit(train_vec.tocsc(), y)

OneVsRestClassifier(estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=7,
       min_child_weight=1, missing=None, n_estimators=200, nthread=10,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.8),
          n_jobs=-1)

In [28]:
# Predict per-label probabilities for every test comment (one column per label)
test_predictions = OVRC.predict_proba(test_vec.tocsc())

In [29]:
# Look at shape of predictions: expect one row per test comment and one column per label
test_predictions.shape

(153164, 6)

In [32]:
# Look at the sample submission's shape.  It has one more column than the predictions,
# so the predictions are still missing the id column.
submission.shape

(153164, 7)

In [33]:
# Label column names for the submission dataframe, in the same order as the label columns of train
columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [35]:
# Look again at the first rows of test to see which columns the submission can reuse
test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [36]:
# Look at test shape; the row count should match the number of prediction rows
test.shape

(153164, 2)

In [37]:
# Drop the comment text column so test keeps only what the submission needs.
# Rebinding (instead of inplace=True) with errors='ignore' makes the cell safe to re-run:
# a second run is a no-op rather than a KeyError.
test = test.drop('comment_text', axis=1, errors='ignore')

In [38]:
# Wrap the probability array in a dataframe, naming each column after its label
test_predictions_df = pd.DataFrame(test_predictions, columns=columns)

In [39]:
# Look at the first rows of the predictions dataframe - it still needs the id column
test_predictions_df.head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0.9912,0.718928,0.994746,0.008066,0.928549,0.232731
1,0.042341,0.002868,0.01442,0.000767,0.02074,0.003236
2,0.03705,0.001989,0.010532,0.00064,0.015684,0.003236
3,0.01516,0.001036,0.007644,0.000686,0.007423,0.00076
4,0.055321,0.002386,0.011534,0.002511,0.018812,0.002084


In [40]:
# Concatenate the test ids with the predictions dataframe to make the submission dataframe.
# Selecting ['id'] explicitly keeps the result correct whether or not comment_text has
# already been dropped from test.
sub_df = pd.concat([test[['id']], test_predictions_df], axis=1)

# concat aligns on the index, so any misalignment would show up as extra rows
assert len(sub_df) == len(test) == len(test_predictions_df)

# Look at the first rows only instead of rendering the whole dataframe
sub_df.head(10)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.991200,0.718928,0.994746,0.008066,0.928549,0.232731
1,0000247867823ef7,0.042341,0.002868,0.014420,0.000767,0.020740,0.003236
2,00013b17ad220c46,0.037050,0.001989,0.010532,0.000640,0.015684,0.003236
3,00017563c3f7919a,0.015160,0.001036,0.007644,0.000686,0.007423,0.000760
4,00017695ad8997eb,0.055321,0.002386,0.011534,0.002511,0.018812,0.002084
5,0001ea8717f6de06,0.017536,0.000926,0.008971,0.000561,0.010092,0.002713
6,00024115d4cbde0f,0.011623,0.000432,0.008804,0.000362,0.007285,0.001346
7,000247e83dcc1211,0.116876,0.002868,0.014609,0.000706,0.020740,0.003236
8,00025358d4737918,0.112714,0.001302,0.014880,0.000749,0.030965,0.001580
9,00026d1092fe71cc,0.020991,0.000280,0.005518,0.000103,0.006186,0.001273


In [61]:
# Save the submission file; index=False keeps the row index out of the .csv
sub_df.to_csv('./sub.csv', index=False)

**We scored 0.9697 ROC AUC in the Kaggle competition - less than 0.02 below the top submission as of 2/14/18.**

## Let's compare with scikit-learn's regular GradientBoostingClassifier

In [41]:
# Import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [42]:
# Instantiate GradientBoostingClassifier with the hyperparameters it shares with the
# XGBoost model (learning rate, number of trees, row subsampling, tree depth).
# subsample < 1 makes the fit stochastic, so pin random_state for repeatable results.
gradient_boost = GradientBoostingClassifier(learning_rate=0.1, n_estimators=200, subsample=0.8,
                                            max_depth=7, random_state=0)

In [43]:
# Wrap Gradient Boost in a OneVsRestClassifier, using the same n_jobs=-1 setting as the XGBoost wrapper
OVRC_gradient = OneVsRestClassifier(gradient_boost, n_jobs=-1)

In [44]:
# Fit Gradient Boost on the entire vectorized training set.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt, so no fitted model or score exists.
OVRC_gradient.fit(train_vec, y)

KeyboardInterrupt: 

In [46]:
# Check the shape of test after comment_text was dropped
test.shape

(153164, 1)