In [13]:
import pandas as pd
import numpy as np
import pickle
import nltk

## Environments

In [2]:
import os

# Location of the balanced toxic-comment CSV.
# Set the TOXIC_DATA_PATH environment variable to run the notebook on another
# machine; when it is unset the original local path is used unchanged.
DATA_PATH = os.environ.get(
    "TOXIC_DATA_PATH",
    "/Users/xiaoma/Desktop/kaggle/toxicComments/data/toxic_balanced.csv",
)

## Read data

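This template assumes the data has already been cleaned and that it contains a `comment_text` column (the raw text) and a `toxic` label column.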

In [17]:
data = pd.read_csv(DATA_PATH)
# Sample the data for computation efficiency.
# - random_state makes the sample (and everything derived from it) reproducible
#   across kernel restarts.
# - min(...) keeps the cell from raising when the file has fewer than 2000 rows.
data = data.sample(n=min(2000, len(data)), random_state=923)

In [18]:
# Preview the first rows of the sampled frame.
data.head()

Unnamed: 0.1,Unnamed: 0,comment_text,toxic
20788,20788,"""Do you have any sources regarding Boca Acton?...",0
20731,20731,Using your knowledge as a reference... \n\nNoe...,0
27535,27535,Requested move\n \n\nTakajin → – Alike case t...,0
2248,2248,Mujahedins \n\nWhats wrong with Mujjahedin? Ce...,1
1256,1256,Proper Cunt \n\nNot my fault you are a silly c...,1


## Split into training and testing sets

In [19]:
"""
Training and testing split
"""
import random
from sklearn.model_selection import train_test_split

random.seed(923)

# Hold out 25% of the rows for testing; random_state pins the shuffle.
split_parts = train_test_split(
    data.comment_text,
    data.toxic,
    test_size=0.25,
    random_state=23,
)

# Drop the shuffled original row labels so every part is indexed 0..n-1.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True) for part in split_parts
)

In [20]:
# Number of training documents produced by the split.
X_train.shape

(1500,)

## Vectorize each document and create DTM

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from tokenizer_xm import text_tokenizer_xm, contractions

# Single-argument adapter around text_tokenizer_xm so it can be handed to the vectorizer
def tokenizer(text):
    """Pre-process one document with ``text_tokenizer_xm``.

    The project tokenizer is built with lemmatisation and stemming switched on
    and with the imported ``contractions`` mapping, then its ``txt_pre_pros()``
    result is returned unchanged.

    text: the raw document to process.
    Returns whatever ``txt_pre_pros()`` yields (presumably a list of tokens,
    since this function is used as a vectorizer tokenizer -- TODO confirm).
    """
    preprocessor = text_tokenizer_xm(
        text=text,
        lemma_flag=True,
        stem_flag=True,
        contractions=contractions,
    )
    return preprocessor.txt_pre_pros()

# Fit the TF-IDF vectorizer (unigrams only; terms must appear in at least 15
# documents and in at most 90% of them).
vec_count = TfidfVectorizer(ngram_range = (1,1),tokenizer=tokenizer,min_df = 15, max_df = 0.9)

# Fit and build the training document-term matrix in a single pass.
# fit() followed by transform() would run the lemmatising/stemming tokenizer
# over every training document twice; fit_transform() gives the same matrix.
train_dtm = vec_count.fit_transform(X_train)

# fit() returns the estimator itself, so these names all refer to the fitted
# vectorizer; they are kept because later cells use vec_f.
vec_count_f = vec_count
vec_f = vec_count_f

## Append additional features

As an example of adding extra features, we append LDA topic weights to the TF-IDF term columns.

In [29]:
from sklearn.decomposition import LatentDirichletAllocation

# Column names for the term features.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old method on older installations.
if hasattr(vec_f, "get_feature_names_out"):
    feature_names = list(vec_f.get_feature_names_out())
else:
    feature_names = vec_f.get_feature_names()

# Convert the sparse DTM to a dense dataframe so more features can be appended
train_input = pd.DataFrame(train_dtm.toarray(), columns=feature_names)

# Calculate the LDA scores.
# random_state pins the topic model so the appended features (and the model
# trained on them) are reproducible across runs.
topic_num = 5
lda = LatentDirichletAllocation(n_components = topic_num, random_state = 23)
lda_f = lda.fit(train_dtm)
lda_weights = lda_f.transform(train_dtm)

# Append the new features: one column per topic
for i in range(topic_num):
    train_input['lda_topic_{}'.format(i)] = lda_weights[:,i]

## Input into model

In [30]:
from xgboost import XGBClassifier

# Gradient-boosted classifier with 200 trees; every other setting is the library default.
xgbc = XGBClassifier(n_estimators=200)

# Train on the term columns plus the appended LDA topic columns.
xgbc_lda = xgbc.fit(train_input, y_train)

## Scoring

In [31]:
# Create the test DTM with the vectorizer that was fitted on the training set
test_dtm = vec_f.transform(X_test)

# Column names for the term features.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old method on older installations.
if hasattr(vec_f, "get_feature_names_out"):
    test_feature_names = list(vec_f.get_feature_names_out())
else:
    test_feature_names = vec_f.get_feature_names()

# Transform into a dataframe with the same term columns as the training input
test_input = pd.DataFrame(test_dtm.toarray(), columns=test_feature_names)

# Get LDA weights for test data from the LDA model fitted on the training DTM
test_lda_weights = lda_f.transform(test_dtm)

# Append the new features: one column per topic
for i in range(topic_num):
    test_input['lda_topic_{}'.format(i)] = test_lda_weights[:,i]

# Make predictions
pred_lda = xgbc_lda.predict(test_input)

# Get accuracy: fraction of test rows whose predicted label equals the true label
(pred_lda == y_test).mean()

0.85

If the results are satisfactory, the following need to be saved for reuse:
1. Trained Model
2. Tokenizer
3. Fitted vectorizer
4. Fitted LDA

In [10]:
import cloudpickle

# Trained model
with open('xgb_model.pkl','wb') as file:
    pickle.dump(xgbc_lda,file)

# Tokenizer.
# The tokenizer is a function defined in this notebook (__main__). Plain pickle
# stores only a reference to its name, so the file could not be loaded from
# another process; cloudpickle serialises the function itself.
# The loading environment still needs the tokenizer_xm package installed.
with open('xgb_tokenizer.pkl','wb') as file:
    cloudpickle.dump(tokenizer,file)

# Fitted vectorizer
with open('xgb_vectorizer.pkl','wb') as file:
    # Use cloudpickle because the vectorizer holds the notebook-defined tokenizer
    cloudpickle.dump(vec_f,file)

# Fitted LDA
with open("xgb_lda.pkl",'wb') as file:
    pickle.dump(lda_f,file)

In [None]:
"""
Draft
"""
# Tiny hand-made frame: the same sentence repeated three times in a comment_text column.
sample_sentence = "My name is Marshal"
fake = pd.DataFrame({"comment_text": [sample_sentence] * 3})
fake

In [None]:
# Pass the text column, not the whole frame: iterating a DataFrame yields its
# column labels, so transform(fake) would vectorise the single string
# "comment_text" instead of the three documents.
vec_f.transform(fake["comment_text"])

In [None]:
# Show the first test document (label 0 exists because the split cell resets the index).
X_test[0]

In [12]:
# Save the sampled data for reuse.
# index=False keeps the row index out of the file; writing it adds one more
# "Unnamed: 0"-style column every time the CSV is saved and read back.
data.to_csv("toxic_balanced_sampled.csv", index=False)