# Comment Toxicity Challenge

In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from bert_sklearn import BertClassifier
import time

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


### Reading in all datasets
Here, we read in our datasets. We're only going to use the training set given by Kaggle, and we'll eventually end up splitting up the data with a 5% testing and 95% training split. 

In [2]:
train_set = pd.read_csv("train.csv")

### Cleaning up the data
In this step, we clean up the data. We use Python's regex library to do that (it takes in a regular expression and transforms it to the programmer's specifications.) For now, we only really want letters in our dataset - anything else is going to be replaced by a space. 

In [3]:
regex = re.compile('[^a-zA-Z]')
fixed = []
size = len(train_set.index)-1
size_that_my_computer_can_handle = 5000
starting_point = 0
for i in range(starting_point, starting_point+size_that_my_computer_can_handle):
    temp = regex.sub(' ', train_set['comment_text'][i])
    temp = temp.lower()
    fixed.append(temp)

In [4]:
X = TfidfVectorizer().fit_transform(fixed).toarray()
y_tox = train_set.iloc[starting_point:size_that_my_computer_can_handle+starting_point,2].values
y_sev_tox = train_set.iloc[starting_point:size_that_my_computer_can_handle+starting_point,3].values
y_obs = train_set.iloc[starting_point:size_that_my_computer_can_handle+starting_point,4].values
y_threat = train_set.iloc[starting_point:size_that_my_computer_can_handle+starting_point,5].values
y_insult = train_set.iloc[starting_point:size_that_my_computer_can_handle+starting_point,6].values
y_hate = train_set.iloc[starting_point:size_that_my_computer_can_handle+starting_point,7].values

### Gaussian Naive Bayes
Here, we assign, then run (in the next step) the Gaussian Naive Bayes model on our data set.

In [5]:
tests = {'y_tox' : y_tox, 
         'y_sev_tox' : y_sev_tox, 
         'y_obs' : y_obs, 
         'y_threat' : y_threat, 
         'y_insult' : y_insult, 
         'y_hate' : y_hate}

gnb_models = {'y_tox' : GaussianNB(), 
          'y_sev_tox' : GaussianNB(),
          'y_obs' : GaussianNB(),
          'y_threat' : GaussianNB(),
          'y_insult' : GaussianNB(),
          'y_hate' : GaussianNB()}

preds = {}

test_names = ['y_tox', 'y_sev_tox', 'y_obs', 'y_threat', 'y_insult', 'y_hate']

In [10]:
for i in test_names:
    print(i)
    X_train, X_test, y_train, y_test = train_test_split(X, tests[i])
    t0=time.time()
    gnb_models[i].fit(X_train, y_train)
    print("training time:", (time.time() - t0), "s")
    t1=time.time()
    preds[i] = gnb_models[i].predict(X_test)
    print("test time:", (time.time() - t1), "s")
    print(confusion_matrix(y_test, preds[i]))
    print(accuracy_score(y_test, preds[i]))

y_tox
training time: 1.6362462043762207 s
test time: 0.6083252429962158 s
[[1034  113]
 [  59   44]]
0.8624
y_sev_tox
training time: 1.6341707706451416 s
test time: 0.6078751087188721 s
[[1220    9]
 [  20    1]]
0.9768
y_obs
training time: 1.5643830299377441 s
test time: 0.6018581390380859 s
[[1113   69]
 [  49   19]]
0.9056
y_threat
training time: 1.587792158126831 s
test time: 0.5936472415924072 s
[[1238    3]
 [   9    0]]
0.9904
y_insult
training time: 1.5909390449523926 s
test time: 0.614112138748169 s
[[1130   55]
 [  51   14]]
0.9152
y_hate
training time: 1.633971929550171 s
test time: 0.6051449775695801 s
[[1221   15]
 [  14    0]]
0.9768


### SVM Model
Here, we assign, then run (in the next step) the Support Vector Machine model on our data set.

In [7]:
svm_models = {'y_tox' : SVC(gamma = 'scale'), 
          'y_sev_tox' : SVC(gamma = 'scale'),
          'y_obs' : SVC(gamma = 'scale'),
          'y_threat' : SVC(gamma = 'scale'),
          'y_insult' : SVC(gamma = 'scale'),
          'y_hate' : SVC(gamma = 'scale')}

In [8]:
for i in test_names:
    print(i)
    X_train, X_test, y_train, y_test = train_test_split(X, tests[i])
    t0=time.time()
    svm_models[i].fit(X_train, y_train)
    print("training time:", (time.time() - t0), "s")
    t1=time.time()
    preds[i] = svm_models[i].predict(X_test)
    print("test time:", (time.time() - t1), "s")
    print(confusion_matrix(y_test, preds[i]))
    print(accuracy_score(y_test, preds[i]))

y_tox
training time: 268.9495961666107 s
test time: 69.3998601436615 s
[[1111    1]
 [ 114   24]]
0.908
y_sev_tox
training time: 89.44266390800476 s
test time: 30.478352069854736 s
[[1240    0]
 [  10    0]]
0.992
y_obs
training time: 184.3041708469391 s
test time: 60.187963247299194 s
[[1184    0]
 [  53   13]]
0.9576
y_threat
training time: 64.4728319644928 s
test time: 21.248082876205444 s
[[1246    0]
 [   4    0]]
0.9968
y_insult
training time: 166.2928750514984 s
test time: 55.5584282875061 s
[[1197    2]
 [  41   10]]
0.9656
y_hate
training time: 107.45139002799988 s
test time: 35.520689725875854 s
[[1240    0]
 [  10    0]]
0.992


### K Nearest Neighbors Model
Here, we assign, then run (in the next step) the K Nearest Neighbors model on our data set.

In [6]:
k_nearest_models = {'y_tox' : KNeighborsClassifier(), 
          'y_sev_tox' : KNeighborsClassifier(),
          'y_obs' : KNeighborsClassifier(),
          'y_threat' : KNeighborsClassifier(),
          'y_insult' : KNeighborsClassifier(),
          'y_hate' : KNeighborsClassifier()}

In [7]:
for i in test_names:
    print(i)
    X_train, X_test, y_train, y_test = train_test_split(X, tests[i])
    t0=time.time()
    k_nearest_models[i].fit(X_train, y_train)
    print("training time:", (time.time() - t0), "s")
    t1=time.time()
    preds[i] = k_nearest_models[i].predict(X_test)
    print("test time:", (time.time() - t1), "s")
    print(confusion_matrix(y_test, preds[i]))
    print(accuracy_score(y_test, preds[i]))

y_tox
training time: 3.5514402389526367 s
test time: 130.8472011089325 s
[[1102   11]
 [  81   56]]
0.9264
y_sev_tox
training time: 3.546265125274658 s
test time: 133.85179901123047 s
[[1231    1]
 [  16    2]]
0.9864
y_obs
training time: 3.6207759380340576 s
test time: 132.8175048828125 s
[[1178    6]
 [  27   39]]
0.9736
y_threat
training time: 3.6073989868164062 s
test time: 131.61208963394165 s
[[1245    0]
 [   5    0]]
0.996
y_insult
training time: 3.5680761337280273 s
test time: 132.30676770210266 s
[[1182    3]
 [  50   15]]
0.9576
y_hate
training time: 3.5873069763183594 s
test time: 130.76220393180847 s
[[1242    0]
 [   8    0]]
0.9936


### Random Forest Model
Here, we assign, then run (in the next step) the Random Forest model on our data set.

In [8]:
random_forest_models = {'y_tox' : RandomForestClassifier(), 
          'y_sev_tox' : RandomForestClassifier(),
          'y_obs' : RandomForestClassifier(),
          'y_threat' : RandomForestClassifier(),
          'y_insult' : RandomForestClassifier(),
          'y_hate' : RandomForestClassifier()}

In [9]:
for i in test_names:
    print(i)
    X_train, X_test, y_train, y_test = train_test_split(X, tests[i])
    t0=time.time()
    random_forest_models[i].fit(X_train, y_train)
    print("training time:", (time.time() - t0), "s")
    t1=time.time()
    preds[i] = random_forest_models[i].predict(X_test)
    print("test time:", (time.time() - t1), "s")
    print(confusion_matrix(y_test, preds[i]))
    print(accuracy_score(y_test, preds[i]))

y_tox




training time: 5.248516798019409 s
test time: 0.12573480606079102 s
[[1123    1]
 [ 100   26]]
0.9192
y_sev_tox




training time: 3.248354911804199 s
test time: 0.06232595443725586 s
[[1236    0]
 [  14    0]]
0.9888
y_obs




training time: 6.632544040679932 s
test time: 0.06646108627319336 s
[[1189    0]
 [  46   15]]
0.9632
y_threat




training time: 2.1768083572387695 s
test time: 0.06307601928710938 s
[[1247    0]
 [   3    0]]
0.9976
y_insult




training time: 6.422931909561157 s
test time: 0.06673407554626465 s
[[1180    0]
 [  59   11]]
0.9528
y_hate




training time: 3.5242860317230225 s
test time: 0.0648648738861084 s
[[1236    0]
 [  14    0]]
0.9888


### Decision Tree Model
Here, we assign, then run (in the next step) the Decision Tree model on our data set.

In [10]:
decision_tree_models = {'y_tox' : DecisionTreeClassifier(), 
          'y_sev_tox' : DecisionTreeClassifier(),
          'y_obs' : DecisionTreeClassifier(),
          'y_threat' : DecisionTreeClassifier(),
          'y_insult' : DecisionTreeClassifier(),
          'y_hate' : DecisionTreeClassifier()}

In [None]:
for i in test_names:
    print(i)
    X_train, X_test, y_train, y_test = train_test_split(X, tests[i])
    t0=time.time()
    decision_tree_models[i].fit(X_train, y_train)
    print("training time:", (time.time() - t0), "s")
    t1=time.time()
    preds[i] = decision_tree_models[i].predict(X_test)
    print("test time:", (time.time() - t1), "s")
    print(confusion_matrix(y_test, preds[i]))
    print(accuracy_score(y_test, preds[i]))

y_tox
training time: 281.47301292419434 s
test time: 0.052870988845825195 s
[[1095   26]
 [  61   68]]
0.9304
y_sev_tox
training time: 59.574655055999756 s
test time: 0.18314409255981445 s
[[1231    6]
 [  11    2]]
0.9864
y_obs
training time: 2393.996304988861 s
test time: 0.05564689636230469 s
[[1171    6]
 [  30   43]]
0.9712
y_threat
training time: 16.826676845550537 s
test time: 0.057665109634399414 s
[[1244    0]
 [   6    0]]
0.9952
y_insult


### Logistic Regression
Here, we assign, then run (in the next step) the Logistic Regression model on our data set.

In [None]:
logistic_regression_models = {'y_tox' : LogisticRegression(solver = 'lbfgs', max_iter = 200), 
          'y_sev_tox' : LogisticRegression(solver = 'lbfgs', max_iter = 200),
          'y_obs' : LogisticRegression(solver = 'lbfgs', max_iter = 200),
          'y_threat' : LogisticRegression(solver = 'lbfgs', max_iter = 200),
          'y_insult' : LogisticRegression(solver = 'lbfgs', max_iter = 200),
          'y_hate' : LogisticRegression(solver = 'lbfgs', max_iter = 200)}

In [None]:
for i in test_names:
    print(i)
    X_train, X_test, y_train, y_test = train_test_split(X, tests[i])
    t0=time.time()
    logistic_regression_models[i].fit(X_train, y_train)
    print("training time:", (time.time() - t0), "s")
    t1=time.time()
    preds[i] = logistic_regression_models[i].predict(X_test)
    print("test time:", (time.time() - t1), "s")
    print(confusion_matrix(y_test, preds[i]))
    print(accuracy_score(y_test, preds[i]))

### BERT Classifier
Here, we assign, then run (in the next step) the BERT Classification model on our data set.

In [None]:
bert_models = {'y_tox' : BertClassifier(), 
          'y_sev_tox' : BertClassifier(),
          'y_obs' : BertClassifier(),
          'y_threat' : BertClassifier(),
          'y_insult' : BertClassifier(),
          'y_hate' : BertClassifier()}

In [None]:
for i in test_names:
    print(i)
    X_train, X_test, y_train, y_test = train_test_split(X, tests[i])
    t0=time.time()
    bert_models[i].fit(X_train, y_train)
    print("training time:", (time.time() - t0), "s")
    t1=time.time()
    preds[i] = bert_models[i].predict(X_test)
    print("test time:", (time.time() - t1), "s")
    print(confusion_matrix(y_test, preds[i]))
    print(accuracy_score(y_test, preds[i]))