# Comment Toxicity Challenge

In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

### Reading in all datasets
Here, we read in our datasets. We're only going to use the training set given by Kaggle, and we'll eventually end up splitting up the data with a 5% testing and 95% training split. 

In [2]:
test_labels = pd.read_csv("test_labels.csv")
train_set = pd.read_csv("train.csv")

### Cleaning up the data
In this step, we clean up the data. We use Python's regex library to do that (it takes in a regular expression and transforms it to the programmer's specifications.) For now, we only really want letters in our dataset - anything else is going to be replaced by a space. 

In [3]:
regex = re.compile('[^a-zA-Z]')
fixed = []
size = len(train_set.index)
size_that_my_computer_can_handle = 2000
starting_point = 5000
for i in range(starting_point, starting_point+size_that_my_computer_can_handle):
    temp = regex.sub(' ', train_set['comment_text'][i])
    temp = temp.lower()
    fixed.append(temp)

In [4]:
X = CountVectorizer().fit_transform(fixed).toarray()
y_tox = train_set.iloc[starting_point:size_that_my_computer_can_handle+starting_point,2].values
y_sev_tox = train_set.iloc[starting_point:size_that_my_computer_can_handle+starting_point,3].values
y_obs = train_set.iloc[starting_point:size_that_my_computer_can_handle+starting_point,4].values
y_threat = train_set.iloc[starting_point:size_that_my_computer_can_handle+starting_point,5].values
y_insult = train_set.iloc[starting_point:size_that_my_computer_can_handle+starting_point,6].values
y_hate = train_set.iloc[starting_point:size_that_my_computer_can_handle+starting_point,7].values

In [5]:
tests = {'y_tox' : y_tox, 
         'y_sev_tox' : y_sev_tox, 
         'y_obs' : y_obs, 
         'y_threat' : y_threat, 
         'y_insult' : y_insult, 
         'y_hate' : y_hate}

models = {'y_tox' : GaussianNB(), 
          'y_sev_tox' : GaussianNB(),
          'y_obs' : GaussianNB(),
          'y_threat' : GaussianNB(),
          'y_insult' : GaussianNB(),
          'y_hate' : GaussianNB()}

preds = {}

test_names = ['y_tox', 'y_sev_tox', 'y_obs', 'y_threat', 'y_insult', 'y_hate']

In [6]:
for i in test_names:
    X_train, X_test, y_train, y_test = train_test_split(X, tests[i])
    models[i].fit(X_train, y_train)
    preds[i] = models[i].predict(X_test)
    print(i)
    print(confusion_matrix(y_test, preds[i]))
    print(accuracy_score(y_test, preds[i]))

y_tox
[[425  32]
 [ 35   8]]
0.866
y_sev_tox
[[492   3]
 [  4   1]]
0.986
y_obs
[[455  19]
 [ 21   5]]
0.92
y_threat
[[497   1]
 [  2   0]]
0.994
y_insult
[[464  10]
 [ 22   4]]
0.936
y_hate
[[495   2]
 [  3   0]]
0.99


In [None]:
X = HashingVectorizer().fit_transform(fixed).toarray()
print(X)

In [None]:
for i in test_names:
    X_train, X_test, y_train, y_test = train_test_split(X, tests[i])
    models[i].fit(X_train, y_train)
    preds[i] = models[i].predict(X_test)
    print(i)
    print(confusion_matrix(y_test, preds[i]))
    print(accuracy_score(y_test, preds[i]))

y_tox
[[430  24]
 [ 30  16]]
0.892


In [63]:
svm_models = {'y_tox' : SVC(gamma = 'scale'), 
          'y_sev_tox' : SVC(gamma = 'scale'),
          'y_obs' : SVC(gamma = 'scale'),
          'y_threat' : SVC(gamma = 'scale'),
          'y_insult' : SVC(gamma = 'scale'),
          'y_hate' : SVC(gamma = 'scale')}

In [66]:
for i in test_names:
    X_train, X_test, y_train, y_test = train_test_split(X, tests[i])
    svm_models[i].fit(X_train, y_train)
    preds[i] = svm_models[i].predict(X_test)
    print(i)
    print(confusion_matrix(y_test, preds[i]))
    print(accuracy_score(y_test, preds[i]))

y_tox
[[173   0]
 [ 26   1]]
0.87
y_sev_tox
[[197   0]
 [  3   0]]
0.985
y_obs
[[190   0]
 [ 10   0]]
0.95
y_threat
[[199   0]
 [  1   0]]
0.995
y_insult
[[184   0]
 [ 16   0]]
0.92
y_hate
[[199   0]
 [  1   0]]
0.995


In [37]:
k_nearest_models = {'y_tox' : KNeighborsClassifier(), 
          'y_sev_tox' : KNeighborsClassifier(),
          'y_obs' : KNeighborsClassifier(),
          'y_threat' : KNeighborsClassifier(),
          'y_insult' : KNeighborsClassifier(),
          'y_hate' : KNeighborsClassifier()}

In [67]:
for i in test_names:
    X_train, X_test, y_train, y_test = train_test_split(X, tests[i])
    k_nearest_models[i].fit(X_train, y_train)
    preds[i] = k_nearest_models[i].predict(X_test)
    print(i)
    print(confusion_matrix(y_test, preds[i]))
    print(accuracy_score(y_test, preds[i]))

y_tox
[[176   5]
 [ 15   4]]
0.9
y_sev_tox
[[195   1]
 [  4   0]]
0.975
y_obs
[[187   2]
 [  9   2]]
0.945
y_threat
[[200]]
1.0
y_insult
[[188   2]
 [ 10   0]]
0.94
y_hate
[[198   0]
 [  2   0]]
0.99


In [35]:
random_forest_models = {'y_tox' : RandomForestClassifier(), 
          'y_sev_tox' : RandomForestClassifier(),
          'y_obs' : RandomForestClassifier(),
          'y_threat' : RandomForestClassifier(),
          'y_insult' : RandomForestClassifier(),
          'y_hate' : RandomForestClassifier()}

In [68]:
for i in test_names:
    X_train, X_test, y_train, y_test = train_test_split(X, tests[i])
    random_forest_models[i].fit(X_train, y_train)
    preds[i] = random_forest_models[i].predict(X_test)
    print(i)
    print(confusion_matrix(y_test, preds[i]))
    print(accuracy_score(y_test, preds[i]))

y_tox
[[85  1]
 [10  4]]
0.89
y_sev_tox
[[99  1]
 [ 0  0]]
0.99
y_obs
[[95  0]
 [ 4  1]]
0.96
y_threat
[[99  0]
 [ 1  0]]
0.99
y_insult
[[94  0]
 [ 5  1]]
0.95
y_hate
[[100]]
1.0


In [41]:
decision_tree_models = {'y_tox' : DecisionTreeClassifier(), 
          'y_sev_tox' : DecisionTreeClassifier(),
          'y_obs' : DecisionTreeClassifier(),
          'y_threat' : DecisionTreeClassifier(),
          'y_insult' : DecisionTreeClassifier(),
          'y_hate' : DecisionTreeClassifier()}

In [70]:
for i in test_names:
    X_train, X_test, y_train, y_test = train_test_split(X, tests[i])
    decision_tree_models[i].fit(X_train, y_train)
    preds[i] = decision_tree_models[i].predict(X_test)
    print(i)
    print(confusion_matrix(y_test, preds[i]))
    print(accuracy_score(y_test, preds[i]))

y_tox
[[453   5]
 [ 28  14]]
0.934
y_sev_tox
[[494   1]
 [  5   0]]
0.988
y_obs
[[472   6]
 [  8  14]]
0.972
y_threat
[[498   0]
 [  2   0]]
0.996
y_insult
[[473   8]
 [  9  10]]
0.966
y_hate
[[493   3]
 [  4   0]]
0.986
