In [40]:
import pandas as pd
import numpy as np
# Model
from sklearn import model_selection, svm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
# Pipeline
from sklearn.pipeline import make_pipeline
from joblib import dump

<h2>Organize Data</h2>

In [41]:
# Load the pre-split train / validation / test CSVs from the resources folder.
# (The single combined file "../res/dataset-final.csv" is no longer read here.)
res_dir = "../res/"
train_df, val_df, test_df = (
    pd.read_csv(res_dir + split_name + "_final.csv")
    for split_name in ("train", "val", "test")
)

# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,text,HATE,NOT,OFFN,PRFN,Race,Religion,Gender,Other,None
0,a vote for max is a vote for the white christi...,0,1,0,0,0,0,0,0,1
1,what a muslim does in the privacy of his own g...,1,0,0,0,1,1,0,0,0
2,it is in some circles but in others it the sam...,1,0,0,1,1,0,0,0,0
3,president trumps opinion get the son of the bi...,0,1,1,1,1,0,0,1,1
4,id2020 is also jabbing refugees with its micro...,0,1,1,0,0,0,0,1,1


In [42]:
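# NOTE(review): this whole cell is commented out and has no effect. The next
# cell derives separate header groups per subtask instead of one combined
# `headers` index. Consider deleting this cell once it is confirmed unused.
# # Organize targets
# # Get target columns
# headers = train_df.columns[1:]
# # Create separate df for targets
# y_train = train_df[headers]
# y_val = val_df[headers]
# y_test = test_df[headers]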

In [43]:
# Group the target columns by position in the training frame:
# columns 1-4 are the subtask 1 and 2 labels, columns 5 onward are subtask 3.
all_columns = train_df.columns
t1a2_headers = all_columns[1:5]
t3_headers = all_columns[5:]

# Echo both groups so the positional slices can be checked by eye.
for caption, header_group in (
    ("subtask 1 and 2 headers:", t1a2_headers),
    ("subtask 3 headers:", t3_headers),
):
    print(caption)
    print(header_group)

subtask 1 and 2 headers:
Index(['HATE', 'NOT', 'OFFN', 'PRFN'], dtype='object')
subtask 3 headers:
Index(['Race', 'Religion', 'Gender', 'Other', 'None'], dtype='object')


<h2>Model Creation</h2>

In [44]:
# Cap the training set at its first rows to keep fitting time manageable.
n_train_samples = 1000
train_df = train_df.head(n_train_samples)

<h4>Subtask 1 and 2</h4>

In [45]:
# Build the (unfitted) pipeline: TF-IDF features feeding one linear-kernel SVC
# per label column, with probability estimates enabled for predict_proba.
tfidf_step = TfidfVectorizer()
ovr_step = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))
ovr_classifier = make_pipeline(tfidf_step, ovr_step)

In [46]:
# Fit the pipeline on the raw text against the subtask 1 and 2 label columns.
train_texts = train_df["text"]
train_labels_t1a2 = train_df[t1a2_headers]
t1a2_classifier = ovr_classifier.fit(X=train_texts, y=train_labels_t1a2)

# Display the fitted pipeline.
t1a2_classifier

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('onevsrestclassifier',
                 OneVsRestClassifier(estimator=SVC(kernel='linear',
                                                   probability=True)))])

In [47]:
# Evaluate the subtask 1 and 2 model on the held-out test split.
predictions_t1a2 = t1a2_classifier.predict(test_df["text"])

# accuracy_score takes (y_true, y_pred) in that order. With a multilabel
# indicator target it reports subset accuracy: a row only counts as correct
# when every label in it matches.
t1a2_accuracy = accuracy_score(test_df[t1a2_headers], predictions_t1a2)
print("SVM subtask 1 and 2 Accuracy Score -> ", t1a2_accuracy)

SVM subtask 1 and 2 Accuracy Score ->  0.29374793251736686


<h5>Export Model</h5>

In [48]:
# Persist the fitted subtask 1 and 2 pipeline under the final output folder.
filename = "t1a2_classifier.joblib.z"
path = f"../out/final/{filename}"
dump(t1a2_classifier, path)

['../out/final/t1a2_classifier.joblib.z']

<h5>Tests</h5>

In [49]:
# Hand-written probe sentences for a qualitative look at the subtask 1 and 2 model.
sample_texts = [
    "Girl bitches should just kill themselves if they don't know how to cook",
    "gay people are mentally ill",
    "Islam people are all terrorists",
    "Asians should just go back to their country, all they do is take our jobs",
    "If you want to make the world a better place, look at yourself and make a change",
]
text1, text2, text3, text4, text5 = sample_texts

for sample in sample_texts:
    batch = [sample]  # the pipeline is called with a one-element list of documents
    # The percentages come from predict_proba and the 0/1 vector from predict;
    # the two are computed by separate calls.
    label_probs = t1a2_classifier.predict_proba(batch)[0]
    label_pred = t1a2_classifier.predict(batch)[0]

    report = [f'INPUT: {sample}']
    report.extend(
        f'  {label_name}: {label_prob*100}'
        for label_name, label_prob in zip(t1a2_headers, label_probs)
    )
    report.append(f'PREDICTION: {label_pred}\n')
    print("\n".join(report))

INPUT: Girl bitches should just kill themselves if they don't know how to cook
  HATE: 18.248094293660724
  NOT: 81.02319320262275
  OFFN: 54.329591641274
  PRFN: 98.71625589022808
PREDICTION: [0 1 1 1]

INPUT: gay people are mentally ill
  HATE: 36.0089302403114
  NOT: 43.04751419328845
  OFFN: 45.20998915667063
  PRFN: 90.58822250520863
PREDICTION: [0 0 0 1]

INPUT: Islam people are all terrorists
  HATE: 68.03683841540081
  NOT: 18.67883919124432
  OFFN: 57.00068719897013
  PRFN: 39.24435256738386
PREDICTION: [1 0 1 1]

INPUT: Asians should just go back to their country, all they do is take our jobs
  HATE: 62.33227732263057
  NOT: 34.017320577359925
  OFFN: 56.107147191067455
  PRFN: 91.24338937306197
PREDICTION: [1 0 1 1]

INPUT: If you want to make the world a better place, look at yourself and make a change
  HATE: 12.841873980423651
  NOT: 62.95716372923243
  OFFN: 43.23584695779348
  PRFN: 73.06755946481432
PREDICTION: [0 1 0 1]



<h4>Subtask 3</h4>

In [50]:
# Create a fresh, unfitted pipeline for subtask 3. Rebinding ovr_classifier to
# a new object leaves the already-fitted subtask 1 and 2 pipeline untouched.
linear_svc = svm.SVC(kernel='linear', probability=True)
ovr_classifier = make_pipeline(
    TfidfVectorizer(),
    OneVsRestClassifier(estimator=linear_svc),
)

In [51]:
# Fit the pipeline on the raw text against the subtask 3 label columns.
train_labels_t3 = train_df[t3_headers]
t3_classifier = ovr_classifier.fit(X=train_df["text"], y=train_labels_t3)

# Display the fitted pipeline.
t3_classifier

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('onevsrestclassifier',
                 OneVsRestClassifier(estimator=SVC(kernel='linear',
                                                   probability=True)))])

In [52]:
# Evaluate the subtask 3 model on the held-out test split.
predictions_t3 = t3_classifier.predict(test_df["text"])

# accuracy_score takes (y_true, y_pred) in that order. With a multilabel
# indicator target it reports subset accuracy: a row only counts as correct
# when every label in it matches.
t3_accuracy = accuracy_score(test_df[t3_headers], predictions_t3)

# The label previously read "subtask 1 and 2" (copy-paste from the earlier
# cell), which mislabelled this score.
print("SVM subtask 3 Accuracy Score -> ", t3_accuracy)

SVM subtask 1 and 2 Accuracy Score ->  0.2911015547469401


<h5>Export Model</h5>

In [53]:
# Persist the fitted subtask 3 pipeline under the final output folder.
filename = "t3_classifier.joblib.z"
path = "../out/final/" + filename
# Dump the subtask 3 model. This cell previously wrote t1a2_classifier to the
# t3 file, so the exported artifact held the wrong model.
dump(t3_classifier, path)

['../out/final/t3_classifier.joblib.z']

<h5>Tests</h5>

In [54]:
# Hand-written probe sentences for a qualitative look at the subtask 3 model.
sample_texts = [
    "Girl bitches should just kill themselves if they don't know how to cook",
    "gay people are mentally ill",
    "Islam people are all terrorists",
    "Asians should just go back to their country, all they do is take our jobs",
    "If you want to make the world a better place, look at yourself and make a change",
]
text1, text2, text3, text4, text5 = sample_texts

for sample in sample_texts:
    single_doc = [sample]  # the pipeline is called with a one-element list of documents
    # The percentages come from predict_proba and the 0/1 vector from predict;
    # the two are computed by separate calls.
    target_probs = t3_classifier.predict_proba(single_doc)[0]
    target_pred = t3_classifier.predict(single_doc)[0]

    print(f'INPUT: {sample}')
    for target_name, target_prob in zip(t3_headers, target_probs):
        print(f'  {target_name}: {target_prob*100}')
    print(f'PREDICTION: {target_pred}\n')

INPUT: Girl bitches should just kill themselves if they don't know how to cook
  Race: 13.753398199546629
  Religion: 2.4152463436371407
  Gender: 30.30845346773612
  Other: 24.76367743910931
  None: 60.49993798393177
PREDICTION: [0 0 0 0 1]

INPUT: gay people are mentally ill
  Race: 8.05611811145
  Religion: 2.4742777459840872
  Gender: 95.73392014453248
  Other: 16.424569844445223
  None: 50.0
PREDICTION: [0 0 1 0 1]

INPUT: Islam people are all terrorists
  Race: 20.703891685243363
  Religion: 99.99987991317106
  Gender: 4.699465434775692
  Other: 21.457742511506726
  None: 64.78257553920089
PREDICTION: [0 1 0 0 1]

INPUT: Asians should just go back to their country, all they do is take our jobs
  Race: 67.48256070679678
  Religion: 24.72148427597631
  Gender: 11.420917082881987
  Other: 41.86325508634739
  None: 44.66537854466756
PREDICTION: [1 0 0 0 0]

INPUT: If you want to make the world a better place, look at yourself and make a change
  Race: 3.3648708232378572
  Religion: 6