In [30]:
import pandas as pd
import numpy as np
# Model
from sklearn import model_selection, svm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
# Pipeline
from sklearn.pipeline import make_pipeline
from joblib import dump

In [31]:
out_directory = "../out/final-test-wstops/"

<h2>Organize Data</h2>

In [32]:
# Read data
# df = pd.read_csv("../res/dataset-final.csv")

train_df = pd.read_csv("../res/train_final.csv")
val_df = pd.read_csv("../res/val_final.csv")
test_df = pd.read_csv("../res/test_final.csv")
train_df.head()

Unnamed: 0,text,HATE,NOT,OFFN,PRFN,Race,Religion,Gender,Other,None
0,a vote for max is a vote for the white christi...,0,1,0,0,0,0,0,0,1
1,what a muslim does in the privacy of his own g...,1,0,0,0,1,1,0,0,0
2,it is in some circles but in others it the sam...,1,0,0,1,1,0,0,0,0
3,president trumps opinion get the son of the bi...,0,1,1,1,1,0,0,1,1
4,id2020 is also jabbing refugees with its micro...,0,1,1,0,0,0,0,1,1


In [33]:
# # Organize targets
# # Get target columns
# headers = train_df.columns[1:]
# # Create separate df for targets
# y_train = train_df[headers]
# y_val = val_df[headers]
# y_test = test_df[headers]

In [34]:
# Organize targets
# Get target columns
t1a2_headers = train_df.columns[1:5]
print("subtask 1 and 2 headers:")
print(t1a2_headers)
t3_headers = train_df.columns[5:]
print("subtask 3 headers:")
print(t3_headers)

subtask 1 and 2 headers:
Index(['HATE', 'NOT', 'OFFN', 'PRFN'], dtype='object')
subtask 3 headers:
Index(['Race', 'Religion', 'Gender', 'Other', 'None'], dtype='object')


<h2>Model Creation</h2>

In [35]:
# Set number of samples to train with
# train_df = train_df.iloc[:1000]

<h4>Subtask 1 and 2</h4>

In [36]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(stop_words="english"),
    OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))
)

In [37]:
# Train
t1a2_classifier = ovr_classifier.fit(X=train_df["text"], y=train_df[t1a2_headers])
t1a2_classifier

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(stop_words='english')),
                ('onevsrestclassifier',
                 OneVsRestClassifier(estimator=SVC(kernel='linear',
                                                   probability=True)))])

In [38]:
predictions_t1a2 = t1a2_classifier.predict(test_df["text"])
t1a2_score = accuracy_score(predictions_t1a2, test_df[t1a2_headers])
print("SVM subtask 1 and 2 Accuracy Score -> ",t1a2_score)

SVM subtask 1 and 2 Accuracy Score ->  0.3867019517036057


<h5>Export Model</h5>

In [39]:
filename = "t1a2_classifier.joblib.z"
path = out_directory + filename
dump(t1a2_classifier, path)

['../out/final-test-wstops/t1a2_classifier.joblib.z']

<h5>Tests</h5>

In [40]:
text1 = "Girl bitches should just kill themselves if they don't know how to cook"
text2 = "gay people are mentally ill"
text3 = "Islam people are all terrorists"
text4 = "Asians should just go back to their country, all they do is take our jobs"
text5 = "If you want to make the world a better place, look at yourself and make a change"
sample_texts = [text1, text2, text3, text4, text5]

for text in sample_texts:
    arr = [text]
    classes = t1a2_classifier.classes_
    proba = t1a2_classifier.predict_proba(arr)[0]
    pred = t1a2_classifier.predict(arr)[0]

    headers = t1a2_headers

    print(f'INPUT: {text}')
    for i,e in enumerate(classes):
        print(f'  {headers[i]}: {proba[i]*100}')
    print(f'PREDICTION: {pred}\n')

INPUT: Girl bitches should just kill themselves if they don't know how to cook
  HATE: 7.961345706422309
  NOT: 83.03971793546462
  OFFN: 66.04410454888459
  PRFN: 99.9985851945867
PREDICTION: [0 1 1 1]

INPUT: gay people are mentally ill
  HATE: 45.37183322780173
  NOT: 81.03982181597502
  OFFN: 65.54514882420658
  PRFN: 99.35757293977991
PREDICTION: [0 1 1 1]

INPUT: Islam people are all terrorists
  HATE: 55.17852285076482
  NOT: 24.128901966926094
  OFFN: 59.234728396696134
  PRFN: 20.075699688685493
PREDICTION: [1 0 1 0]

INPUT: Asians should just go back to their country, all they do is take our jobs
  HATE: 11.786232239561315
  NOT: 86.27845647033676
  OFFN: 64.80451313724834
  PRFN: 40.72166070387455
PREDICTION: [0 1 1 1]

INPUT: If you want to make the world a better place, look at yourself and make a change
  HATE: 23.378923680602647
  NOT: 69.41768152928648
  OFFN: 46.01961686839321
  PRFN: 59.17939784401525
PREDICTION: [0 1 0 1]



<h4>Subtask 3</h4>

In [41]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(stop_words="english"),
    OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))
)

In [42]:
t3_classifier = ovr_classifier.fit(X=train_df["text"], y=train_df[t3_headers])
t3_classifier

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(stop_words='english')),
                ('onevsrestclassifier',
                 OneVsRestClassifier(estimator=SVC(kernel='linear',
                                                   probability=True)))])

In [43]:
predictions_t3 = t3_classifier.predict(test_df["text"])
t3_score = accuracy_score(predictions_t3, test_df[t3_headers])
print("SVM subtask 1 and 2 Accuracy Score -> ",t3_score)

SVM subtask 1 and 2 Accuracy Score ->  0.3797552100562355


<h5>Export Model</h5>

In [44]:
filename = "t3_classifier.joblib.z"
path = out_directory + filename
dump(t1a2_classifier, path)

['../out/final-test-wstops/t3_classifier.joblib.z']

<h5>Tests</h5>

In [45]:
text1 = "Girl bitches should just kill themselves if they don't know how to cook"
text2 = "gay people are mentally ill"
text3 = "Islam people are all terrorists"
text4 = "Asians should just go back to their country, all they do is take our jobs"
text5 = "If you want to make the world a better place, look at yourself and make a change"
sample_texts = [text1, text2, text3, text4, text5]

for text in sample_texts:
    arr = [text]
    classes = t3_classifier.classes_
    proba = t3_classifier.predict_proba(arr)[0]
    pred = t3_classifier.predict(arr)[0]

    headers = t3_headers

    print(f'INPUT: {text}')
    for i,e in enumerate(classes):
        print(f'  {headers[i]}: {proba[i]*100}')
    print(f'PREDICTION: {pred}\n')

INPUT: Girl bitches should just kill themselves if they don't know how to cook
  Race: 11.800675279886848
  Religion: 3.5641463001496634
  Gender: 87.44968728861734
  Other: 8.674922850136216
  None: 73.02662157148518
PREDICTION: [0 0 1 0 1]

INPUT: gay people are mentally ill
  Race: 4.356550734993422
  Religion: 3.3727838097378116
  Gender: 94.19634397999565
  Other: 28.117515687184262
  None: 58.324218556363306
PREDICTION: [0 0 1 0 1]

INPUT: Islam people are all terrorists
  Race: 8.383686412859312
  Religion: 99.99999651956684
  Gender: 5.751269708066402
  Other: 7.10722500042341
  None: 21.972063516762166
PREDICTION: [0 1 0 0 0]

INPUT: Asians should just go back to their country, all they do is take our jobs
  Race: 93.93093354929599
  Religion: 2.7023198363116245
  Gender: 6.97075537716508
  Other: 18.529375975418663
  None: 86.70107789846753
PREDICTION: [1 0 0 0 1]

INPUT: If you want to make the world a better place, look at yourself and make a change
  Race: 14.5712940972077

<h3>Subtask 1 only</h3>

In [46]:
t1_headers = t1a2_headers[:-1]
t1_headers

Index(['HATE', 'NOT', 'OFFN'], dtype='object')

In [47]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(stop_words="english"),
    OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))
)

In [48]:
t1_classifier = ovr_classifier.fit(X=train_df["text"], y=train_df[t1_headers])
t1_classifier

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(stop_words='english')),
                ('onevsrestclassifier',
                 OneVsRestClassifier(estimator=SVC(kernel='linear',
                                                   probability=True)))])

In [49]:
predictions_t1 = t1_classifier.predict(test_df["text"])
t1_score = accuracy_score(predictions_t1, test_df[t1_headers])
print("SVM subtask 1 Accuracy Score -> ",t1_score)

SVM subtask 1 Accuracy Score ->  0.4343367515712868


<h5>Export Model</h5>

In [50]:
filename = "t1_classifier.joblib.z"
path = out_directory + filename
dump(t1_classifier, path)

['../out/final-test-wstops/t1_classifier.joblib.z']

<h5>Tests</h5>

In [51]:
text1 = "Girl bitches should just kill themselves if they don't know how to cook"
text2 = "gay people are mentally ill"
text3 = "Islam people are all terrorists"
text4 = "Asians should just go back to their country, all they do is take our jobs"
text5 = "If you want to make the world a better place, look at yourself and make a change"
sample_texts = [text1, text2, text3, text4, text5]

for text in sample_texts:
    arr = [text]
    classes = t1_classifier.classes_
    proba = t1_classifier.predict_proba(arr)[0]
    pred = t1_classifier.predict(arr)[0]

    headers = t1_headers

    print(f'INPUT: {text}')
    for i,e in enumerate(classes):
        print(f'  {headers[i]}: {proba[i]*100}')
    print(f'PREDICTION: {pred}\n')

INPUT: Girl bitches should just kill themselves if they don't know how to cook
  HATE: 7.916579800995889
  NOT: 83.39522224184441
  OFFN: 65.97628289410085
PREDICTION: [0 1 1]

INPUT: gay people are mentally ill
  HATE: 45.21366070823042
  NOT: 81.39682304084529
  OFFN: 65.47906951918822
PREDICTION: [0 1 1]

INPUT: Islam people are all terrorists
  HATE: 55.018204706132636
  NOT: 23.844232214261186
  OFFN: 59.19200680971124
PREDICTION: [1 0 1]

INPUT: Asians should just go back to their country, all they do is take our jobs
  HATE: 11.72214774134788
  NOT: 86.6206310366791
  OFFN: 64.73605661607574
PREDICTION: [0 1 1]

INPUT: If you want to make the world a better place, look at yourself and make a change
  HATE: 23.266628975378804
  NOT: 69.71144356710232
  OFFN: 46.03518772953804
PREDICTION: [0 1 0]



<h4>Subtask 2</h4>

In [52]:
t2_headers = ["PRFN"]

In [53]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(stop_words="english"),
    svm.SVC(kernel='linear', probability=True)
)

In [54]:
t2_classifier = ovr_classifier.fit(X=train_df["text"], y=train_df[t2_headers])
t2_classifier

  y = column_or_1d(y, warn=True)


Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(stop_words='english')),
                ('svc', SVC(kernel='linear', probability=True))])

In [55]:
predictions_t2 = t2_classifier.predict(test_df["text"])
t2_score = accuracy_score(predictions_t2, test_df[t2_headers])
print("SVM subtask 2 Accuracy Score -> ",t2_score)

SVM subtask 2 Accuracy Score ->  0.8825669864373139


<h5>Export Model</h5>

In [56]:
filename = "t2_classifier.joblib.z"
path = out_directory + filename
dump(t1_classifier, path)

['../out/final-test-wstops/t2_classifier.joblib.z']

<h5>Tests</h5>

In [57]:
text1 = "Girl bitches should just kill themselves if they don't know how to cook"
text2 = "gay people are mentally ill"
text3 = "Islam people are all terrorists"
text4 = "Asians should just go back to their country, all they do is take our jobs"
text5 = "If you want to make the world a better place, look at yourself and make a change"
sample_texts = [text1, text2, text3, text4, text5]

for text in sample_texts:
    arr = [text]
    classes = t2_classifier.classes_
    proba = t2_classifier.predict_proba(arr)[0]
    pred = t2_classifier.predict(arr)[0]

    headers = classes

    print(f'INPUT: {text}')
    for i,e in enumerate(classes):
        print(f'  {headers[i]}: {proba[i]*100}')
    print(f'PREDICTION: {pred}\n')

INPUT: Girl bitches should just kill themselves if they don't know how to cook
  0: 0.0013130902082559937
  1: 99.99868690979174
PREDICTION: 1

INPUT: gay people are mentally ill
  0: 0.6231605545774854
  1: 99.37683944542249
PREDICTION: 1

INPUT: Islam people are all terrorists
  0: 80.05699150580463
  1: 19.94300849419535
PREDICTION: 0

INPUT: Asians should just go back to their country, all they do is take our jobs
  0: 59.33053916376998
  1: 40.66946083623003
PREDICTION: 1

INPUT: If you want to make the world a better place, look at yourself and make a change
  0: 40.762983643491175
  1: 59.237016356508846
PREDICTION: 1



In [58]:
scores = [t1a2_score, t3_score, t1_score, t2_score]
print("with stop words accuracies:")
print(scores)

with stop words accuracies:
[0.3867019517036057, 0.3797552100562355, 0.4343367515712868, 0.8825669864373139]
