In [30]:
import pandas as pd
import numpy as np
# Model
from sklearn import model_selection, svm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
# Pipeline
from sklearn.pipeline import make_pipeline
from joblib import dump

In [31]:
out_directory = "../out/final-test-nostops/"

<h2>Organize Data</h2>

In [32]:
# Read data
# df = pd.read_csv("../res/dataset-final.csv")

train_df = pd.read_csv("../res/train_final.csv")
val_df = pd.read_csv("../res/val_final.csv")
test_df = pd.read_csv("../res/test_final.csv")
train_df.head()

Unnamed: 0,text,HATE,NOT,OFFN,PRFN,Race,Religion,Gender,Other,None
0,a vote for max is a vote for the white christi...,0,1,0,0,0,0,0,0,1
1,what a muslim does in the privacy of his own g...,1,0,0,0,1,1,0,0,0
2,it is in some circles but in others it the sam...,1,0,0,1,1,0,0,0,0
3,president trumps opinion get the son of the bi...,0,1,1,1,1,0,0,1,1
4,id2020 is also jabbing refugees with its micro...,0,1,1,0,0,0,0,1,1


In [33]:
# # Organize targets
# # Get target columns
# headers = train_df.columns[1:]
# # Create separate df for targets
# y_train = train_df[headers]
# y_val = val_df[headers]
# y_test = test_df[headers]

In [34]:
# Organize targets
# Get target columns
t1a2_headers = train_df.columns[1:5]
print("subtask 1 and 2 headers:")
print(t1a2_headers)
t3_headers = train_df.columns[5:]
print("subtask 3 headers:")
print(t3_headers)

subtask 1 and 2 headers:
Index(['HATE', 'NOT', 'OFFN', 'PRFN'], dtype='object')
subtask 3 headers:
Index(['Race', 'Religion', 'Gender', 'Other', 'None'], dtype='object')


<h2>Model Creation</h2>

In [35]:
# Set number of samples to train with
# train_df = train_df.iloc[:1000]

<h4>Subtask 1 and 2</h4>

In [36]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(),
    OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))
)

In [37]:
# Train
t1a2_classifier = ovr_classifier.fit(X=train_df["text"], y=train_df[t1a2_headers])
t1a2_classifier

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('onevsrestclassifier',
                 OneVsRestClassifier(estimator=SVC(kernel='linear',
                                                   probability=True)))])

In [38]:
predictions_t1a2 = t1a2_classifier.predict(test_df["text"])
t1a2_score = accuracy_score(predictions_t1a2, test_df[t1a2_headers])
print("SVM subtask 1 and 2 Accuracy Score -> ",t1a2_score)

SVM subtask 1 and 2 Accuracy Score ->  0.39034072113794244


<h5>Export Model</h5>

In [39]:
filename = "t1a2_classifier.joblib.z"
path = out_directory + filename
dump(t1a2_classifier, path)

['../out/final-test-nostops/t1a2_classifier.joblib.z']

<h5>Tests</h5>

In [40]:
text1 = "Girl bitches should just kill themselves if they don't know how to cook"
text2 = "gay people are mentally ill"
text3 = "Islam people are all terrorists"
text4 = "Asians should just go back to their country, all they do is take our jobs"
text5 = "If you want to make the world a better place, look at yourself and make a change"
sample_texts = [text1, text2, text3, text4, text5]

for text in sample_texts:
    arr = [text]
    classes = t1a2_classifier.classes_
    proba = t1a2_classifier.predict_proba(arr)[0]
    pred = t1a2_classifier.predict(arr)[0]

    headers = t1a2_headers

    print(f'INPUT: {text}')
    for i,e in enumerate(classes):
        print(f'  {headers[i]}: {proba[i]*100}')
    print(f'PREDICTION: {pred}\n')

INPUT: Girl bitches should just kill themselves if they don't know how to cook
  HATE: 19.34094222187061
  NOT: 61.779271912698796
  OFFN: 69.92316938604924
  PRFN: 99.9999999999709
PREDICTION: [0 1 1 1]

INPUT: gay people are mentally ill
  HATE: 48.67992557006854
  NOT: 73.27263032828809
  OFFN: 72.41146445440374
  PRFN: 98.96863239469876
PREDICTION: [0 1 1 1]

INPUT: Islam people are all terrorists
  HATE: 49.18705794258973
  NOT: 27.263204849894933
  OFFN: 50.0
  PRFN: 16.00543754801794
PREDICTION: [0 0 0 0]

INPUT: Asians should just go back to their country, all they do is take our jobs
  HATE: 37.00219081270031
  NOT: 47.00754411722002
  OFFN: 65.06227043322387
  PRFN: 98.1638015170598
PREDICTION: [0 0 1 1]

INPUT: If you want to make the world a better place, look at yourself and make a change
  HATE: 17.289782571314188
  NOT: 82.10225517172674
  OFFN: 43.754278921580706
  PRFN: 24.185374528516014
PREDICTION: [0 1 0 0]



<h4>Subtask 3</h4>

In [41]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(),
    OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))
)

In [42]:
t3_classifier = ovr_classifier.fit(X=train_df["text"], y=train_df[t3_headers])
t3_classifier

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('onevsrestclassifier',
                 OneVsRestClassifier(estimator=SVC(kernel='linear',
                                                   probability=True)))])

In [43]:
predictions_t3 = t3_classifier.predict(test_df["text"])
t3_score = accuracy_score(predictions_t3, test_df[t3_headers])
print("SVM subtask 1 and 2 Accuracy Score -> ",t3_score)

SVM subtask 1 and 2 Accuracy Score ->  0.3833939794905723


<h5>Export Model</h5>

In [44]:
filename = "t3_classifier.joblib.z"
path = out_directory + filename
dump(t1a2_classifier, path)

['../out/final-test-nostops/t3_classifier.joblib.z']

<h5>Tests</h5>

In [45]:
text1 = "Girl bitches should just kill themselves if they don't know how to cook"
text2 = "gay people are mentally ill"
text3 = "Islam people are all terrorists"
text4 = "Asians should just go back to their country, all they do is take our jobs"
text5 = "If you want to make the world a better place, look at yourself and make a change"
sample_texts = [text1, text2, text3, text4, text5]

for text in sample_texts:
    arr = [text]
    classes = t3_classifier.classes_
    proba = t3_classifier.predict_proba(arr)[0]
    pred = t3_classifier.predict(arr)[0]

    headers = t3_headers

    print(f'INPUT: {text}')
    for i,e in enumerate(classes):
        print(f'  {headers[i]}: {proba[i]*100}')
    print(f'PREDICTION: {pred}\n')

INPUT: Girl bitches should just kill themselves if they don't know how to cook
  Race: 22.068550709828425
  Religion: 4.820036569733136
  Gender: 85.47144894867743
  Other: 11.41074283651475
  None: 47.015173708505664
PREDICTION: [0 0 1 0 0]

INPUT: gay people are mentally ill
  Race: 4.0901712430497525
  Religion: 2.951851797444794
  Gender: 96.84546869334537
  Other: 30.674459500369206
  None: 50.81909326730175
PREDICTION: [0 0 1 0 1]

INPUT: Islam people are all terrorists
  Race: 6.974315012089632
  Religion: 99.99999545306156
  Gender: 5.477769694373715
  Other: 6.6962736243897005
  None: 22.26416130048259
PREDICTION: [0 1 0 0 0]

INPUT: Asians should just go back to their country, all they do is take our jobs
  Race: 76.6809546094667
  Religion: 8.735921432654989
  Gender: 7.708548770355665
  Other: 22.191840057143565
  None: 65.1076391680706
PREDICTION: [1 0 0 0 1]

INPUT: If you want to make the world a better place, look at yourself and make a change
  Race: 10.30940685384564


<h3>Subtask 1 only</h3>

In [46]:
t1_headers = t1a2_headers[:-1]
t1_headers

Index(['HATE', 'NOT', 'OFFN'], dtype='object')

In [47]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(),
    OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))
)

In [48]:
t1_classifier = ovr_classifier.fit(X=train_df["text"], y=train_df[t1_headers])
t1_classifier

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('onevsrestclassifier',
                 OneVsRestClassifier(estimator=SVC(kernel='linear',
                                                   probability=True)))])

In [49]:
predictions_t1 = t1_classifier.predict(test_df["text"])
t1_score = accuracy_score(predictions_t1, test_df[t1_headers])
print("SVM subtask 1 Accuracy Score -> ",t1_score)

SVM subtask 1 Accuracy Score ->  0.42904399603043336


<h5>Export Model</h5>

In [50]:
filename = "t1_classifier.joblib.z"
path = out_directory + filename
dump(t1_classifier, path)

['../out/final-test-nostops/t1_classifier.joblib.z']

<h5>Tests</h5>

In [51]:
text1 = "Girl bitches should just kill themselves if they don't know how to cook"
text2 = "gay people are mentally ill"
text3 = "Islam people are all terrorists"
text4 = "Asians should just go back to their country, all they do is take our jobs"
text5 = "If you want to make the world a better place, look at yourself and make a change"
sample_texts = [text1, text2, text3, text4, text5]

for text in sample_texts:
    arr = [text]
    classes = t1_classifier.classes_
    proba = t1_classifier.predict_proba(arr)[0]
    pred = t1_classifier.predict(arr)[0]

    headers = t1_headers

    print(f'INPUT: {text}')
    for i,e in enumerate(classes):
        print(f'  {headers[i]}: {proba[i]*100}')
    print(f'PREDICTION: {pred}\n')

INPUT: Girl bitches should just kill themselves if they don't know how to cook
  HATE: 19.35464123582532
  NOT: 61.83813268828543
  OFFN: 70.42833729740113
PREDICTION: [0 1 1]

INPUT: gay people are mentally ill
  HATE: 48.69173745923237
  NOT: 73.4120027372516
  OFFN: 72.97846106693707
PREDICTION: [0 1 1]

INPUT: Islam people are all terrorists
  HATE: 49.198726142226135
  NOT: 27.053304969725435
  OFFN: 50.0
PREDICTION: [0 0 0]

INPUT: Asians should just go back to their country, all they do is take our jobs
  HATE: 37.015984163785724
  NOT: 46.93459711988959
  OFFN: 65.42602146831305
PREDICTION: [0 0 1]

INPUT: If you want to make the world a better place, look at yourself and make a change
  HATE: 17.302918773454596
  NOT: 82.27405626641668
  OFFN: 43.36786996358617
PREDICTION: [0 1 0]



<h4>Subtask 2</h4>

In [52]:
t2_headers = ["PRFN"]

In [53]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(),
    svm.SVC(kernel='linear', probability=True)
)

In [54]:
t2_classifier = ovr_classifier.fit(X=train_df["text"], y=train_df[t2_headers])
t2_classifier

  y = column_or_1d(y, warn=True)


Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('svc', SVC(kernel='linear', probability=True))])

In [55]:
predictions_t2 = t2_classifier.predict(test_df["text"])
t2_score = accuracy_score(predictions_t2, test_df[t2_headers])
print("SVM subtask 2 Accuracy Score -> ",t2_score)

SVM subtask 2 Accuracy Score ->  0.9047304002646378


<h5>Export Model</h5>

In [56]:
filename = "t2_classifier.joblib.z"
path = out_directory + filename
dump(t1_classifier, path)

['../out/final-test-nostops/t2_classifier.joblib.z']

<h5>Tests</h5>

In [57]:
text1 = "Girl bitches should just kill themselves if they don't know how to cook"
text2 = "gay people are mentally ill"
text3 = "Islam people are all terrorists"
text4 = "Asians should just go back to their country, all they do is take our jobs"
text5 = "If you want to make the world a better place, look at yourself and make a change"
sample_texts = [text1, text2, text3, text4, text5]

for text in sample_texts:
    arr = [text]
    classes = t2_classifier.classes_
    proba = t2_classifier.predict_proba(arr)[0]
    pred = t2_classifier.predict(arr)[0]

    headers = classes

    print(f'INPUT: {text}')
    for i,e in enumerate(classes):
        print(f'  {headers[i]}: {proba[i]*100}')
    print(f'PREDICTION: {pred}\n')

INPUT: Girl bitches should just kill themselves if they don't know how to cook
  0: 1.3471971910918366e-11
  1: 99.99999999998653
PREDICTION: 1

INPUT: gay people are mentally ill
  0: 0.9356237404091984
  1: 99.06437625959079
PREDICTION: 1

INPUT: Islam people are all terrorists
  0: 84.95115837014046
  1: 15.048841629859533
PREDICTION: 0

INPUT: Asians should just go back to their country, all they do is take our jobs
  0: 1.693657396981723
  1: 98.30634260301827
PREDICTION: 1

INPUT: If you want to make the world a better place, look at yourself and make a change
  0: 76.8756405306856
  1: 23.124359469314406
PREDICTION: 0



In [58]:
scores = [t1a2_score, t3_score, t1_score, t2_score]
print("no stop words accuracies:")
print(scores)

no stop words accuracies:
[0.39034072113794244, 0.3833939794905723, 0.42904399603043336, 0.9047304002646378]
