In [39]:
import pandas as pd
import numpy as np
# Model
from sklearn import model_selection, svm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
# Pipeline
from sklearn.pipeline import make_pipeline
from joblib import dump

In [40]:
# Directory the fitted classifiers are exported to. The export cells build
# their paths by plain string concatenation (out_directory + filename), so the
# trailing "/" must be kept.
out_directory = "../out/final-test-nostops/"

<h2>Organize Data</h2>

In [41]:
# Load the pre-split train / validation / test CSVs in a single pass.
train_df, val_df, test_df = map(
    pd.read_csv,
    (
        "../res/train_final.csv",
        "../res/val_final.csv",
        "../res/test_final.csv",
    ),
)
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,text,HATE,NOT,OFFN,PRFN,Race,Religion,Gender,Other,None
0,a vote for max is a vote for the white christi...,0,1,0,0,0,0,0,0,1
1,what a muslim does in the privacy of his own g...,1,0,0,0,1,1,0,0,0
2,it is in some circles but in others it the sam...,1,0,0,1,1,0,0,0,0
3,president trumps opinion get the son of the bi...,0,1,1,1,1,0,0,1,1
4,id2020 is also jabbing refugees with its micro...,0,1,1,0,0,0,0,1,1


In [42]:
# # Organize targets
# # Get target columns
# headers = train_df.columns[1:]
# # Create separate df for targets
# y_train = train_df[headers]
# y_val = val_df[headers]
# y_test = test_df[headers]

In [43]:
# Split the label columns by subtask using their position in the frame:
# columns 1-4 are the subtask 1 and 2 labels, column 5 onward is subtask 3.
t1a2_headers = train_df.columns[1:5]
t3_headers = train_df.columns[5:]

for title, columns in (
    ("subtask 1 and 2 headers:", t1a2_headers),
    ("subtask 3 headers:", t3_headers),
):
    print(title)
    print(columns)

subtask 1 and 2 headers:
Index(['HATE', 'NOT', 'OFFN', 'PRFN'], dtype='object')
subtask 3 headers:
Index(['Race', 'Religion', 'Gender', 'Other', 'None'], dtype='object')


<h2>Model Creation</h2>

In [44]:
# Set number of samples to train with
# train_df = train_df.iloc[:1000]

<h4>Subtask 1 and 2</h4>

In [45]:
# Build the model: TF-IDF features feeding a one-vs-rest wrapper that trains
# one linear-kernel SVC per label column. probability=True is required so that
# predict_proba is available in the spot-check cell.
ovr_classifier = make_pipeline(
    TfidfVectorizer(),
    OneVsRestClassifier(estimator=svm.SVC(probability=True, kernel='linear')),
)

In [46]:
# Fit on the raw text column against the four subtask 1 and 2 label columns.
train_text = train_df["text"]
train_labels_t1a2 = train_df[t1a2_headers]
t1a2_classifier = ovr_classifier.fit(train_text, train_labels_t1a2)
# Display the fitted pipeline.
t1a2_classifier

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('onevsrestclassifier',
                 OneVsRestClassifier(estimator=SVC(kernel='linear',
                                                   probability=True)))])

In [47]:
# Evaluate on the held-out test split.
# accuracy_score's signature is (y_true, y_pred), so the ground truth goes
# first. With multilabel targets the metric is subset accuracy: a sample only
# counts as correct when every one of its labels matches.
predictions_t1a2 = t1a2_classifier.predict(test_df["text"])
t1a2_score = accuracy_score(test_df[t1a2_headers], predictions_t1a2)
print("SVM subtask 1 and 2 Accuracy Score -> ",t1a2_score)

SVM subtask 1 and 2 Accuracy Score ->  0.29374793251736686


<h5>Export Model</h5>

In [48]:
# Persist the fitted subtask 1 and 2 pipeline in the output directory.
# out_directory already ends with "/", so the file name is appended directly.
filename = "t1a2_classifier.joblib.z"
path = f"{out_directory}{filename}"
dump(t1a2_classifier, path)

['../out/final-test-nostops/t1a2_classifier.joblib.z']

<h5>Tests</h5>

In [49]:
text1 = "Girl bitches should just kill themselves if they don't know how to cook"
text2 = "gay people are mentally ill"
text3 = "Islam people are all terrorists"
text4 = "Asians should just go back to their country, all they do is take our jobs"
text5 = "If you want to make the world a better place, look at yourself and make a change"
sample_texts = [text1, text2, text3, text4, text5]

# Spot-check the subtask 1 and 2 model on the hand-written samples.
# Each label has its own binary classifier, so the percentages are per label
# and are not expected to sum to 100.
headers = t1a2_headers

for text in sample_texts:
    single = [text]
    label_proba = t1a2_classifier.predict_proba(single)[0]
    predicted = t1a2_classifier.predict(single)[0]

    print(f'INPUT: {text}')
    for label, p in zip(headers, label_proba):
        print(f'  {label}: {p*100}')
    print(f'PREDICTION: {predicted}\n')

INPUT: Girl bitches should just kill themselves if they don't know how to cook
  HATE: 16.688951269961656
  NOT: 80.4014103515127
  OFFN: 53.94776045476994
  PRFN: 98.96074370607147
PREDICTION: [0 1 1 1]

INPUT: gay people are mentally ill
  HATE: 35.35444396389673
  NOT: 43.59879339763791
  OFFN: 44.836467577003546
  PRFN: 90.90679558369877
PREDICTION: [0 0 0 1]

INPUT: Islam people are all terrorists
  HATE: 69.84350145014395
  NOT: 19.69572802884418
  OFFN: 56.618592796242865
  PRFN: 34.97268521214012
PREDICTION: [1 0 1 1]

INPUT: Asians should just go back to their country, all they do is take our jobs
  HATE: 63.89309203143735
  NOT: 34.80352651865118
  OFFN: 55.72525595857244
  PRFN: 91.59353783121483
PREDICTION: [1 0 1 1]

INPUT: If you want to make the world a better place, look at yourself and make a change
  HATE: 11.317672252060964
  NOT: 62.799544666276105
  OFFN: 42.86882436898312
  PRFN: 71.68835567985627
PREDICTION: [0 1 0 1]



<h4>Subtask 3</h4>

In [50]:
# Fresh, unfitted pipeline for subtask 3: TF-IDF features into a one-vs-rest
# wrapper around a linear-kernel SVC with probability estimates enabled.
ovr_classifier = make_pipeline(
    TfidfVectorizer(),
    OneVsRestClassifier(estimator=svm.SVC(probability=True, kernel='linear')),
)

In [51]:
# Fit on the raw text column against the five subtask 3 label columns.
train_labels_t3 = train_df[t3_headers]
t3_classifier = ovr_classifier.fit(train_df["text"], train_labels_t3)
# Display the fitted pipeline.
t3_classifier

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('onevsrestclassifier',
                 OneVsRestClassifier(estimator=SVC(kernel='linear',
                                                   probability=True)))])

In [52]:
# Evaluate the subtask 3 model on the held-out test split.
# accuracy_score's signature is (y_true, y_pred), so the ground truth goes
# first. With multilabel targets the metric is subset accuracy: a sample only
# counts as correct when every one of its labels matches.
predictions_t3 = t3_classifier.predict(test_df["text"])
t3_score = accuracy_score(test_df[t3_headers], predictions_t3)
# The label must name subtask 3; it used to repeat the "subtask 1 and 2" text.
print("SVM subtask 3 Accuracy Score -> ",t3_score)

SVM subtask 1 and 2 Accuracy Score ->  0.2911015547469401


<h5>Export Model</h5>

In [53]:
# Persist the fitted subtask 3 pipeline in the output directory.
filename = "t3_classifier.joblib.z"
path = out_directory + filename
# Dump t3_classifier so the file contains the model its name advertises
# (dumping t1a2_classifier here would save the wrong model under the t3 name).
dump(t3_classifier, path)

['../out/final-test-nostops/t3_classifier.joblib.z']

<h5>Tests</h5>

In [54]:
text1 = "Girl bitches should just kill themselves if they don't know how to cook"
text2 = "gay people are mentally ill"
text3 = "Islam people are all terrorists"
text4 = "Asians should just go back to their country, all they do is take our jobs"
text5 = "If you want to make the world a better place, look at yourself and make a change"
sample_texts = [text1, text2, text3, text4, text5]

# Spot-check the subtask 3 model on the hand-written samples.
# Each label has its own binary classifier, so the percentages are per label
# and are not expected to sum to 100.
headers = t3_headers

for text in sample_texts:
    single = [text]
    label_proba = t3_classifier.predict_proba(single)[0]
    predicted = t3_classifier.predict(single)[0]

    print(f'INPUT: {text}')
    for label, p in zip(headers, label_proba):
        print(f'  {label}: {p*100}')
    print(f'PREDICTION: {predicted}\n')

INPUT: Girl bitches should just kill themselves if they don't know how to cook
  Race: 14.43491640860422
  Religion: 2.89825195805383
  Gender: 29.797242129469804
  Other: 24.594931028273148
  None: 59.569613163122256
PREDICTION: [0 0 0 0 1]

INPUT: gay people are mentally ill
  Race: 8.622738436952131
  Religion: 2.9648306949244514
  Gender: 95.4941147416722
  Other: 15.380402162795507
  None: 50.0
PREDICTION: [0 0 1 0 1]

INPUT: Islam people are all terrorists
  Race: 21.395105523155124
  Religion: 99.99970388068407
  Gender: 4.6727686001530095
  Other: 20.8901010627742
  None: 63.64659491706336
PREDICTION: [0 1 0 0 1]

INPUT: Asians should just go back to their country, all they do is take our jobs
  Race: 67.03743308152758
  Religion: 25.364683811918837
  Gender: 11.278464823594671
  Other: 44.18450648660156
  None: 45.77299252607448
PREDICTION: [1 0 0 0 0]

INPUT: If you want to make the world a better place, look at yourself and make a change
  Race: 3.723876677582325
  Religion:

<h4>Subtask 1 only</h4>

In [55]:
# Subtask 1 uses every subtask 1 and 2 label except the last one (PRFN),
# leaving HATE, NOT and OFFN.
t1_headers = t1a2_headers[:-1]
# Display the selected label columns.
t1_headers

Index(['HATE', 'NOT', 'OFFN'], dtype='object')

In [56]:
# Fresh, unfitted pipeline for subtask 1: TF-IDF features into a one-vs-rest
# wrapper around a linear-kernel SVC with probability estimates enabled.
ovr_classifier = make_pipeline(
    TfidfVectorizer(),
    OneVsRestClassifier(estimator=svm.SVC(probability=True, kernel='linear')),
)

In [57]:
# Fit on the raw text column against the three subtask 1 label columns.
train_labels_t1 = train_df[t1_headers]
t1_classifier = ovr_classifier.fit(train_df["text"], train_labels_t1)
# Display the fitted pipeline.
t1_classifier

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('onevsrestclassifier',
                 OneVsRestClassifier(estimator=SVC(kernel='linear',
                                                   probability=True)))])

In [58]:
# Evaluate the subtask 1 model on the held-out test split.
# accuracy_score's signature is (y_true, y_pred), so the ground truth goes
# first. With multilabel targets the metric is subset accuracy: a sample only
# counts as correct when every one of its labels matches.
predictions_t1 = t1_classifier.predict(test_df["text"])
t1_score = accuracy_score(test_df[t1_headers], predictions_t1)
print("SVM subtask 1 Accuracy Score -> ",t1_score)

SVM subtask 1 Accuracy Score ->  0.33245120740985773


<h5>Export Model</h5>

In [59]:
# Persist the fitted subtask 1 pipeline in the output directory.
# out_directory already ends with "/", so the file name is appended directly.
filename = "t1_classifier.joblib.z"
path = f"{out_directory}{filename}"
dump(t1_classifier, path)

['../out/final-test-nostops/t1_classifier.joblib.z']

<h5>Tests</h5>

In [60]:
text1 = "Girl bitches should just kill themselves if they don't know how to cook"
text2 = "gay people are mentally ill"
text3 = "Islam people are all terrorists"
text4 = "Asians should just go back to their country, all they do is take our jobs"
text5 = "If you want to make the world a better place, look at yourself and make a change"
sample_texts = [text1, text2, text3, text4, text5]

# Spot-check the subtask 1 model on the hand-written samples.
# Each label has its own binary classifier, so the percentages are per label
# and are not expected to sum to 100.
headers = t1_headers

for text in sample_texts:
    single = [text]
    label_proba = t1_classifier.predict_proba(single)[0]
    predicted = t1_classifier.predict(single)[0]

    print(f'INPUT: {text}')
    for label, p in zip(headers, label_proba):
        print(f'  {label}: {p*100}')
    print(f'PREDICTION: {predicted}\n')

INPUT: Girl bitches should just kill themselves if they don't know how to cook
  HATE: 17.851060316972823
  NOT: 80.7070080463895
  OFFN: 53.84108409922527
PREDICTION: [0 1 1]

INPUT: gay people are mentally ill
  HATE: 35.73990999602871
  NOT: 42.672201913973105
  OFFN: 45.78756288617723
PREDICTION: [0 0 0]

INPUT: Islam people are all terrorists
  HATE: 68.23297036324374
  NOT: 18.496165273659003
  OFFN: 56.20019420206632
PREDICTION: [1 0 1]

INPUT: Asians should just go back to their country, all they do is take our jobs
  HATE: 62.4548260720699
  NOT: 33.71503135004421
  OFFN: 55.411081648067025
PREDICTION: [1 0 1]

INPUT: If you want to make the world a better place, look at yourself and make a change
  HATE: 12.47209568472437
  NOT: 62.51237387034363
  OFFN: 44.037257874573044
PREDICTION: [0 1 0]



<h4>Subtask 2</h4>

In [61]:
# Subtask 2 has a single label column. It is kept as a one-element list, so
# train_df[t2_headers] selects a one-column DataFrame rather than a Series.
t2_headers = ["PRFN"]

In [62]:
# Subtask 2 has one binary label, so a plain linear-kernel SVC (no
# one-vs-rest wrapper) sits behind the TF-IDF step. The variable keeps the
# name ovr_classifier because the following fit cell reads it.
binary_svc = svm.SVC(probability=True, kernel='linear')
ovr_classifier = make_pipeline(TfidfVectorizer(), binary_svc)

In [63]:
# Fit the binary subtask 2 model on the raw text column.
# train_df[t2_headers] is a one-column DataFrame; SVC expects a 1-D target and
# otherwise emits a DataConversionWarning, so flatten it explicitly.
t2_classifier = ovr_classifier.fit(
    X=train_df["text"],
    y=train_df[t2_headers].values.ravel(),
)
# Display the fitted pipeline.
t2_classifier

  y = column_or_1d(y, warn=True)


Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('svc', SVC(kernel='linear', probability=True))])

In [64]:
# Evaluate the subtask 2 model on the held-out test split.
# accuracy_score's signature is (y_true, y_pred), so the ground truth goes
# first; it is flattened to 1-D to match the shape of the predictions.
predictions_t2 = t2_classifier.predict(test_df["text"])
t2_score = accuracy_score(test_df[t2_headers].values.ravel(), predictions_t2)
print("SVM subtask 2 Accuracy Score -> ",t2_score)

SVM subtask 2 Accuracy Score ->  0.8647039364869336


<h5>Export Model</h5>

In [65]:
# Persist the fitted subtask 2 pipeline in the output directory.
filename = "t2_classifier.joblib.z"
path = out_directory + filename
# Dump t2_classifier so the file contains the model its name advertises
# (dumping t1_classifier here would save the wrong model under the t2 name).
dump(t2_classifier, path)

['../out/final-test-nostops/t2_classifier.joblib.z']

<h5>Tests</h5>

In [66]:
text1 = "Girl bitches should just kill themselves if they don't know how to cook"
text2 = "gay people are mentally ill"
text3 = "Islam people are all terrorists"
text4 = "Asians should just go back to their country, all they do is take our jobs"
text5 = "If you want to make the world a better place, look at yourself and make a change"
sample_texts = [text1, text2, text3, text4, text5]

# Spot-check the binary subtask 2 model on the hand-written samples.
# The pipeline's classes_ (0 / 1) double as the row labels.
# NOTE: SVC derives predict_proba from a separate calibration step, so the
# class with the highest probability can differ from what predict returns.
classes = t2_classifier.classes_
headers = classes

for text in sample_texts:
    single = [text]
    class_proba = t2_classifier.predict_proba(single)[0]
    predicted = t2_classifier.predict(single)[0]

    print(f'INPUT: {text}')
    for label, p in zip(headers, class_proba):
        print(f'  {label}: {p*100}')
    print(f'PREDICTION: {predicted}\n')

INPUT: Girl bitches should just kill themselves if they don't know how to cook
  0: 1.0072771405956031
  1: 98.9927228594044
PREDICTION: 1

INPUT: gay people are mentally ill
  0: 8.918834545458237
  1: 91.08116545454178
PREDICTION: 1

INPUT: Islam people are all terrorists
  0: 64.85715792425549
  1: 35.14284207574451
PREDICTION: 1

INPUT: Asians should just go back to their country, all they do is take our jobs
  0: 8.241157510842068
  1: 91.75884248915793
PREDICTION: 1

INPUT: If you want to make the world a better place, look at yourself and make a change
  0: 28.007993838223694
  1: 71.99200616177632
PREDICTION: 1



In [69]:
# Collect the four test accuracies in the order: subtask 1 and 2, subtask 3,
# subtask 1 only, subtask 2 only, and print them under a heading.
scores = [
    t1a2_score,
    t3_score,
    t1_score,
    t2_score,
]
print("no stop words accuracies:", scores, sep="\n")

no stop words accuracies:
[0.29374793251736686, 0.2911015547469401, 0.33245120740985773, 0.8647039364869336]
