In [1]:
import pandas as pd
import numpy as np
# Model
from sklearn import model_selection, svm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
# Pipeline
from sklearn.pipeline import make_pipeline
from joblib import dump

In [2]:
out_directory = "../out/final-test-wstops/"

<h2>Organize Data</h2>

In [3]:
# Read data
# df = pd.read_csv("../res/dataset-final.csv")

train_df = pd.read_csv("../res/train_final.csv")
val_df = pd.read_csv("../res/val_final.csv")
test_df = pd.read_csv("../res/test_final.csv")
train_df.head()

Unnamed: 0,text,HATE,NOT,OFFN,PRFN,Race,Religion,Gender,Other,None
0,a vote for max is a vote for the white christi...,0,1,0,0,0,0,0,0,1
1,what a muslim does in the privacy of his own g...,1,0,0,0,1,1,0,0,0
2,it is in some circles but in others it the sam...,1,0,0,1,1,0,0,0,0
3,president trumps opinion get the son of the bi...,0,1,1,1,1,0,0,1,1
4,id2020 is also jabbing refugees with its micro...,0,1,1,0,0,0,0,1,1


In [4]:
# # Organize targets
# # Get target columns
# headers = train_df.columns[1:]
# # Create separate df for targets
# y_train = train_df[headers]
# y_val = val_df[headers]
# y_test = test_df[headers]

In [5]:
# Organize targets
# Get target columns
t1a2_headers = train_df.columns[1:5]
print("subtask 1 and 2 headers:")
print(t1a2_headers)
t3_headers = train_df.columns[5:]
print("subtask 3 headers:")
print(t3_headers)

subtask 1 and 2 headers:
Index(['HATE', 'NOT', 'OFFN', 'PRFN'], dtype='object')
subtask 3 headers:
Index(['Race', 'Religion', 'Gender', 'Other', 'None'], dtype='object')


<h2>Model Creation</h2>

In [6]:
# Set number of samples to train with
train_df = train_df.iloc[:1000]

<h4>Subtask 1 and 2</h4>

In [7]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(stop_words="english"),
    OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))
)

In [8]:
# Train
t1a2_classifier = ovr_classifier.fit(X=train_df["text"], y=train_df[t1a2_headers])
t1a2_classifier

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(stop_words='english')),
                ('onevsrestclassifier',
                 OneVsRestClassifier(estimator=SVC(kernel='linear',
                                                   probability=True)))])

In [9]:
predictions_t1a2 = t1a2_classifier.predict(test_df["text"])
t1a2_score = accuracy_score(predictions_t1a2, test_df[t1a2_headers])
print("SVM subtask 1 and 2 Accuracy Score -> ",t1a2_score)

SVM subtask 1 and 2 Accuracy Score ->  0.3003638769434337


<h5>Export Model</h5>

In [10]:
filename = "t1a2_classifier.joblib.z"
path = out_directory + filename
dump(t1a2_classifier, path)

['../out/final-test-wstops/t1a2_classifier.joblib.z']

<h5>Tests</h5>

In [11]:
text1 = "Girl bitches should just kill themselves if they don't know how to cook"
text2 = "gay people are mentally ill"
text3 = "Islam people are all terrorists"
text4 = "Asians should just go back to their country, all they do is take our jobs"
text5 = "If you want to make the world a better place, look at yourself and make a change"
sample_texts = [text1, text2, text3, text4, text5]

for text in sample_texts:
    arr = [text]
    classes = t1a2_classifier.classes_
    proba = t1a2_classifier.predict_proba(arr)[0]
    pred = t1a2_classifier.predict(arr)[0]

    headers = t1a2_headers

    print(f'INPUT: {text}')
    for i,e in enumerate(classes):
        print(f'  {headers[i]}: {proba[i]*100}')
    print(f'PREDICTION: {pred}\n')

INPUT: Girl bitches should just kill themselves if they don't know how to cook
  HATE: 10.86074483566137
  NOT: 87.35148770421883
  OFFN: 52.82239528928648
  PRFN: 94.21242279250056
PREDICTION: [0 1 1 1]

INPUT: gay people are mentally ill
  HATE: 24.97239395527883
  NOT: 52.91257630083391
  OFFN: 45.32401203914989
  PRFN: 91.21566726266458
PREDICTION: [0 1 0 1]

INPUT: Islam people are all terrorists
  HATE: 59.73046374553872
  NOT: 18.898126190779376
  OFFN: 63.88034867187471
  PRFN: 47.96545796637555
PREDICTION: [1 0 1 1]

INPUT: Asians should just go back to their country, all they do is take our jobs
  HATE: 62.424824828365665
  NOT: 44.67360866920734
  OFFN: 45.11847220569966
  PRFN: 78.55074025256398
PREDICTION: [1 0 0 1]

INPUT: If you want to make the world a better place, look at yourself and make a change
  HATE: 24.691187907296396
  NOT: 52.10704158652702
  OFFN: 35.039711626037814
  PRFN: 62.2163621579629
PREDICTION: [0 1 0 1]



<h4>Subtask 3</h4>

In [12]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(stop_words="english"),
    OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))
)

In [13]:
t3_classifier = ovr_classifier.fit(X=train_df["text"], y=train_df[t3_headers])
t3_classifier

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(stop_words='english')),
                ('onevsrestclassifier',
                 OneVsRestClassifier(estimator=SVC(kernel='linear',
                                                   probability=True)))])

In [14]:
predictions_t3 = t3_classifier.predict(test_df["text"])
t3_score = accuracy_score(predictions_t3, test_df[t3_headers])
print("SVM subtask 1 and 2 Accuracy Score -> ",t3_score)

SVM subtask 1 and 2 Accuracy Score ->  0.3003638769434337


<h5>Export Model</h5>

In [15]:
filename = "t3_classifier.joblib.z"
path = out_directory + filename
dump(t1a2_classifier, path)

['../out/final-test-wstops/t3_classifier.joblib.z']

<h5>Tests</h5>

In [16]:
text1 = "Girl bitches should just kill themselves if they don't know how to cook"
text2 = "gay people are mentally ill"
text3 = "Islam people are all terrorists"
text4 = "Asians should just go back to their country, all they do is take our jobs"
text5 = "If you want to make the world a better place, look at yourself and make a change"
sample_texts = [text1, text2, text3, text4, text5]

for text in sample_texts:
    arr = [text]
    classes = t3_classifier.classes_
    proba = t3_classifier.predict_proba(arr)[0]
    pred = t3_classifier.predict(arr)[0]

    headers = t3_headers

    print(f'INPUT: {text}')
    for i,e in enumerate(classes):
        print(f'  {headers[i]}: {proba[i]*100}')
    print(f'PREDICTION: {pred}\n')

INPUT: Girl bitches should just kill themselves if they don't know how to cook
  Race: 7.626356287665141
  Religion: 0.1992496759596491
  Gender: 78.78968096645433
  Other: 9.407612113630941
  None: 76.92155945343985
PREDICTION: [0 0 1 0 1]

INPUT: gay people are mentally ill
  Race: 6.097205813079053
  Religion: 4.875218251681975
  Gender: 95.44910054822495
  Other: 17.948128045435478
  None: 53.77903385513196
PREDICTION: [0 0 1 0 1]

INPUT: Islam people are all terrorists
  Race: 22.7210320562585
  Religion: 99.99999432732113
  Gender: 2.2639616367441153
  Other: 22.240348745357355
  None: 46.147919312301426
PREDICTION: [0 1 0 0 0]

INPUT: Asians should just go back to their country, all they do is take our jobs
  Race: 82.44050938577479
  Religion: 17.518137485481542
  Gender: 16.437981596828987
  Other: 41.014029333384684
  None: 36.15419852077032
PREDICTION: [1 0 0 0 0]

INPUT: If you want to make the world a better place, look at yourself and make a change
  Race: 6.0596272328092

<h3>Subtask 1 only</h3>

In [17]:
t1_headers = t1a2_headers[:-1]
t1_headers

Index(['HATE', 'NOT', 'OFFN'], dtype='object')

In [18]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(stop_words="english"),
    OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))
)

In [19]:
t1_classifier = ovr_classifier.fit(X=train_df["text"], y=train_df[t1_headers])
t1_classifier

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(stop_words='english')),
                ('onevsrestclassifier',
                 OneVsRestClassifier(estimator=SVC(kernel='linear',
                                                   probability=True)))])

In [20]:
predictions_t1 = t1_classifier.predict(test_df["text"])
t1_score = accuracy_score(predictions_t1, test_df[t1_headers])
print("SVM subtask 1 Accuracy Score -> ",t1_score)

SVM subtask 1 Accuracy Score ->  0.3420443268276546


<h5>Export Model</h5>

In [21]:
filename = "t1_classifier.joblib.z"
path = out_directory + filename
dump(t1_classifier, path)

['../out/final-test-wstops/t1_classifier.joblib.z']

<h5>Tests</h5>

In [22]:
text1 = "Girl bitches should just kill themselves if they don't know how to cook"
text2 = "gay people are mentally ill"
text3 = "Islam people are all terrorists"
text4 = "Asians should just go back to their country, all they do is take our jobs"
text5 = "If you want to make the world a better place, look at yourself and make a change"
sample_texts = [text1, text2, text3, text4, text5]

for text in sample_texts:
    arr = [text]
    classes = t1_classifier.classes_
    proba = t1_classifier.predict_proba(arr)[0]
    pred = t1_classifier.predict(arr)[0]

    headers = t1_headers

    print(f'INPUT: {text}')
    for i,e in enumerate(classes):
        print(f'  {headers[i]}: {proba[i]*100}')
    print(f'PREDICTION: {pred}\n')

INPUT: Girl bitches should just kill themselves if they don't know how to cook
  HATE: 11.571920280784711
  NOT: 85.30323345669869
  OFFN: 52.815074209484116
PREDICTION: [0 1 1]

INPUT: gay people are mentally ill
  HATE: 25.29432591556929
  NOT: 52.38135583857137
  OFFN: 44.39965880562885
PREDICTION: [0 1 0]

INPUT: Islam people are all terrorists
  HATE: 58.15431493914891
  NOT: 20.65195807687798
  OFFN: 64.83474878686266
PREDICTION: [1 0 1]

INPUT: Asians should just go back to their country, all they do is take our jobs
  HATE: 60.694302140500824
  NOT: 44.83201398598294
  OFFN: 44.17004908962452
PREDICTION: [1 0 0]

INPUT: If you want to make the world a better place, look at yourself and make a change
  HATE: 25.02598119621956
  NOT: 51.642886952438005
  OFFN: 33.093184755471796
PREDICTION: [0 1 0]



<h4>Subtask 2</h4>

In [23]:
t2_headers = ["PRFN"]

In [24]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(stop_words="english"),
    svm.SVC(kernel='linear', probability=True)
)

In [25]:
t2_classifier = ovr_classifier.fit(X=train_df["text"], y=train_df[t2_headers])
t2_classifier

  y = column_or_1d(y, warn=True)


Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(stop_words='english')),
                ('svc', SVC(kernel='linear', probability=True))])

In [26]:
predictions_t2 = t2_classifier.predict(test_df["text"])
t2_score = accuracy_score(predictions_t2, test_df[t2_headers])
print("SVM subtask 2 Accuracy Score -> ",t2_score)

SVM subtask 2 Accuracy Score ->  0.85941118094608


<h5>Export Model</h5>

In [27]:
filename = "t2_classifier.joblib.z"
path = out_directory + filename
dump(t1_classifier, path)

['../out/final-test-wstops/t2_classifier.joblib.z']

<h5>Tests</h5>

In [28]:
text1 = "Girl bitches should just kill themselves if they don't know how to cook"
text2 = "gay people are mentally ill"
text3 = "Islam people are all terrorists"
text4 = "Asians should just go back to their country, all they do is take our jobs"
text5 = "If you want to make the world a better place, look at yourself and make a change"
sample_texts = [text1, text2, text3, text4, text5]

for text in sample_texts:
    arr = [text]
    classes = t2_classifier.classes_
    proba = t2_classifier.predict_proba(arr)[0]
    pred = t2_classifier.predict(arr)[0]

    headers = classes

    print(f'INPUT: {text}')
    for i,e in enumerate(classes):
        print(f'  {headers[i]}: {proba[i]*100}')
    print(f'PREDICTION: {pred}\n')

INPUT: Girl bitches should just kill themselves if they don't know how to cook
  0: 5.509623128088467
  1: 94.49037687191154
PREDICTION: 1

INPUT: gay people are mentally ill
  0: 8.397935547321769
  1: 91.6020644526782
PREDICTION: 1

INPUT: Islam people are all terrorists
  0: 51.210107874623
  1: 48.78989212537701
PREDICTION: 1

INPUT: Asians should just go back to their country, all they do is take our jobs
  0: 20.745305746964966
  1: 79.25469425303505
PREDICTION: 1

INPUT: If you want to make the world a better place, look at yourself and make a change
  0: 36.860428225209695
  1: 63.139571774790305
PREDICTION: 1



In [29]:
scores = [t1a2_score, t3_score, t1_score, t2_score]
print("with stop words accuracies:")
print(scores)

no stop words accuracies:
[0.3003638769434337, 0.3003638769434337, 0.3420443268276546, 0.85941118094608]
