In [1]:
import pandas as pd
import numpy as np
# Model
from sklearn import model_selection, svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.multiclass import OneVsRestClassifier
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset
# Pipeline
from sklearn.pipeline import make_pipeline
from joblib import dump

In [2]:
out_directory = "../out/final/"

<h2>Organize Data</h2>

In [3]:
# Read data
# df = pd.read_csv("../res/dataset-final.csv")

train_df = pd.read_csv("../res/train_final.csv")
val_df = pd.read_csv("../res/val_final.csv")
test_df = pd.read_csv("../res/test_final.csv")
train_df.head()

Unnamed: 0,text,HATE,NOT,OFFN,PRFN,Race,Religion,Gender,Other,None
0,you just do not get it next time you are relyi...,0,0,1,1,0,0,0,1,0
1,it takes a certain kind of stupid to believe s...,0,1,0,1,0,0,1,0,0
2,the looney left liberal sjw cuck a sad delusio...,1,0,0,1,0,0,0,1,0
3,blacks are only of the yous population therefo...,0,0,1,0,1,0,0,0,0
4,its the islamic signal they are ready and will...,1,0,0,1,0,1,0,0,0


In [4]:
# # Organize targets
# # Get target columns
# headers = train_df.columns[1:]
# # Create separate df for targets
# y_train = train_df[headers]
# y_val = val_df[headers]
# y_test = test_df[headers]

In [5]:
# Organize targets
# Get target columns
t1a2_headers = train_df.columns[1:5]
print("subtask 1 and 2 headers:")
print(t1a2_headers)
t3_headers = train_df.columns[5:]
print("subtask 3 headers:")
print(t3_headers)

subtask 1 and 2 headers:
Index(['HATE', 'NOT', 'OFFN', 'PRFN'], dtype='object')
subtask 3 headers:
Index(['Race', 'Religion', 'Gender', 'Other', 'None'], dtype='object')


<h2>Model Creation</h2>

In [6]:
# Set number of samples to train with
train_df = train_df.iloc[:500]

In [7]:
t1_headers = t1a2_headers[:-1]
t1_headers

Index(['HATE', 'NOT', 'OFFN'], dtype='object')

In [8]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(),
    OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))
)

In [9]:
t1_classifier = ovr_classifier.fit(X=train_df["text"], y=train_df[t1_headers])
t1_classifier

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('onevsrestclassifier',
                 OneVsRestClassifier(estimator=SVC(kernel='linear',
                                                   probability=True)))])

In [10]:
predictions_t1 = t1_classifier.predict(test_df["text"])
t1_score = accuracy_score(predictions_t1, test_df[t1_headers])
print("SVM subtask 1 Accuracy Score -> ",t1_score)

SVM subtask 1 Accuracy Score ->  0.32270363951473136


<h5>Export Model</h5>

In [11]:
filename = "t1_classifier.joblib.z"
path = out_directory + filename
dump(t1_classifier, path)

['../out/final/t1_classifier.joblib.z']

<h5>Tests</h5>

In [12]:
text1 = "Girl bitches should just kill themselves if they don't know how to cook"
text2 = "gay people are mentally ill"
text3 = "Islam people are all terrorists"
text4 = "Asians should just go back to their country, all they do is take our jobs"
text5 = "If you want to make the world a better place, look at yourself and make a change"
sample_texts = [text1, text2, text3, text4, text5]

for text in sample_texts:
    arr = [text]
    classes = t1_classifier.classes_
    proba = t1_classifier.predict_proba(arr)[0]
    pred = t1_classifier.predict(arr)[0]

    headers = t1_headers

    print(f'INPUT: {text}')
    for i,e in enumerate(classes):
        print(f'  {headers[i]}: {proba[i]*100}')
    print(f'PREDICTION: {pred}\n')

INPUT: Girl bitches should just kill themselves if they don't know how to cook
  HATE: 3.590859756293651
  NOT: 90.49809912619027
  OFFN: 13.255162967541706
PREDICTION: [0 1 0]

INPUT: gay people are mentally ill
  HATE: 8.168284308970462
  NOT: 53.25412937243309
  OFFN: 24.400044653470694
PREDICTION: [0 1 0]

INPUT: Islam people are all terrorists
  HATE: 20.633088976402604
  NOT: 41.04819423146471
  OFFN: 22.282074779203523
PREDICTION: [0 0 0]

INPUT: Asians should just go back to their country, all they do is take our jobs
  HATE: 24.34378786387297
  NOT: 40.250552153494155
  OFFN: 23.605238347345292
PREDICTION: [0 0 0]

INPUT: If you want to make the world a better place, look at yourself and make a change
  HATE: 6.6608408942812325
  NOT: 29.45033155444252
  OFFN: 39.327734442723774
PREDICTION: [0 0 0]



<h4>Subtask 2</h4>

In [13]:
t2_headers = ["PRFN"]

In [14]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(),
    svm.SVC(kernel='linear', probability=True)
)

In [15]:
t2_classifier = ovr_classifier.fit(X=train_df["text"], y=train_df[t2_headers])
t2_classifier

  y = column_or_1d(y, warn=True)


Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('svc', SVC(kernel='linear', probability=True))])

In [16]:
predictions_t2 = t2_classifier.predict(test_df["text"])
t2_score = accuracy_score(predictions_t2, test_df[t2_headers])
print("SVM subtask 2 Accuracy Score -> ",t2_score)

SVM subtask 2 Accuracy Score ->  0.8564991334488735


<h5>Export Model</h5>

In [17]:
filename = "t2_classifier.joblib.z"
path = out_directory + filename
dump(t1_classifier, path)

['../out/final/t2_classifier.joblib.z']

<h5>Tests</h5>

In [18]:
text1 = "Girl bitches should just kill themselves if they don't know how to cook"
text2 = "gay people are mentally ill"
text3 = "Islam people are all terrorists"
text4 = "Asians should just go back to their country, all they do is take our jobs"
text5 = "If you want to make the world a better place, look at yourself and make a change"
sample_texts = [text1, text2, text3, text4, text5]

for text in sample_texts:
    arr = [text]
    classes = t2_classifier.classes_
    proba = t2_classifier.predict_proba(arr)[0]
    pred = t2_classifier.predict(arr)[0]

    headers = classes

    print(f'INPUT: {text}')
    for i,e in enumerate(classes):
        print(f'  {headers[i]}: {proba[i]*100}')
    print(f'PREDICTION: {pred}\n')

INPUT: Girl bitches should just kill themselves if they don't know how to cook
  0: 7.219061321948677
  1: 92.78093867805131
PREDICTION: 1

INPUT: gay people are mentally ill
  0: 4.499056288170764
  1: 95.50094371182924
PREDICTION: 1

INPUT: Islam people are all terrorists
  0: 15.61336835048482
  1: 84.38663164951518
PREDICTION: 1

INPUT: Asians should just go back to their country, all they do is take our jobs
  0: 8.622309285300322
  1: 91.37769071469968
PREDICTION: 1

INPUT: If you want to make the world a better place, look at yourself and make a change
  0: 11.62173690178285
  1: 88.37826309821716
PREDICTION: 1



<h4>Subtask 3 - One VS Rest</h4>

In [19]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(),
    OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))
)

In [20]:
t3_classifier = ovr_classifier.fit(X=train_df["text"], y=train_df[t3_headers])
t3_classifier

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('onevsrestclassifier',
                 OneVsRestClassifier(estimator=SVC(kernel='linear',
                                                   probability=True)))])

In [21]:
predictions_t3 = t3_classifier.predict(test_df["text"])
t3_score = accuracy_score(predictions_t3, test_df[t3_headers])
print("SVM subtask 1 and 2 Accuracy Score -> ",t3_score)

SVM subtask 1 and 2 Accuracy Score ->  0.2873483535528596


<h5>Export Model</h5>

In [23]:
filename = "t3ovr_classifier.joblib.z"
path = out_directory + filename
dump(t3_classifier, path)

['../out/final/t3ovr_classifier.joblib.z']

<h5>Tests</h5>

In [24]:
text1 = "Girl bitches should just kill themselves if they don't know how to cook"
text2 = "gay people are mentally ill"
text3 = "Islam people are all terrorists"
text4 = "Asians should just go back to their country, all they do is take our jobs"
text5 = "If you want to make the world a better place, look at yourself and make a change"
sample_texts = [text1, text2, text3, text4, text5]

for text in sample_texts:
    arr = [text]
    classes = t3_classifier.classes_
    proba = t3_classifier.predict_proba(arr)[0]
    pred = t3_classifier.predict(arr)[0]

    headers = t3_headers

    print(f'INPUT: {text}')
    for i,e in enumerate(classes):
        print(f'  {headers[i]}: {proba[i]*100}')
    print(f'PREDICTION: {pred}\n')

INPUT: Girl bitches should just kill themselves if they don't know how to cook
  Race: 8.184553325555685
  Religion: 4.244726210357346
  Gender: 10.43746944367469
  Other: 2.953044220030992
  None: 45.23362438086585
PREDICTION: [0 0 0 0 0]

INPUT: gay people are mentally ill
  Race: 1.746839530202693
  Religion: 18.495319016642327
  Gender: 93.19601578011503
  Other: 58.01805266926089
  None: 10.416714708310964
PREDICTION: [0 0 1 0 0]

INPUT: Islam people are all terrorists
  Race: 5.769682722671366
  Religion: 99.69973181812503
  Gender: 1.6612406885772282
  Other: 21.367213753745034
  None: 12.381419448775748
PREDICTION: [0 1 0 0 0]

INPUT: Asians should just go back to their country, all they do is take our jobs
  Race: 63.66550035883407
  Religion: 4.252159473832259
  Gender: 3.4707540788292954
  Other: 27.478628801877676
  None: 14.86735449772831
PREDICTION: [0 0 0 0 0]

INPUT: If you want to make the world a better place, look at yourself and make a change
  Race: 12.065440523895

<h4>Subtask 3 - Binary Classifier</h4>

In [25]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(),
    BinaryRelevance(svm.SVC(kernel='linear', probability=True))
)

In [26]:
t3bc_classifier = ovr_classifier.fit(X=train_df["text"], y=train_df[t3_headers])
t3bc_classifier

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('binaryrelevance',
                 BinaryRelevance(classifier=SVC(kernel='linear',
                                                probability=True),
                                 require_dense=[True, True]))])

In [27]:
predictions_t3bc = t3bc_classifier.predict(test_df["text"])
t3bc_score = accuracy_score(predictions_t3bc, test_df[t3_headers])
print("SVM subtask 3 BC Accuracy Score -> ",t3bc_score)

SVM subtask 3 BC Accuracy Score ->  0.2873483535528596


<h5>Export Model</h5>

In [28]:
filename = "t3bc_classifier.joblib.z"
path = out_directory + filename
dump(t3bc_classifier, path)

['../out/final/t3bc_classifier.joblib.z']

<h5>Tests</h5>

In [40]:
text1 = "Girl bitches should just kill themselves if they don't know how to cook"
text2 = "gay people are mentally ill"
text3 = "Islam people are all terrorists"
text4 = "Asians should just go back to their country, all they do is take our jobs"
text5 = "If you want to make the world a better place, look at yourself and make a change"
sample_texts = [text1, text2, text3, text4, text5]

for text in sample_texts:
    arr = [text]
    proba = t3bc_classifier.predict_proba(arr).toarray()[0]
    pred = t3bc_classifier.predict(arr).toarray()[0]

    headers = t3_headers

    print(f'INPUT: {text}')
    for i,e in enumerate(headers):
        print(f'  {headers[i]}: {proba[i]*100}')
    print(f'PREDICTION: {pred}\n')

INPUT: Girl bitches should just kill themselves if they don't know how to cook
  Race: 7.337034849035698
  Religion: 3.9783132736360756
  Gender: 9.539202874820207
  Other: 2.5793788624673697
  None: 45.2168430534657
PREDICTION: [0 0 0 0 0]

INPUT: gay people are mentally ill
  Race: 1.3803364834147258
  Religion: 19.58361454017402
  Gender: 95.34882224790621
  Other: 63.40307942609512
  None: 10.403723079249952
PREDICTION: [0 0 1 0 0]

INPUT: Islam people are all terrorists
  Race: 4.990164673005735
  Religion: 99.99928165065906
  Gender: 1.2243582655124137
  Other: 22.514688572274263
  None: 12.367084724422291
PREDICTION: [0 1 0 0 0]

INPUT: Asians should just go back to their country, all they do is take our jobs
  Race: 65.56920524299154
  Religion: 3.9858642925666157
  Gender: 2.770318388953735
  Other: 29.463906275231427
  None: 14.851604111096734
PREDICTION: [0 0 0 0 0]

INPUT: If you want to make the world a better place, look at yourself and make a change
  Race: 11.1687141801

<h4>Subtask 3 - Classifier Chain</h4>

In [41]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(),
    ClassifierChain(svm.SVC(kernel='linear', probability=True))
)

In [42]:
t3cc_classifier = ovr_classifier.fit(X=train_df["text"], y=train_df[t3_headers])
t3cc_classifier

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('classifierchain',
                 ClassifierChain(classifier=SVC(kernel='linear',
                                                probability=True),
                                 require_dense=[True, True]))])

In [43]:
predictions_t3cc = t3cc_classifier.predict(test_df["text"])
t3cc_score = accuracy_score(predictions_t3cc, test_df[t3_headers])
print("SVM subtask 3 CC Accuracy Score -> ",t3cc_score)

SVM subtask 3 CC Accuracy Score ->  0.4856152512998267


<h5>Export Model</h5>

In [44]:
filename = "t3cc_classifier.joblib.z"
path = out_directory + filename
dump(t3cc_classifier, path)

['../out/final/t3cc_classifier.joblib.z']

<h5>Tests</h5>

In [45]:
text1 = "Girl bitches should just kill themselves if they don't know how to cook"
text2 = "gay people are mentally ill"
text3 = "Islam people are all terrorists"
text4 = "Asians should just go back to their country, all they do is take our jobs"
text5 = "If you want to make the world a better place, look at yourself and make a change"
sample_texts = [text1, text2, text3, text4, text5]

for text in sample_texts:
    arr = [text]
    proba = t3cc_classifier.predict_proba(arr).toarray()[0]
    pred = t3cc_classifier.predict(arr).toarray()[0]

    headers = t3_headers

    print(f'INPUT: {text}')
    for i,e in enumerate(headers):
        print(f'  {headers[i]}: {proba[i]*100}')
    print(f'PREDICTION: {pred}\n')

INPUT: Girl bitches should just kill themselves if they don't know how to cook
  Race: 7.983149489163929
  Religion: 3.7705484616404883
  Gender: 12.065269078684155
  Other: 4.832572138522008
  None: 84.91721629850211
PREDICTION: [0. 0. 0. 0. 1.]

INPUT: gay people are mentally ill
  Race: 1.7170055305583471
  Religion: 16.773442737502172
  Gender: 92.09872058735607
  Other: 35.40592708898736
  None: 0.7185430973270114
PREDICTION: [0. 0. 1. 0. 0.]

INPUT: Islam people are all terrorists
  Race: 5.6334441030530575
  Religion: 99.99973683060782
  Gender: 1.078511022639815
  Other: 16.93588355531013
  None: 0.8550955775487236
PREDICTION: [0. 1. 0. 0. 0.]

INPUT: Asians should just go back to their country, all they do is take our jobs
  Race: 62.53570350623865
  Religion: 5.505663900768042
  Gender: 5.337683597493268
  Other: 44.908840148292086
  None: 84.67591306869883
PREDICTION: [0. 0. 0. 0. 1.]

INPUT: If you want to make the world a better place, look at yourself and make a change
  

<h4>Subtask 3 - LabelPowerSet</h4>

In [46]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(),
    LabelPowerset(svm.SVC(kernel='linear', probability=True))
)

In [47]:
t3lp_classifier = ovr_classifier.fit(X=train_df["text"], y=train_df[t3_headers])
t3lp_classifier

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('labelpowerset',
                 LabelPowerset(classifier=SVC(kernel='linear',
                                              probability=True),
                               require_dense=[True, True]))])

In [48]:
predictions_t3lp = t3lp_classifier.predict(test_df["text"])
t3lp_score = accuracy_score(predictions_t3lp, test_df[t3_headers])
print("SVM subtask 3 LP Accuracy Score -> ",t3lp_score)

SVM subtask 3 LP Accuracy Score ->  0.48076256499133446


<h5>Export Model</h5>

In [49]:
filename = "t3lp_classifier.joblib.z"
path = out_directory + filename
dump(t3lp_classifier, path)

['../out/final/t3lp_classifier.joblib.z']

<h5>Tests</h5>

In [50]:
text1 = "Girl bitches should just kill themselves if they don't know how to cook"
text2 = "gay people are mentally ill"
text3 = "Islam people are all terrorists"
text4 = "Asians should just go back to their country, all they do is take our jobs"
text5 = "If you want to make the world a better place, look at yourself and make a change"
sample_texts = [text1, text2, text3, text4, text5]

for text in sample_texts:
    arr = [text]
    proba = t3lp_classifier.predict_proba(arr).toarray()[0]
    pred = t3lp_classifier.predict(arr).toarray()[0]

    headers = t3_headers

    print(f'INPUT: {text}')
    for i,e in enumerate(headers):
        print(f'  {headers[i]}: {proba[i]*100}')
    print(f'PREDICTION: {pred}\n')

INPUT: Girl bitches should just kill themselves if they don't know how to cook
  Race: 15.667599094410086
  Religion: 8.381325610812459
  Gender: 16.04588312364875
  Other: 8.028950180578489
  None: 55.104405428643645
PREDICTION: [0 0 0 0 1]

INPUT: gay people are mentally ill
  Race: 21.349589591899104
  Religion: 20.735660752070554
  Gender: 29.017497385442738
  Other: 29.71870946931432
  None: 20.52853419764931
PREDICTION: [0 0 1 0 0]

INPUT: Islam people are all terrorists
  Race: 35.28292294841878
  Religion: 46.99384492497711
  Gender: 21.155485745426724
  Other: 21.80992867126253
  None: 14.68579928631919
PREDICTION: [0 1 0 0 0]

INPUT: Asians should just go back to their country, all they do is take our jobs
  Race: 56.86566367623418
  Religion: 15.532841925984709
  Gender: 7.961149226703382
  Other: 20.824768468654757
  None: 10.36084120148507
PREDICTION: [1 0 0 0 0]

INPUT: If you want to make the world a better place, look at yourself and make a change
  Race: 10.07132370340

In [54]:
labels = ["subtask 1", "subtask 2", "t3 OneVsRest", "t3 Binary Relevance", "t3 Classifier Chain", "t3 Label Powerset"]
scores = [t1_score, t2_score, t3_score, t3bc_score, t3cc_score, t3lp_score]
print("accuracies:")
for i,e in enumerate(labels):
    print(f"{scores[i]} - {e}")

accuracies:
0.32270363951473136 - subtask 1
0.8564991334488735 - subtask 2
0.2873483535528596 - t3 OneVsRest
0.2873483535528596 - t3 Binary Relevance
0.4856152512998267 - t3 Classifier Chain
0.48076256499133446 - t3 Label Powerset
