In [24]:
import pandas as pd
# Model
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.multiclass import OneVsRestClassifier
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset
# Pipeline
from sklearn.pipeline import make_pipeline
from joblib import dump

In [25]:
out_directory = "../out/final/"

<h2>Organize Data</h2>

In [26]:
# Read data
# df = pd.read_csv("../res/dataset-final.csv")

train_df = pd.read_csv("../res/train_final.csv")
val_df = pd.read_csv("../res/val_final.csv")
test_df = pd.read_csv("../res/test_final.csv")
train_df.head()

Unnamed: 0,text,HATE,NOT,OFFN,PRFN,Race,Religion,Gender,Other,None
0,you just do not get it next time you are relyi...,0,0,1,1,0,0,0,1,0
1,it takes a certain kind of stupid to believe s...,0,1,0,1,0,0,1,0,0
2,the looney left liberal sjw cuck a sad delusio...,1,0,0,1,0,0,0,1,0
3,blacks are only of the yous population therefo...,0,0,1,0,1,0,0,0,0
4,its the islamic signal they are ready and will...,1,0,0,1,0,1,0,0,0


In [27]:
# # Organize targets
# # Get target columns
# headers = train_df.columns[1:]
# # Create separate df for targets
# y_train = train_df[headers]
# y_val = val_df[headers]
# y_test = test_df[headers]

In [28]:
# Organize targets
# Get target columns
t1a2_headers = train_df.columns[1:5]
print("subtask 1 and 2 headers:")
print(t1a2_headers)
t3_headers = train_df.columns[5:]
print("subtask 3 headers:")
print(t3_headers)

subtask 1 and 2 headers:
Index(['HATE', 'NOT', 'OFFN', 'PRFN'], dtype='object')
subtask 3 headers:
Index(['Race', 'Religion', 'Gender', 'Other', 'None'], dtype='object')


In [29]:
# Make subtask 1 treat its labels as mutually exclusive by converting one hot encoded
# HATE NOT OFFN into one column
def det_class(vals):
    if vals[0] == 1:
        return "HATE"
    elif vals[1] == 1:
        return "NOT"
    else:
        return "OFFN"
train_df["subtask1"] = train_df.apply(lambda e: det_class([e[1],e[2],e[3]]), axis = 1)
val_df["subtask1"] = val_df.apply(lambda e: det_class([e[1],e[2],e[3]]), axis = 1)
test_df["subtask1"] = test_df.apply(lambda e: det_class([e[1],e[2],e[3]]), axis = 1)


<h2>Model Creation</h2>

<h4> Subtask 1 - OVR wrapper </h4>

In [30]:
# Set number of samples to train with
# train_df = train_df.iloc[:500]

In [31]:
train_df.head()

Unnamed: 0,text,HATE,NOT,OFFN,PRFN,Race,Religion,Gender,Other,None,subtask1
0,you just do not get it next time you are relyi...,0,0,1,1,0,0,0,1,0,OFFN
1,it takes a certain kind of stupid to believe s...,0,1,0,1,0,0,1,0,0,NOT
2,the looney left liberal sjw cuck a sad delusio...,1,0,0,1,0,0,0,1,0,HATE
3,blacks are only of the yous population therefo...,0,0,1,0,1,0,0,0,0,OFFN
4,its the islamic signal they are ready and will...,1,0,0,1,0,1,0,0,0,HATE


In [32]:
t1_headers = t1a2_headers[:-1]
t1_headers

Index(['HATE', 'NOT', 'OFFN'], dtype='object')

In [33]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(),
    OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))
)

In [34]:
t1_classifier = ovr_classifier.fit(X=train_df["text"], y=train_df["subtask1"])
t1_classifier

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('onevsrestclassifier',
                 OneVsRestClassifier(estimator=SVC(kernel='linear',
                                                   probability=True)))])

In [35]:
predictions_t1 = t1_classifier.predict(test_df["text"])
t1_score = accuracy_score(predictions_t1, test_df["subtask1"])
print("SVM subtask 1 Accuracy Score -> ",t1_score)

SVM subtask 1 Accuracy Score ->  0.6568457538994801


<h5>Export Model</h5>

In [36]:
filename = "t1_classifier.joblib.z"
path = out_directory + filename
dump(t1_classifier, path)

['../out/final/t1_classifier.joblib.z']

<h5>Tests</h5>

In [37]:
text1 = "Girl bitches should just kill themselves if they don't know how to cook"
text2 = "gay people are mentally ill"
text3 = "Islam people are all terrorists"
text4 = "Asians should just go back to their country, all they do is take our jobs"
text5 = "If you want to make the world a better place, look at yourself and make a change"
sample_texts = [text1, text2, text3, text4, text5]

for text in sample_texts:
    arr = [text]
    classes = t1_classifier.classes_
    proba = t1_classifier.predict_proba(arr)[0]
    pred = t1_classifier.predict(arr)[0]

    headers = t1_headers

    print(f'INPUT: {text}')
    for i,e in enumerate(classes):
        print(f'  {headers[i]}: {proba[i]*100}')
    print(f'PREDICTION: {pred}\n')

INPUT: Girl bitches should just kill themselves if they don't know how to cook
  HATE: 9.659057556831858
  NOT: 27.75550812281356
  OFFN: 62.585434320354594
PREDICTION: OFFN

INPUT: gay people are mentally ill
  HATE: 23.230039581585302
  NOT: 64.1930241444039
  OFFN: 12.576936274010775
PREDICTION: NOT

INPUT: Islam people are all terrorists
  HATE: 42.562990436538975
  NOT: 23.037367294286398
  OFFN: 34.39964226917462
PREDICTION: HATE

INPUT: Asians should just go back to their country, all they do is take our jobs
  HATE: 40.457485013275026
  NOT: 39.42592795898445
  OFFN: 20.116587027740525
PREDICTION: HATE

INPUT: If you want to make the world a better place, look at yourself and make a change
  HATE: 9.954450796350864
  NOT: 59.35407761532565
  OFFN: 30.69147158832348
PREDICTION: NOT



In [38]:
sdglksgklsjag stop here 

SyntaxError: invalid syntax (3530142609.py, line 1)

<h4>Subtask 2</h4>

In [None]:
t2_headers = ["PRFN"]

In [None]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(),
    svm.SVC(kernel='linear', probability=True)
)

In [None]:
t2_classifier = ovr_classifier.fit(X=train_df["text"], y=train_df[t2_headers])
t2_classifier

  y = column_or_1d(y, warn=True)


Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('svc', SVC(kernel='linear', probability=True))])

In [None]:
predictions_t2 = t2_classifier.predict(test_df["text"])
t2_score = accuracy_score(predictions_t2, test_df[t2_headers])
print("SVM subtask 2 Accuracy Score -> ",t2_score)

SVM subtask 2 Accuracy Score ->  0.8918544194107453


<h5>Export Model</h5>

In [None]:
filename = "t2_classifier.joblib.z"
path = out_directory + filename
dump(t1_classifier, path)

['../out/final/t2_classifier.joblib.z']

<h5>Tests</h5>

In [None]:
text1 = "Girl bitches should just kill themselves if they don't know how to cook"
text2 = "gay people are mentally ill"
text3 = "Islam people are all terrorists"
text4 = "Asians should just go back to their country, all they do is take our jobs"
text5 = "If you want to make the world a better place, look at yourself and make a change"
sample_texts = [text1, text2, text3, text4, text5]

for text in sample_texts:
    arr = [text]
    classes = t2_classifier.classes_
    proba = t2_classifier.predict_proba(arr)[0]
    pred = t2_classifier.predict(arr)[0]

    headers = classes

    print(f'INPUT: {text}')
    for i,e in enumerate(classes):
        print(f'  {headers[i]}: {proba[i]*100}')
    print(f'PREDICTION: {pred}\n')

INPUT: Girl bitches should just kill themselves if they don't know how to cook
  0: 1.3704278319750957e-10
  1: 99.99999999986295
PREDICTION: 1

INPUT: gay people are mentally ill
  0: 8.713404416906598e-05
  1: 99.99991286595582
PREDICTION: 1

INPUT: Islam people are all terrorists
  0: 79.63866943409144
  1: 20.361330565908574
PREDICTION: 0

INPUT: Asians should just go back to their country, all they do is take our jobs
  0: 2.3670969395954167
  1: 97.63290306040456
PREDICTION: 1

INPUT: If you want to make the world a better place, look at yourself and make a change
  0: 89.72826370030427
  1: 10.27173629969573
PREDICTION: 0



<h4>Subtask 3 - One VS Rest</h4>

In [None]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(),
    OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))
)

In [None]:
t3_classifier = ovr_classifier.fit(X=train_df["text"], y=train_df[t3_headers])
t3_classifier

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('onevsrestclassifier',
                 OneVsRestClassifier(estimator=SVC(kernel='linear',
                                                   probability=True)))])

In [None]:
predictions_t3 = t3_classifier.predict(test_df["text"])
t3_score = accuracy_score(predictions_t3, test_df[t3_headers])
print("SVM subtask 1 and 2 Accuracy Score -> ",t3_score)

SVM subtask 1 and 2 Accuracy Score ->  0.549740034662045


<h5>Export Model</h5>

In [None]:
filename = "t3ovr_classifier.joblib.z"
path = out_directory + filename
dump(t3_classifier, path)

['../out/final/t3ovr_classifier.joblib.z']

<h5>Tests</h5>

In [None]:
text1 = "Girl bitches should just kill themselves if they don't know how to cook"
text2 = "gay people are mentally ill"
text3 = "Islam people are all terrorists"
text4 = "Asians should just go back to their country, all they do is take our jobs"
text5 = "If you want to make the world a better place, look at yourself and make a change"
sample_texts = [text1, text2, text3, text4, text5]

for text in sample_texts:
    arr = [text]
    classes = t3_classifier.classes_
    proba = t3_classifier.predict_proba(arr)[0]
    pred = t3_classifier.predict(arr)[0]

    headers = t3_headers

    print(f'INPUT: {text}')
    for i,e in enumerate(classes):
        print(f'  {headers[i]}: {proba[i]*100}')
    print(f'PREDICTION: {pred}\n')

INPUT: Girl bitches should just kill themselves if they don't know how to cook
  Race: 11.75379719431163
  Religion: 6.421644202373381
  Gender: 67.27895515428628
  Other: 1.6924467656948243
  None: 9.065448674904832
PREDICTION: [0 0 1 0 0]

INPUT: gay people are mentally ill
  Race: 1.6919530235211915
  Religion: 0.67731452007853
  Gender: 97.82626125983933
  Other: 4.435847976874744
  None: 12.679742290302098
PREDICTION: [0 0 1 0 0]

INPUT: Islam people are all terrorists
  Race: 1.7058264285510345
  Religion: 99.9999759092002
  Gender: 0.5097839942267998
  Other: 4.144152075635503
  None: 4.558307230786959
PREDICTION: [0 1 0 0 0]

INPUT: Asians should just go back to their country, all they do is take our jobs
  Race: 86.50984729448147
  Religion: 5.194768774555001
  Gender: 23.13596108545355
  Other: 14.664750291608827
  None: 4.849081292194469
PREDICTION: [1 0 0 0 0]

INPUT: If you want to make the world a better place, look at yourself and make a change
  Race: 7.572798455385815


<h4>Subtask 3 - Binary Classifier</h4>

In [None]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(),
    BinaryRelevance(svm.SVC(kernel='linear', probability=True))
)

In [None]:
t3bc_classifier = ovr_classifier.fit(X=train_df["text"], y=train_df[t3_headers])
t3bc_classifier

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('binaryrelevance',
                 BinaryRelevance(classifier=SVC(kernel='linear',
                                                probability=True),
                                 require_dense=[True, True]))])

In [None]:
predictions_t3bc = t3bc_classifier.predict(test_df["text"])
t3bc_score = accuracy_score(predictions_t3bc, test_df[t3_headers])
print("SVM subtask 3 BC Accuracy Score -> ",t3bc_score)

SVM subtask 3 BC Accuracy Score ->  0.549740034662045


<h5>Export Model</h5>

In [None]:
filename = "t3bc_classifier.joblib.z"
path = out_directory + filename
dump(t3bc_classifier, path)

['../out/final/t3bc_classifier.joblib.z']

<h5>Tests</h5>

In [None]:
text1 = "Girl bitches should just kill themselves if they don't know how to cook"
text2 = "gay people are mentally ill"
text3 = "Islam people are all terrorists"
text4 = "Asians should just go back to their country, all they do is take our jobs"
text5 = "If you want to make the world a better place, look at yourself and make a change"
sample_texts = [text1, text2, text3, text4, text5]

for text in sample_texts:
    arr = [text]
    proba = t3bc_classifier.predict_proba(arr).toarray()[0]
    pred = t3bc_classifier.predict(arr).toarray()[0]

    headers = t3_headers

    print(f'INPUT: {text}')
    for i,e in enumerate(headers):
        print(f'  {headers[i]}: {proba[i]*100}')
    print(f'PREDICTION: {pred}\n')

INPUT: Girl bitches should just kill themselves if they don't know how to cook
  Race: 11.676843658330045
  Religion: 6.4020767269985654
  Gender: 67.24992593486961
  Other: 1.6447626732104272
  None: 9.017671816443542
PREDICTION: [0 0 1 0 0]

INPUT: gay people are mentally ill
  Race: 1.7016277251102447
  Religion: 0.6717252294114877
  Gender: 97.79678450863238
  Other: 4.369544893128461
  None: 12.626230438482919
PREDICTION: [0 0 1 0 0]

INPUT: Islam people are all terrorists
  Race: 1.715490638313218
  Religion: 99.9999768678945
  Gender: 0.5215949327349715
  Other: 4.078265332744945
  None: 4.523533847755687
PREDICTION: [0 1 0 0 0]

INPUT: Asians should just go back to their country, all they do is take our jobs
  Race: 86.12927907625367
  Religion: 5.1753229724840395
  Gender: 23.250704645850096
  Other: 14.704214593581266
  None: 4.812937480265837
PREDICTION: [1 0 0 0 0]

INPUT: If you want to make the world a better place, look at yourself and make a change
  Race: 7.54247969006

<h4>Subtask 3 - Classifier Chain</h4>

In [None]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(),
    ClassifierChain(svm.SVC(kernel='linear', probability=True))
)

In [None]:
t3cc_classifier = ovr_classifier.fit(X=train_df["text"], y=train_df[t3_headers])
t3cc_classifier

MemoryError: Unable to allocate 2.26 GiB for an array with shape (13460, 22537) and data type float64

In [None]:
predictions_t3cc = t3cc_classifier.predict(test_df["text"])
t3cc_score = accuracy_score(predictions_t3cc, test_df[t3_headers])
print("SVM subtask 3 CC Accuracy Score -> ",t3cc_score)

SVM subtask 3 CC Accuracy Score ->  0.4856152512998267


<h5>Export Model</h5>

In [None]:
filename = "t3cc_classifier.joblib.z"
path = out_directory + filename
dump(t3cc_classifier, path)

['../out/final/t3cc_classifier.joblib.z']

<h5>Tests</h5>

In [None]:
text1 = "Girl bitches should just kill themselves if they don't know how to cook"
text2 = "gay people are mentally ill"
text3 = "Islam people are all terrorists"
text4 = "Asians should just go back to their country, all they do is take our jobs"
text5 = "If you want to make the world a better place, look at yourself and make a change"
sample_texts = [text1, text2, text3, text4, text5]

for text in sample_texts:
    arr = [text]
    proba = t3cc_classifier.predict_proba(arr).toarray()[0]
    pred = t3cc_classifier.predict(arr).toarray()[0]

    headers = t3_headers

    print(f'INPUT: {text}')
    for i,e in enumerate(headers):
        print(f'  {headers[i]}: {proba[i]*100}')
    print(f'PREDICTION: {pred}\n')

INPUT: Girl bitches should just kill themselves if they don't know how to cook
  Race: 7.983149489163929
  Religion: 3.7705484616404883
  Gender: 12.065269078684155
  Other: 4.832572138522008
  None: 84.91721629850211
PREDICTION: [0. 0. 0. 0. 1.]

INPUT: gay people are mentally ill
  Race: 1.7170055305583471
  Religion: 16.773442737502172
  Gender: 92.09872058735607
  Other: 35.40592708898736
  None: 0.7185430973270114
PREDICTION: [0. 0. 1. 0. 0.]

INPUT: Islam people are all terrorists
  Race: 5.6334441030530575
  Religion: 99.99973683060782
  Gender: 1.078511022639815
  Other: 16.93588355531013
  None: 0.8550955775487236
PREDICTION: [0. 1. 0. 0. 0.]

INPUT: Asians should just go back to their country, all they do is take our jobs
  Race: 62.53570350623865
  Religion: 5.505663900768042
  Gender: 5.337683597493268
  Other: 44.908840148292086
  None: 84.67591306869883
PREDICTION: [0. 0. 0. 0. 1.]

INPUT: If you want to make the world a better place, look at yourself and make a change
  

<h4>Subtask 3 - LabelPowerSet</h4>

In [None]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(),
    LabelPowerset(svm.SVC(kernel='linear', probability=True))
)

In [None]:
t3lp_classifier = ovr_classifier.fit(X=train_df["text"], y=train_df[t3_headers])
t3lp_classifier

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('labelpowerset',
                 LabelPowerset(classifier=SVC(kernel='linear',
                                              probability=True),
                               require_dense=[True, True]))])

In [None]:
predictions_t3lp = t3lp_classifier.predict(test_df["text"])
t3lp_score = accuracy_score(predictions_t3lp, test_df[t3_headers])
print("SVM subtask 3 LP Accuracy Score -> ",t3lp_score)

SVM subtask 3 LP Accuracy Score ->  0.48076256499133446


<h5>Export Model</h5>

In [None]:
filename = "t3lp_classifier.joblib.z"
path = out_directory + filename
dump(t3lp_classifier, path)

['../out/final/t3lp_classifier.joblib.z']

<h5>Tests</h5>

In [None]:
text1 = "Girl bitches should just kill themselves if they don't know how to cook"
text2 = "gay people are mentally ill"
text3 = "Islam people are all terrorists"
text4 = "Asians should just go back to their country, all they do is take our jobs"
text5 = "If you want to make the world a better place, look at yourself and make a change"
sample_texts = [text1, text2, text3, text4, text5]

for text in sample_texts:
    arr = [text]
    proba = t3lp_classifier.predict_proba(arr).toarray()[0]
    pred = t3lp_classifier.predict(arr).toarray()[0]

    headers = t3_headers

    print(f'INPUT: {text}')
    for i,e in enumerate(headers):
        print(f'  {headers[i]}: {proba[i]*100}')
    print(f'PREDICTION: {pred}\n')

INPUT: Girl bitches should just kill themselves if they don't know how to cook
  Race: 15.667599094410086
  Religion: 8.381325610812459
  Gender: 16.04588312364875
  Other: 8.028950180578489
  None: 55.104405428643645
PREDICTION: [0 0 0 0 1]

INPUT: gay people are mentally ill
  Race: 21.349589591899104
  Religion: 20.735660752070554
  Gender: 29.017497385442738
  Other: 29.71870946931432
  None: 20.52853419764931
PREDICTION: [0 0 1 0 0]

INPUT: Islam people are all terrorists
  Race: 35.28292294841878
  Religion: 46.99384492497711
  Gender: 21.155485745426724
  Other: 21.80992867126253
  None: 14.68579928631919
PREDICTION: [0 1 0 0 0]

INPUT: Asians should just go back to their country, all they do is take our jobs
  Race: 56.86566367623418
  Religion: 15.532841925984709
  Gender: 7.961149226703382
  Other: 20.824768468654757
  None: 10.36084120148507
PREDICTION: [1 0 0 0 0]

INPUT: If you want to make the world a better place, look at yourself and make a change
  Race: 10.07132370340

In [None]:
labels = ["subtask 1", "subtask 2", "t3 OneVsRest", "t3 Binary Relevance", "t3 Classifier Chain", "t3 Label Powerset"]
scores = [t1_score, t2_score, t3_score, t3bc_score, t3cc_score, t3lp_score]
print("accuracies:")
for i,e in enumerate(labels):
    print(f"{scores[i]} - {e}")

accuracies:
0.32270363951473136 - subtask 1
0.8564991334488735 - subtask 2
0.2873483535528596 - t3 OneVsRest
0.2873483535528596 - t3 Binary Relevance
0.4856152512998267 - t3 Classifier Chain
0.48076256499133446 - t3 Label Powerset
