In [142]:
import pandas as pd
# Model
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.multiclass import OneVsRestClassifier
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset
# Pipeline
from sklearn.pipeline import make_pipeline
from joblib import dump

In [143]:
out_directory = "../out/true-final-3500/"

<h2>Organize Data</h2>

In [144]:
# Read data
# df = pd.read_csv("../res/dataset-final.csv")

train_df = pd.read_csv("../res/train_final.csv")
val_df = pd.read_csv("../res/val_final.csv")
test_df = pd.read_csv("../res/test_final.csv")
train_df.head()

total = 3500
train_df = train_df.sample(n=int(total*.8))
val_df = val_df.sample(n=int(total*.1))
test_df = test_df.sample(n=int(total*.1))

In [184]:
train_df.to_csv('../res/train_final_3500.csv', index=False)
test_df.to_csv('../res/test_final_3500.csv', index=False)
val_df.to_csv('../res/val_final_3500.csv', index=False)

In [145]:
# Organize targets
# Get target columns
t1a2_headers = train_df.columns[1:5]
t2_headers = ["PRFN"]
t1_headers = t1a2_headers[:-1]
t3_headers = train_df.columns[5:]
print("subtask 1 headers:")
print(t1_headers)
print("subtask 2 headers:")
print(t2_headers)
print("subtask 3 headers:")
print(t3_headers)

subtask 1 headers:
Index(['HATE', 'NOT', 'OFFN'], dtype='object')
subtask 2 headers:
['PRFN']
subtask 3 headers:
Index(['Race', 'Religion', 'Gender', 'Other', 'None'], dtype='object')


In [146]:
# Make subtask 1 treat its labels as mutually exclusive by converting one hot encoded
# HATE NOT OFFN into one column
def det_class(vals):
    if vals[0] == 1:
        return "HATE"
    elif vals[1] == 1:
        return "NOT"
    else:
        return "OFFN"
train_df["subtask1"] = train_df.apply(lambda e: det_class([e[1],e[2],e[3]]), axis = 1)
val_df["subtask1"] = val_df.apply(lambda e: det_class([e[1],e[2],e[3]]), axis = 1)
test_df["subtask1"] = test_df.apply(lambda e: det_class([e[1],e[2],e[3]]), axis = 1)


<h2>Model Creation</h2>

<h4> Subtask 1 - OVR wrapper w/ ngrams</h4>

In [147]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(stop_words="english", ngram_range=(1,3)),
    OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))
)

In [148]:
t1_classifier_ngram = ovr_classifier.fit(X=train_df["text"], y=train_df["subtask1"])
t1_classifier_ngram

Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(ngram_range=(1, 3), stop_words='english')),
                ('onevsrestclassifier',
                 OneVsRestClassifier(estimator=SVC(kernel='linear',
                                                   probability=True)))])

In [149]:
predictions_t1 = t1_classifier_ngram.predict(test_df["text"])
t1_score = accuracy_score(predictions_t1, test_df["subtask1"])
print("SVM subtask 1 Accuracy Score -> ",t1_score)

SVM subtask 1 Accuracy Score ->  0.5657142857142857


<h5>Export Model</h5>

In [150]:
filename = "t1ngram.joblib.z"
path = out_directory + filename
dump(t1_classifier_ngram, path)

['../out/true-final-3500/t1ngram.joblib.z']

<h4> Subtask 1 - OVR wrapper </h4>

In [151]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(stop_words="english"),
    OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))
)

In [152]:
t1_classifier = ovr_classifier.fit(X=train_df["text"], y=train_df["subtask1"])
t1_classifier

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(stop_words='english')),
                ('onevsrestclassifier',
                 OneVsRestClassifier(estimator=SVC(kernel='linear',
                                                   probability=True)))])

In [153]:
predictions_t1 = t1_classifier.predict(test_df["text"])
t1_score = accuracy_score(predictions_t1, test_df["subtask1"])
print("SVM subtask 1 Accuracy Score -> ",t1_score)

SVM subtask 1 Accuracy Score ->  0.5685714285714286


<h5>Export Model</h5>

In [154]:
filename = "t1.joblib.z"
path = out_directory + filename
dump(t1_classifier, path)

['../out/true-final-3500/t1.joblib.z']

<h4>Subtask 2 w/ ngram</h4>

In [155]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(stop_words="english", ngram_range=(1,3)),
    svm.SVC(kernel='linear', probability=True)
)

In [156]:
t2_classifier_ngram = ovr_classifier.fit(X=train_df["text"], y=train_df[t2_headers])
t2_classifier_ngram

  y = column_or_1d(y, warn=True)


Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(ngram_range=(1, 3), stop_words='english')),
                ('svc', SVC(kernel='linear', probability=True))])

In [157]:
predictions_t2 = t2_classifier_ngram.predict(test_df["text"])
t2_score = accuracy_score(predictions_t2, test_df[t2_headers])
print("SVM subtask 2 Accuracy Score -> ",t2_score)

SVM subtask 2 Accuracy Score ->  0.8657142857142858


<h5>Export Model</h5>

In [158]:
filename = "t2ngram.joblib.z"
path = out_directory + filename
dump(t2_classifier_ngram, path)

['../out/true-final-3500/t2ngram.joblib.z']

<h4>Subtask 2</h4>

In [159]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(stop_words="english"),
    svm.SVC(kernel='linear', probability=True)
)

In [160]:
t2_classifier = ovr_classifier.fit(X=train_df["text"], y=train_df[t2_headers])
t2_classifier

  y = column_or_1d(y, warn=True)


Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(stop_words='english')),
                ('svc', SVC(kernel='linear', probability=True))])

In [161]:
predictions_t2 = t2_classifier.predict(test_df["text"])
t2_score = accuracy_score(predictions_t2, test_df[t2_headers])
print("SVM subtask 2 Accuracy Score -> ",t2_score)

SVM subtask 2 Accuracy Score ->  0.86


<h5>Export Model</h5>

In [162]:
filename = "t2.joblib.z"
path = out_directory + filename
dump(t2_classifier, path)

['../out/true-final-3500/t2.joblib.z']

<h4>Subtask 3 - One VS Rest w ngram</h4>

In [163]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(stop_words="english", ngram_range=(1,3)),
    OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))
)

In [164]:
t3_classifier_ngram = ovr_classifier.fit(X=train_df["text"], y=train_df[t3_headers])
t3_classifier_ngram

Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(ngram_range=(1, 3), stop_words='english')),
                ('onevsrestclassifier',
                 OneVsRestClassifier(estimator=SVC(kernel='linear',
                                                   probability=True)))])

In [165]:
predictions_t3 = t3_classifier_ngram.predict(test_df["text"])
t3_score = accuracy_score(predictions_t3, test_df[t3_headers])
print("SVM subtask 1 and 2 Accuracy Score -> ",t3_score)

SVM subtask 1 and 2 Accuracy Score ->  0.4742857142857143


<h5>Export Model</h5>

In [166]:
filename = "t3ovr_ngram.joblib.z"
path = out_directory + filename
dump(t3_classifier_ngram, path)

['../out/true-final-3500/t3ovr_ngram.joblib.z']

<h4>Subtask 3 - One VS Rest</h4>

In [167]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(stop_words="english"),
    OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))
)

In [168]:
t3_classifier = ovr_classifier.fit(X=train_df["text"], y=train_df[t3_headers])
t3_classifier

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(stop_words='english')),
                ('onevsrestclassifier',
                 OneVsRestClassifier(estimator=SVC(kernel='linear',
                                                   probability=True)))])

In [169]:
predictions_t3 = t3_classifier.predict(test_df["text"])
t3_score = accuracy_score(predictions_t3, test_df[t3_headers])
print("SVM subtask 1 and 2 Accuracy Score -> ",t3_score)

SVM subtask 1 and 2 Accuracy Score ->  0.4742857142857143


<h5>Export Model</h5>

In [170]:
filename = "t3ovr.joblib.z"
path = out_directory + filename
dump(t3_classifier, path)

['../out/true-final-3500/t3ovr.joblib.z']

<h4>Subtask 3 - Binary Classifier</h4>

In [171]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(stop_words="english"),
    BinaryRelevance(svm.SVC(kernel='linear', probability=True))
)

In [172]:
t3bc_classifier = ovr_classifier.fit(X=train_df["text"], y=train_df[t3_headers])
t3bc_classifier

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(stop_words='english')),
                ('binaryrelevance',
                 BinaryRelevance(classifier=SVC(kernel='linear',
                                                probability=True),
                                 require_dense=[True, True]))])

In [173]:
predictions_t3bc = t3bc_classifier.predict(test_df["text"])
t3bc_score = accuracy_score(predictions_t3bc, test_df[t3_headers])
print("SVM subtask 3 BC Accuracy Score -> ",t3bc_score)

SVM subtask 3 BC Accuracy Score ->  0.4742857142857143


<h5>Export Model</h5>

In [174]:
filename = "t3bc.joblib.z"
path = out_directory + filename
dump(t3bc_classifier, path)

['../out/true-final-3500/t3bc.joblib.z']

<h4>Subtask 3 - Classifier Chain</h4>

In [175]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(stop_words="english"),
    ClassifierChain(svm.SVC(kernel='linear', probability=True))
)

In [176]:
t3cc_classifier = ovr_classifier.fit(X=train_df["text"], y=train_df[t3_headers])
t3cc_classifier

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(stop_words='english')),
                ('classifierchain',
                 ClassifierChain(classifier=SVC(kernel='linear',
                                                probability=True),
                                 require_dense=[True, True]))])

In [177]:
predictions_t3cc = t3cc_classifier.predict(test_df["text"])
t3cc_score = accuracy_score(predictions_t3cc, test_df[t3_headers])
print("SVM subtask 3 CC Accuracy Score -> ",t3cc_score)

SVM subtask 3 CC Accuracy Score ->  0.5857142857142857


<h5>Export Model</h5>

In [178]:
filename = "t3cc.joblib.z"
path = out_directory + filename
dump(t3cc_classifier, path)

['../out/true-final-3500/t3cc.joblib.z']

<h4>Subtask 3 - LabelPowerSet</h4>

In [179]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(),
    LabelPowerset(svm.SVC(kernel='linear', probability=True))
)

In [180]:
t3lp_classifier = ovr_classifier.fit(X=train_df["text"], y=train_df[t3_headers])
t3lp_classifier

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('labelpowerset',
                 LabelPowerset(classifier=SVC(kernel='linear',
                                              probability=True),
                               require_dense=[True, True]))])

In [181]:
predictions_t3lp = t3lp_classifier.predict(test_df["text"])
t3lp_score = accuracy_score(predictions_t3lp, test_df[t3_headers])
print("SVM subtask 3 LP Accuracy Score -> ",t3lp_score)

SVM subtask 3 LP Accuracy Score ->  0.5771428571428572


<h5>Export Model</h5>

In [182]:
filename = "t3lp.joblib.z"
path = out_directory + filename
dump(t3lp_classifier, path)

['../out/true-final-3500/t3lp.joblib.z']

In [183]:
labels = ["subtask 1", "subtask 2", "t3 OneVsRest", "t3 Binary Relevance", "t3 Classifier Chain", "t3 Label Powerset"]
scores = [t1_score, t2_score, t3_score, t3bc_score, t3cc_score, t3lp_score]
print("accuracies:")
for i,e in enumerate(labels):
    print(f"{scores[i]} - {e}")

accuracies:
0.5685714285714286 - subtask 1
0.86 - subtask 2
0.4742857142857143 - t3 OneVsRest
0.4742857142857143 - t3 Binary Relevance
0.5857142857142857 - t3 Classifier Chain
0.5771428571428572 - t3 Label Powerset
