In [None]:
import os

import pandas as pd
# Model
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.multiclass import OneVsRestClassifier
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset
# Pipeline
from sklearn.pipeline import make_pipeline
from joblib import dump

In [None]:
# Folder that receives every exported model. It is created up front so the
# dump(...) cells further down cannot fail on a missing directory.
out_directory = "../out/final-final/"
os.makedirs(out_directory, exist_ok=True)

<h2>Organize Data</h2>

In [1]:
# Read data
# df = pd.read_csv("../res/dataset-final.csv")

# Fixed seed so the subsample, and with it every score and exported model
# below, is identical after a kernel restart.
SAMPLE_SEED = 42

# Each split is read and immediately down-sampled to keep SVC training fast.
train_df = pd.read_csv("../res/train_final.csv").sample(n=500, random_state=SAMPLE_SEED)
val_df = pd.read_csv("../res/val_final.csv").sample(n=63, random_state=SAMPLE_SEED)
test_df = pd.read_csv("../res/test_final.csv").sample(n=63, random_state=SAMPLE_SEED)

# Kept as the last expression of the cell so the preview is actually rendered.
train_df.head()

NameError: name 'pd' is not defined

In [None]:
# Organize targets
# Label columns are selected by position in train_df: columns 1-4 are taken as
# the subtask 1 + 2 block (presumably three subtask 1 flags followed by PRFN),
# everything from column 5 on as subtask 3.
# NOTE: run this before the "subtask1" column is appended to train_df,
# otherwise columns[5:] would pick that column up as well.
t1a2_headers = train_df.columns[1:5]
t1_headers = t1a2_headers[:-1]
t2_headers = ["PRFN"]
t3_headers = train_df.columns[5:]

for task_number, task_headers in enumerate((t1_headers, t2_headers, t3_headers), start=1):
    print(f"subtask {task_number} headers:")
    print(task_headers)

In [None]:
# Make subtask 1 treat its labels as mutually exclusive by converting one hot encoded
# HATE NOT OFFN into one column
def det_class(vals):
    """Translate a one-hot style sequence into its subtask 1 label string.

    Args:
        vals: indexable sequence; position 0 is the HATE flag and position 1
            the NOT flag. Any further entries are never inspected.

    Returns:
        "HATE" if the first flag equals 1, otherwise "NOT" if the second flag
        equals 1, otherwise "OFFN".
    """
    if vals[0] == 1:
        return "HATE"
    # The second flag is only looked at once HATE has been ruled out.
    return "NOT" if vals[1] == 1 else "OFFN"
# Add the collapsed single-label "subtask1" column to every split.
# Columns 1-3 are read with .iloc: plain row[1] on a label-indexed row falls
# back to positional lookup, which pandas 2.x deprecates.
for split_df in (train_df, val_df, test_df):
    split_df["subtask1"] = split_df.apply(
        lambda row: det_class([row.iloc[1], row.iloc[2], row.iloc[3]]), axis=1
    )


<h2>Model Creation</h2>

<h4> Subtask 1 - OVR wrapper w/ ngrams</h4>

In [None]:
# Subtask 1 pipeline: TF-IDF over 1- to 3-grams (English stop words removed)
# feeding a one-vs-rest wrapper around a linear-kernel SVC.
tfidf_step = TfidfVectorizer(stop_words="english", ngram_range=(1,3))
svc_step = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))
ovr_classifier = make_pipeline(tfidf_step, svc_step)

In [None]:
# Train on the raw training text against the collapsed subtask 1 label.
train_texts = train_df["text"]
train_labels = train_df["subtask1"]
t1_classifier = ovr_classifier.fit(train_texts, train_labels)
t1_classifier

In [None]:
# Evaluate on the held-out test split.
# accuracy_score is documented as (y_true, y_pred); ground truth goes first.
predictions_t1 = t1_classifier.predict(test_df["text"])
t1_score = accuracy_score(test_df["subtask1"], predictions_t1)
print("SVM subtask 1 Accuracy Score -> ",t1_score)

<h5>Export Model</h5>

In [None]:
# Persist the fitted subtask 1 n-gram pipeline under out_directory.
filename = "t1ngram.joblib.z"
path = f"{out_directory}{filename}"
dump(t1_classifier, path)

<h4> Subtask 1 - OVR wrapper </h4>

In [None]:
# Baseline subtask 1 pipeline: TF-IDF with the vectorizer's default n-gram
# range (English stop words removed) into a one-vs-rest linear-kernel SVC.
vectorizer = TfidfVectorizer(stop_words="english")
estimator = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))
ovr_classifier = make_pipeline(vectorizer, estimator)

In [None]:
# Fit the baseline pipeline; this rebinds t1_classifier, replacing the
# n-gram model trained earlier.
features = train_df["text"]
target = train_df["subtask1"]
t1_classifier = ovr_classifier.fit(features, target)
t1_classifier

In [None]:
# Evaluate the baseline subtask 1 model on the test split.
# Arguments follow the documented accuracy_score(y_true, y_pred) order.
predictions_t1 = t1_classifier.predict(test_df["text"])
t1_score = accuracy_score(test_df["subtask1"], predictions_t1)
print("SVM subtask 1 Accuracy Score -> ",t1_score)

<h5>Export Model</h5>

In [None]:
# Persist the fitted baseline subtask 1 pipeline under out_directory.
filename = "t1.joblib.z"
path = "".join((out_directory, filename))
dump(t1_classifier, path)

<h4>Subtask 2 w/ ngram</h4>

In [None]:
# Subtask 2 pipeline: 1- to 3-gram TF-IDF into a plain linear-kernel SVC.
# The variable keeps the name `ovr_classifier` although no one-vs-rest
# wrapper is used here; the following fit cell looks it up under that name.
ngram_tfidf = TfidfVectorizer(stop_words="english", ngram_range=(1,3))
linear_svc = svm.SVC(kernel='linear', probability=True)
ovr_classifier = make_pipeline(ngram_tfidf, linear_svc)

In [None]:
# Train the subtask 2 model.
# The target is passed as a 1-D Series: indexing with the list `t2_headers`
# would yield a one-column DataFrame, i.e. a column-vector y that SVC only
# accepts with a DataConversionWarning.
t2_classifier = ovr_classifier.fit(X=train_df["text"], y=train_df[t2_headers[0]])
t2_classifier

In [None]:
# Evaluate subtask 2 on the test split.
# Ground truth is a 1-D Series and comes first, matching the documented
# accuracy_score(y_true, y_pred) signature.
predictions_t2 = t2_classifier.predict(test_df["text"])
t2_score = accuracy_score(test_df[t2_headers[0]], predictions_t2)
print("SVM subtask 2 Accuracy Score -> ",t2_score)

<h5>Export Model</h5>

In [None]:
# Persist the fitted subtask 2 n-gram pipeline.
# This must dump t2_classifier: dumping t1_classifier here would store a
# subtask 1 model under the subtask 2 file name.
filename = "t2ngram.joblib.z"
path = out_directory + filename
dump(t2_classifier, path)

<h4>Subtask 2</h4>

In [None]:
# Baseline subtask 2 pipeline: TF-IDF with the default n-gram range into a
# plain linear-kernel SVC (no one-vs-rest wrapper despite the variable name).
text_vectorizer = TfidfVectorizer(stop_words="english")
binary_svc = svm.SVC(kernel='linear', probability=True)
ovr_classifier = make_pipeline(text_vectorizer, binary_svc)

In [None]:
# Train the baseline subtask 2 model; this rebinds t2_classifier.
# y is the 1-D PRFN Series rather than the one-column DataFrame produced by
# train_df[t2_headers], which avoids a column-vector DataConversionWarning.
t2_classifier = ovr_classifier.fit(X=train_df["text"], y=train_df[t2_headers[0]])
t2_classifier

In [None]:
# Evaluate the baseline subtask 2 model on the test split.
# accuracy_score takes (y_true, y_pred); the truth is passed as a 1-D Series.
predictions_t2 = t2_classifier.predict(test_df["text"])
t2_score = accuracy_score(test_df[t2_headers[0]], predictions_t2)
print("SVM subtask 2 Accuracy Score -> ",t2_score)

<h5>Export Model</h5>

In [None]:
# Persist the fitted baseline subtask 2 pipeline.
# This must dump t2_classifier: dumping t1_classifier here would store a
# subtask 1 model under the subtask 2 file name.
filename = "t2.joblib.z"
path = out_directory + filename
dump(t2_classifier, path)

<h4>Subtask 3 - One-vs-Rest w/ ngrams</h4>

In [None]:
# Subtask 3 (multi-label) pipeline: 1- to 3-gram TF-IDF into a one-vs-rest
# wrapper around a linear-kernel SVC.
ngram_vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1,3))
ovr_svc = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))
ovr_classifier = make_pipeline(ngram_vectorizer, ovr_svc)

In [None]:
# Train against all subtask 3 label columns at once (a multi-column target).
train_texts = train_df["text"]
train_targets = train_df[t3_headers]
t3_classifier = ovr_classifier.fit(train_texts, train_targets)
t3_classifier

In [None]:
# Evaluate the subtask 3 one-vs-rest n-gram model on the test split.
# accuracy_score takes (y_true, y_pred). The printed label now names
# subtask 3; it previously said "subtask 1 and 2".
predictions_t3 = t3_classifier.predict(test_df["text"])
t3_score = accuracy_score(test_df[t3_headers], predictions_t3)
print("SVM subtask 3 OVR Accuracy Score -> ",t3_score)

<h5>Export Model</h5>

In [None]:
# Persist the fitted subtask 3 one-vs-rest n-gram pipeline under out_directory.
filename = "t3ovr_ngram.joblib.z"
path = f"{out_directory}{filename}"
dump(t3_classifier, path)

<h4>Subtask 3 - One-vs-Rest</h4>

In [None]:
# Baseline subtask 3 pipeline: TF-IDF with the default n-gram range into a
# one-vs-rest wrapper around a linear-kernel SVC.
plain_tfidf = TfidfVectorizer(stop_words="english")
wrapped_svc = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))
ovr_classifier = make_pipeline(plain_tfidf, wrapped_svc)

In [None]:
# Fit the baseline one-vs-rest model; this rebinds t3_classifier, replacing
# the n-gram variant trained earlier.
features = train_df["text"]
label_matrix = train_df[t3_headers]
t3_classifier = ovr_classifier.fit(features, label_matrix)
t3_classifier

In [None]:
# Evaluate the baseline subtask 3 one-vs-rest model on the test split.
# accuracy_score takes (y_true, y_pred). The printed label now names
# subtask 3; it previously said "subtask 1 and 2".
predictions_t3 = t3_classifier.predict(test_df["text"])
t3_score = accuracy_score(test_df[t3_headers], predictions_t3)
print("SVM subtask 3 OVR Accuracy Score -> ",t3_score)

<h5>Export Model</h5>

In [None]:
# Persist the fitted baseline subtask 3 one-vs-rest pipeline under out_directory.
filename = "t3ovr.joblib.z"
path = "".join((out_directory, filename))
dump(t3_classifier, path)

<h4>Subtask 3 - Binary Relevance</h4>

In [None]:
# Subtask 3 pipeline using scikit-multilearn's BinaryRelevance transform
# around a linear-kernel SVC. The variable is still called `ovr_classifier`
# because the following fit cell looks it up under that name.
tfidf_features = TfidfVectorizer(stop_words="english")
binary_relevance = BinaryRelevance(svm.SVC(kernel='linear', probability=True))
ovr_classifier = make_pipeline(tfidf_features, binary_relevance)

In [None]:
# Train the Binary Relevance model on all subtask 3 label columns.
train_texts = train_df["text"]
train_targets = train_df[t3_headers]
t3bc_classifier = ovr_classifier.fit(train_texts, train_targets)
t3bc_classifier

In [None]:
# Evaluate the Binary Relevance model on the test split.
# Ground truth goes first, per the documented accuracy_score(y_true, y_pred).
predictions_t3bc = t3bc_classifier.predict(test_df["text"])
t3bc_score = accuracy_score(test_df[t3_headers], predictions_t3bc)
print("SVM subtask 3 BC Accuracy Score -> ",t3bc_score)

<h5>Export Model</h5>

In [None]:
# Persist the fitted Binary Relevance pipeline under out_directory.
filename = "t3bc.joblib.z"
path = f"{out_directory}{filename}"
dump(t3bc_classifier, path)

<h4>Subtask 3 - Classifier Chain</h4>

In [None]:
# Subtask 3 pipeline using scikit-multilearn's ClassifierChain transform
# around a linear-kernel SVC (variable name kept for the following fit cell).
tfidf_features = TfidfVectorizer(stop_words="english")
chained_svc = ClassifierChain(svm.SVC(kernel='linear', probability=True))
ovr_classifier = make_pipeline(tfidf_features, chained_svc)

In [None]:
# Train the Classifier Chain model on all subtask 3 label columns.
features = train_df["text"]
label_matrix = train_df[t3_headers]
t3cc_classifier = ovr_classifier.fit(features, label_matrix)
t3cc_classifier

In [None]:
# Evaluate the Classifier Chain model on the test split.
# Ground truth goes first, per the documented accuracy_score(y_true, y_pred).
predictions_t3cc = t3cc_classifier.predict(test_df["text"])
t3cc_score = accuracy_score(test_df[t3_headers], predictions_t3cc)
print("SVM subtask 3 CC Accuracy Score -> ",t3cc_score)

<h5>Export Model</h5>

In [None]:
# Persist the fitted Classifier Chain pipeline under out_directory.
filename = "t3cc.joblib.z"
path = "".join((out_directory, filename))
dump(t3cc_classifier, path)

<h4>Subtask 3 - Label Powerset</h4>

In [None]:
# Subtask 3 pipeline using scikit-multilearn's LabelPowerset transform around
# a linear-kernel SVC. Unlike the other pipelines in this notebook, the
# vectorizer here is built with no stop-word list.
default_tfidf = TfidfVectorizer()
powerset_svc = LabelPowerset(svm.SVC(kernel='linear', probability=True))
ovr_classifier = make_pipeline(default_tfidf, powerset_svc)

In [None]:
# Train the Label Powerset model on all subtask 3 label columns.
train_texts = train_df["text"]
train_targets = train_df[t3_headers]
t3lp_classifier = ovr_classifier.fit(train_texts, train_targets)
t3lp_classifier

In [None]:
# Evaluate the Label Powerset model on the test split.
# Ground truth goes first, per the documented accuracy_score(y_true, y_pred).
predictions_t3lp = t3lp_classifier.predict(test_df["text"])
t3lp_score = accuracy_score(test_df[t3_headers], predictions_t3lp)
print("SVM subtask 3 LP Accuracy Score -> ",t3lp_score)

<h5>Export Model</h5>

In [None]:
# Persist the fitted Label Powerset pipeline under out_directory.
filename = "t3lp.joblib.z"
path = f"{out_directory}{filename}"
dump(t3lp_classifier, path)

In [None]:
# Recap of the accuracies computed above, one line per model.
# t1_score, t2_score and t3_score are each assigned twice in this notebook
# (n-gram variant first, baseline second), so only the later value shows here.
labels = ["subtask 1", "subtask 2", "t3 OneVsRest", "t3 Binary Relevance", "t3 Classifier Chain", "t3 Label Powerset"]
scores = [t1_score, t2_score, t3_score, t3bc_score, t3cc_score, t3lp_score]
print("accuracies:")
for score, label in zip(scores, labels):
    print(f"{score} - {label}")