In [None]:
import joblib
import pandas as pd
import torch

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from prettytable import PrettyTable


from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.multiclass import OneVsRestClassifier
from torcheval.metrics import MultilabelAccuracy


## Loading data from before

In [None]:
# Restore the train/test splits that were persisted to disk earlier
# (presumably pandas DataFrames -- they are indexed by column name below).
# NOTE: joblib.load unpickles its input, so only point it at trusted files.
train_df, test_df = map(
    joblib.load,
    ('data/train_df.joblib', 'data/test_df.joblib'),
)

In [None]:
# Restore the text vectorizer together with the already-vectorised train and
# test text, in that order.
# NOTE: joblib.load unpickles its input, so only point it at trusted files.
vectorizer, vectorised_train_text, vectorised_test_text = map(
    joblib.load,
    (
        'data/vectorizer.joblib',
        'data/vectorised_train_text.joblib',
        'data/vectorised_test_text.joblib',
    ),
)

In [None]:
def model_metrics(test_labels, predictions):
    """Compute evaluation metrics for multilabel predictions.

    Args:
        test_labels: Ground-truth labels, one entry per sample. Must be
            accepted by both scikit-learn's metric functions and
            ``torch.Tensor(...)`` (assumed to be 0/1 indicator rows --
            TODO confirm against the label encoding step).
        predictions: Predicted labels in the same layout as ``test_labels``.

    Returns:
        dict mapping metric name to a Python float, with keys
        ``'accuracy'``, ``'exact_match_accuracy'``, ``'overlap_accuracy'``,
        ``'macro_f1'`` and ``'micro_f1'``.
    """
    # For multilabel input scikit-learn's accuracy_score is subset accuracy,
    # so it is expected to agree with 'exact_match_accuracy' up to float32
    # rounding in the torcheval result.
    accuracy = accuracy_score(test_labels, predictions)

    # Convert each input once and share the tensors between both torcheval
    # metrics instead of rebuilding them for every update() call.
    target_tensor = torch.Tensor(test_labels)
    input_tensor = torch.Tensor(predictions)

    exact_accuracy = MultilabelAccuracy(criteria='exact_match')
    exact_accuracy.update(target=target_tensor, input=input_tensor)

    overlap_accuracy = MultilabelAccuracy(criteria='overlap')
    overlap_accuracy.update(target=target_tensor, input=input_tensor)

    macro_f1 = f1_score(test_labels, predictions, average='macro')
    micro_f1 = f1_score(test_labels, predictions, average='micro')

    return {
        'accuracy': accuracy,
        # compute() returns a tensor; .item() unwraps it to a plain float.
        'exact_match_accuracy': exact_accuracy.compute().detach().item(),
        'overlap_accuracy': overlap_accuracy.compute().detach().item(),
        'macro_f1': macro_f1,
        'micro_f1': micro_f1,
    }

In [None]:
def pretty_table(dict):
    """Print a mapping as a two-column ``metric`` / ``value`` text table.

    Args:
        dict: Mapping whose items become the table rows, in iteration
            order. The parameter name shadows the builtin inside this
            function; it is kept so existing keyword callers still work.

    Returns:
        None. The rendered table is written to stdout.
    """
    report = PrettyTable(['metric', 'value'])
    for name, score in dict.items():
        report.add_row([name, score])
    print(report)

# SVM model
**SVM classifiers do not support multilabel classification natively, so we have to use one-vs-rest or one-vs-one.**
- https://machinelearningmastery.com/one-vs-rest-and-one-vs-one-for-multi-class-classification/

## Category classification

In [None]:
# One-vs-rest linear SVM for the category labels, with the per-label
# classifiers trained in parallel (n_jobs=-1).
# LinearSVC's solver uses a random number generator, so the seed is pinned to
# make the reported scores reproducible across Restart & Run All.
svmClassifier_category = OneVsRestClassifier(LinearSVC(random_state=42), n_jobs=-1)
svmClassifier_category.fit(vectorised_train_text, train_df['category_labels'].to_list())

# Score the held-out test set and print the metric table.
svmPreds_category = svmClassifier_category.predict(vectorised_test_text)
svm_category_metrics = model_metrics(test_df['category_labels'].to_list(), svmPreds_category)

pretty_table(svm_category_metrics)

+----------------------+--------------------+
|        metric        |       value        |
+----------------------+--------------------+
|       accuracy       | 0.684913217623498  |
| exact_match_accuracy | 0.6849132180213928 |
|   overlap_accuracy   | 0.8264352679252625 |
|       macro_f1       | 0.7949338277643552 |
|       micro_f1       | 0.8189066059225513 |
+----------------------+--------------------+


## Sentiment classification

In [None]:
# One-vs-rest linear SVM for the polarity (sentiment) labels, with the
# per-label classifiers trained in parallel (n_jobs=-1).
# LinearSVC's solver uses a random number generator, so the seed is pinned to
# make the reported scores reproducible across Restart & Run All.
svmClassifier_polarity = OneVsRestClassifier(LinearSVC(random_state=42), n_jobs=-1)
svmClassifier_polarity.fit(vectorised_train_text, train_df['polarity_labels'].to_list())

# Score the held-out test set and print the metric table.
svmPreds_polarity = svmClassifier_polarity.predict(vectorised_test_text)
svm_polarity_metrics = model_metrics(test_df['polarity_labels'].to_list(), svmPreds_polarity)

pretty_table(svm_polarity_metrics)

+----------------------+--------------------+
|        metric        |       value        |
+----------------------+--------------------+
|       accuracy       | 0.6862483311081442 |
| exact_match_accuracy | 0.6862483024597168 |
|   overlap_accuracy   | 0.7383177280426025 |
|       macro_f1       | 0.6153727514585684 |
|       micro_f1       | 0.7510148849797024 |
+----------------------+--------------------+


## Combined classification

In [None]:
# One-vs-rest linear SVM for the joint labels, with the per-label classifiers
# trained in parallel (n_jobs=-1).
# LinearSVC's solver uses a random number generator, so the seed is pinned to
# make the reported scores reproducible across Restart & Run All.
svmClassifier_joint = OneVsRestClassifier(LinearSVC(random_state=42), n_jobs=-1)
svmClassifier_joint.fit(vectorised_train_text, train_df['joint_labels'].to_list())

# Score the held-out test set and print the metric table.
svmPreds_joint = svmClassifier_joint.predict(vectorised_test_text)
svm_joint_metrics = model_metrics(test_df['joint_labels'].to_list(), svmPreds_joint)

pretty_table(svm_joint_metrics)

+----------------------+--------------------+
|        metric        |       value        |
+----------------------+--------------------+
|       accuracy       | 0.4205607476635514 |
| exact_match_accuracy | 0.420560747385025  |
|   overlap_accuracy   | 0.5447263121604919 |
|       macro_f1       | 0.3609738235138345 |
|       micro_f1       | 0.6040868454661558 |
+----------------------+--------------------+


# Gradient Boosting

## Category classification

In [None]:
# One-vs-rest gradient boosting for the category labels.
# random_state pins the boosting trees' randomness so the reported scores are
# reproducible; n_jobs=-1 trains the per-label boosters in parallel, matching
# the SVM cells in this notebook.
boostingClassifier_category = OneVsRestClassifier(
    GradientBoostingClassifier(random_state=42),
    n_jobs=-1,
)
boostingClassifier_category.fit(vectorised_train_text, train_df['category_labels'].to_list())

# Score the held-out test set and print the metric table.
boostingPreds_category = boostingClassifier_category.predict(vectorised_test_text)
boosting_category_metrics = model_metrics(test_df['category_labels'].to_list(), boostingPreds_category)

pretty_table(boosting_category_metrics)

+----------------------+--------------------+
|        metric        |       value        |
+----------------------+--------------------+
|       accuracy       | 0.514018691588785  |
| exact_match_accuracy | 0.514018714427948  |
|   overlap_accuracy   | 0.643524706363678  |
|       macro_f1       | 0.7425315862066151 |
|       micro_f1       | 0.733374766935985  |
+----------------------+--------------------+


## Sentiment classification

In [None]:
# One-vs-rest gradient boosting for the polarity (sentiment) labels.
# random_state pins the boosting trees' randomness so the reported scores are
# reproducible; n_jobs=-1 trains the per-label boosters in parallel, matching
# the SVM cells in this notebook.
boostingClassifier_polarity = OneVsRestClassifier(
    GradientBoostingClassifier(random_state=42),
    n_jobs=-1,
)
boostingClassifier_polarity.fit(vectorised_train_text, train_df['polarity_labels'].to_list())

# Score the held-out test set and print the metric table.
boostingPreds_polarity = boostingClassifier_polarity.predict(vectorised_test_text)
boosting_polarity_metrics = model_metrics(test_df['polarity_labels'].to_list(), boostingPreds_polarity)

pretty_table(boosting_polarity_metrics)

+----------------------+--------------------+
|        metric        |       value        |
+----------------------+--------------------+
|       accuracy       | 0.650200267022697  |
| exact_match_accuracy | 0.6502002477645874 |
|   overlap_accuracy   | 0.6969292163848877 |
|       macro_f1       | 0.4345677354373006 |
|       micro_f1       | 0.7075471698113208 |
+----------------------+--------------------+


## Combined classification

In [None]:
# One-vs-rest gradient boosting for the joint labels.
# random_state pins the boosting trees' randomness so the reported scores are
# reproducible; n_jobs=-1 trains the per-label boosters in parallel, matching
# the SVM cells in this notebook.
boostingClassifier_joint = OneVsRestClassifier(
    GradientBoostingClassifier(random_state=42),
    n_jobs=-1,
)
boostingClassifier_joint.fit(vectorised_train_text, train_df['joint_labels'].to_list())

# Score the held-out test set and print the metric table.
boostingPreds_joint = boostingClassifier_joint.predict(vectorised_test_text)
boosting_joint_metrics = model_metrics(test_df['joint_labels'].to_list(), boostingPreds_joint)

pretty_table(boosting_joint_metrics)

+----------------------+---------------------+
|        metric        |        value        |
+----------------------+---------------------+
|       accuracy       | 0.27102803738317754 |
| exact_match_accuracy |  0.2710280418395996 |
|   overlap_accuracy   |  0.3898531496524811 |
|       macro_f1       | 0.28963520863345926 |
|       micro_f1       |  0.502415458937198  |
+----------------------+---------------------+
