In [None]:
!pip3 install transformers
!pip3 install sentencepiece

In [72]:
import numpy as np
import pandas as pd
import torch
import transformers
from transformers import BertTokenizer
import torch.nn as nn
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
import sklearn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [39]:
# Dataset
class BoWDataset(torch.utils.data.Dataset):
    """Bag-of-words dataset built from the 'text' and 'label' columns of a split.

    Args:
        tokenizer: object exposing a ``tokenize`` method; it is used as the
            ``CountVectorizer`` tokenizer when a new vectoriser is fitted.
        input_set: mapping-like container (e.g. a DataFrame) with 'text' and
            'label' entries.
        vectoriser: optional, already-fitted vectoriser. When given, the texts
            are only transformed with it so that several splits share a single
            vocabulary; when omitted, a new ``CountVectorizer`` limited to
            10000 features is fitted on this split (the original behaviour).

    Attributes:
        X: the count matrix returned by the vectoriser (presumably a SciPy
            sparse matrix -- confirm against the scikit-learn docs).
        vectoriser: the vectoriser that produced ``X``.
    """

    def __init__(self, tokenizer, input_set, vectoriser=None):
        self.tokenizer = tokenizer
        self.texts = input_set['text']
        self.labels = input_set['label']

        if vectoriser is None:
            vectoriser = CountVectorizer(tokenizer=self.tokenizer.tokenize, max_features=10000)
            self.X = vectoriser.fit_transform(self.texts)
        else:
            # Reuse the supplied vocabulary instead of fitting a new one.
            self.X = vectoriser.transform(self.texts)
        self.vectoriser = vectoriser

    def __len__(self):
        """Return the number of examples in the split."""
        return len(self.texts)

    @staticmethod
    def _positional(values, idx):
        """Return the idx-th element of ``values`` by position.

        A pandas Series indexed with ``[]`` looks up by index *label*; once
        rows have been dropped the labels have gaps, so ``values[idx]`` can
        raise KeyError or return another row. ``.iloc`` is positional.
        """
        by_position = getattr(values, 'iloc', None)
        if by_position is None:
            return values[idx]
        return by_position[idx]

    def __getitem__(self, idx):
        """Return a dict with the idx-th feature row ('text') and its 'label'."""
        item = {
            'text': self.X[idx],
            'label': self._positional(self.labels, idx),
        }
        return item

In [42]:
# Experiment: Use BERT tokeniser or random tokeniser
# Loads the pretrained 'bert-base-cased' tokenizer (presumably the `tokenizer`
# argument intended for BoWDataset -- no call site is visible in this notebook).
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [33]:
# Read the three tab-separated splits in order; note that the frame named
# `test_df` comes from the file dev.tsv.
train_df, val_df, test_df = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('../data/train.tsv', '../data/valid.tsv', '../data/dev.tsv')
)

In [98]:
# Clean data: discard every row containing a missing value in each split,
# then report how many rows remain.
train_df, val_df, test_df = (split.dropna() for split in (train_df, val_df, test_df))

print(f"Train dataset size: {len(train_df)}")
print(f"Validation dataset size: {len(val_df)}")
print(f"Test dataset size: {len(test_df)}")

Train dataset size: 6700
Validation dataset size: 1675
Test dataset size: 2093


## 1. Naive Bayes Classifier (Preprocessed with BoW and TF-IDF)

In [102]:
# Token counts -> TF-IDF weighting -> multinomial Naive Bayes, chained so the
# whole pipeline can be fitted on raw strings.
naive_bayes_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
)

In [84]:
# Pull the training columns out as NumPy arrays (string texts, integer labels)
# and show the resulting dtypes.
train_X = train_df['text'].to_numpy(dtype='str')
train_Y = train_df['label'].to_numpy(dtype='int')
print(train_X.dtype, train_Y.dtype)

<U5493 int64


In [103]:
# Fit every pipeline step (vect -> tfidf -> clf) on the training texts and labels.
naive_bayes_clf.fit(train_X, train_Y)

In [96]:
# Extract the test texts and labels as NumPy arrays and confirm that both
# arrays have the same number of entries.
test_X = test_df['text'].to_numpy(dtype='str')
test_Y = test_df['label'].to_numpy(dtype='int')
print(len(test_X))
print(len(test_Y))

2093
2093


In [99]:
# Class balance of the test split: number of rows labelled 0, then labelled 1.
print(len(test_df.loc[test_df['label'] == 0]))
print(len(test_df.loc[test_df['label'] == 1]))

1894
199


In [115]:
# Predict a label for every test text with the fitted Naive Bayes pipeline.
predicted = naive_bayes_clf.predict(test_X)

In [116]:
# Sum of predicted labels versus sum of true labels; with 0/1 labels these
# are the counts of predicted and actual positives.
print(np.sum(predicted))
print(np.sum(test_Y))

0
199


In [117]:
# Evaluation Functions
# Based on https://github.com/Perez-AlmendrosC/dontpatronizeme/blob/master/semeval-2022/evaluation.py


def accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``.

    Args:
        y_true: array-like of reference labels.
        y_pred: array-like of predicted labels, same shape as ``y_true``.

    Returns:
        Number of matching entries divided by ``len(y_true)``.

    Raises:
        ValueError: if the two inputs do not have the same shape.
    """
    # Coerce to arrays first: comparing two plain Python lists with `==`
    # yields a single bool rather than an element-wise result.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )
    return np.sum(y_true == y_pred) / len(y_true)


def precision(y_true, y_pred):
    """Return scikit-learn's ``precision_score`` for the given labels."""
    score = precision_score(y_true=y_true, y_pred=y_pred)
    return score


def recall(y_true, y_pred):
    """Return scikit-learn's ``recall_score`` for the given labels."""
    score = recall_score(y_true=y_true, y_pred=y_pred)
    return score


def f1(y_true, y_pred):
    """Return scikit-learn's ``f1_score`` for the given labels."""
    score = f1_score(y_true=y_true, y_pred=y_pred)
    return score

In [118]:
# Report the four metrics for the Naive Bayes predictions on the test split.
print("Accuracy: ", accuracy(y_true=test_Y, y_pred=predicted))
print("Precision: ", precision(y_true=test_Y, y_pred=predicted))
print("Recall: ", recall(y_true=test_Y, y_pred=predicted))
print("F1: ", f1(y_true=test_Y, y_pred=predicted))

Accuracy:  0.904921165790731
Precision:  0.0
Recall:  0.0
F1:  0.0


  _warn_prf(average, modifier, msg_start, len(result))


## 2. Logistic Regression

In [119]:
from sklearn.linear_model import LogisticRegression


# Same count -> TF-IDF feature steps as the Naive Bayes pipeline, with a
# logistic-regression classifier as the final step.
logistic_regression_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', LogisticRegression()),
    ]
)

In [120]:
# Fit every pipeline step (vect -> tfidf -> clf) on the training texts and labels.
logistic_regression_clf.fit(train_X, train_Y)

In [121]:
# NOTE: rebinds `predicted`, replacing the Naive Bayes predictions stored under the same name.
predicted = logistic_regression_clf.predict(test_X)

In [122]:
# Evaluate Logistic Regression model.
print("Accuracy: ", accuracy(y_true=test_Y, y_pred=predicted))
print("Precision: ", precision(y_true=test_Y, y_pred=predicted))
print("Recall: ", recall(y_true=test_Y, y_pred=predicted))
print("F1: ", f1(y_true=test_Y, y_pred=predicted))

Accuracy:  0.904921165790731
Precision:  0.5
Recall:  0.01507537688442211
F1:  0.02926829268292683
