# Imports

In [38]:
import os
import pandas as pd
import numpy as np
from collections import defaultdict
import re
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

# Define Naive Bayes Classifier

In [39]:
class NaiveBayes:
    """
    Naive Bayes classifier implementation.
    Parameters:
    - smoothing_factor (float): Smoothing factor for Laplace smoothing. Default is 1.0.
    Attributes:
    - smoothing_factor (float): Smoothing factor for Laplace smoothing.
    - class_priors (dict): Dictionary to store the prior probabilities of each class.
    - feature_likelihoods (dict): Dictionary to store the likelihood probabilities of each feature for each class.
    - classes (array-like): Array to store the unique classes.
    - num_features (int): Number of features in the dataset.
    Methods:
    - fit(X, y): Fit the Naive Bayes classifier to the training data.
    - predict(X): Predict the class labels for the given input data.
    - predict_instance(x): Predict the class label for a single instance.
    """
    def __init__(self, smoothing_factor=1.0):
        self.smoothing_factor = smoothing_factor
        self.class_priors = {}
        self.feature_likelihoods = {}
        self.classes = None
        self.num_features = None

    def fit(self, X, y):
        self.classes = np.unique(y)
        self.num_features = X.shape[1]
        n_samples = X.shape[0]

        for c in self.classes:
            self.class_priors[c] = np.log(np.sum(y == c) / n_samples)

        for c in self.classes:
            class_samples = X[y == c]
            feature_count = np.sum(class_samples, axis=0) + self.smoothing_factor
            total_count = np.sum(feature_count)
            self.feature_likelihoods[c] = np.log(feature_count / total_count)

    def predict(self, X):
        return np.array([self.predict_instance(x) for x in X])

    def predict_instance(self, x):
        scores = {}
        for c in self.classes:
            scores[c] = self.class_priors[c] + np.sum(x * self.feature_likelihoods[c])
        return max(scores, key=scores.get)

# Define BPE Tokenizer

In [40]:
class BPETokenizer():
    class BPETokenizer:
        """
        BPE Tokenizer class for training and tokenizing text using Byte Pair Encoding (BPE) algorithm.
        Args:
            sentences (list): List of sentences to train the tokenizer.
            vocab_size (int): The desired size of the vocabulary.
        Attributes:
            sentences (list): List of sentences to train the tokenizer.
            vocab_size (int): The desired size of the vocabulary.
            word_freqs (defaultdict): Dictionary to store the frequency of each word.
            splits (dict): Dictionary to store the splits of each word.
            merges (dict): Dictionary to store the merges of word pairs.
            vocab (set): Set to store the unique tokens in the vocabulary.
        Methods:
            train(): Trains the tokenizer by processing the sentences and learning merges.
            compute_pair_freqs(): Computes the frequency of each word pair in the training data.
            apply_merge(): Applies a merge operation to the splits of each word.
            tokenize(text): Tokenizes the given text using the learned merges.
        """

    def __init__(self, sentences, vocab_size):
        self.sentences = sentences
        self.vocab_size = vocab_size
        self.word_freqs = defaultdict(int)
        self.splits = {}
        self.merges = {}
        self.vocab = set()

    def train(self):
        for sentence in tqdm(self.sentences, desc="Processing sentences"):
            for x in range(0, len(sentence)-1):
                word = sentence[x]
                if x >= 1:
                    word = '_' + word
                self.word_freqs[word] += 1
                self.splits[word] = list(word)
                self.vocab.update(list(word))

        pbar = tqdm(total=self.vocab_size - len(self.vocab), desc="Learning merges")
        while len(self.vocab) < self.vocab_size:
            pair_freqs = self.compute_pair_freqs()
            if not pair_freqs:
                break

            best_pair = max(pair_freqs, key=pair_freqs.get)
            new_token = ''.join(best_pair)
            self.merges[best_pair] = new_token
            self.vocab.add(new_token)

            self.apply_merge(best_pair, new_token)
            pbar.update(1)

        pbar.close()
        return self.merges

    def compute_pair_freqs(self):
        pair_freqs = defaultdict(int)
        for word, freq in self.word_freqs.items():
            split = self.splits[word]
            if len(split) == 1:
                continue
            for i in range(len(split) - 1):
                pair = (split[i], split[i + 1])
                pair_freqs[pair] += freq
        return pair_freqs

    def apply_merge(self, pair, new_token):
        for word in self.splits:
            split = self.splits[word]
            new_split = []
            i = 0
            while i < len(split):
                if i < len(split) - 1 and (split[i], split[i + 1]) == pair:
                    new_split.append(new_token)
                    i += 2
                else:
                    new_split.append(split[i])
                    i += 1
            self.splits[word] = new_split

    def tokenize(self, text):
        text = text.replace(' ', '_ ')
        tokens = list(text)
        for pair, merge in self.merges.items():
            i = 0
            while i < len(tokens) - 1:
                if tokens[i] == pair[0] and tokens[i + 1] == pair[1]:
                    tokens[i:i+2] = [merge]
                else:
                    i += 1
        return tokens

# One hot encoding

In [41]:
def one_hot_vectorize(sentences, type2index):
    vocab_size = len(type2index)
    one_hot_sentences = []
    for sentence in tqdm(sentences, desc="Creating one-hot vectors"):
        one_hot_sentence = np.zeros(vocab_size)
        for word in sentence:
            if word in type2index:
                one_hot_sentence[type2index[word]] = 1
        one_hot_sentences.append(one_hot_sentence)
    one_hot_sentences = np.array(one_hot_sentences)
    return one_hot_sentences

# Helper functions

In [42]:
def train_classifier(X_train, y_train):
    print("Training classifier...")
    clf = NaiveBayes()
    clf.fit(X_train, y_train)
    return clf

def evaluate_model(clf, X_test, y_test):
    print("Evaluating model...")
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    return accuracy, report

def clean(text):
    #text = re.sub(r'[^\w\s]', '', text) # Remove special characters
    text = re.sub(r'http\S+', '[URL]', text) # Replace URLS with [URL]
    text = re.sub(r'\d+', '[NUM]', text) # Replace numbers with [NUM]
    text = text.strip() # Remove trailing spaces
    return text.lower()

def whitespace_tokenize(corpus):
    return [sentence.split() for sentence in corpus]

In [43]:
class whitespace_tokenizer:
    """
    A tokenizer that splits text into tokens based on whitespace.
    Args:
        corpus (list): A list of sentences to build the vocabulary from.
    Attributes:
        corpus (list): The input corpus of sentences.
        vocab (set): The set of unique words in the corpus.
    Methods:
        tokenize(text): Tokenizes the input text into a list of tokens.
    Example:
        corpus = ["Hello world", "This is a sentence"]
        tokenizer = whitespace_tokenizer(corpus)
        tokens = tokenizer.tokenize("Hello world")
        print(tokens)  # Output: ['Hello', 'world']
    """
    def __init__(self, corpus):
        self.corpus = corpus
        self.vocab = set()
        for sentence in corpus:
            for word in sentence.split():
                self.vocab.add(word)
        
    def tokenize(self, text):
        return text.split()

# Train and evaluate the model

In [44]:
def sentiment_analysis(train_df, dev_df, test_df, tokenizer):
    """
    Perform sentiment analysis using a given tokenizer on the provided datasets.
    Args:
        train_df (pandas.DataFrame): The training dataset containing 'text' and 'label' columns.
        dev_df (pandas.DataFrame): The development dataset containing 'text' and 'label' columns.
        test_df (pandas.DataFrame): The test dataset containing 'text' and 'label' columns.
        tokenizer: The tokenizer object used to tokenize the texts.
    Returns:
        tuple: A tuple containing the trained classifier and the type-to-index dictionary.
    Raises:
        None
    Example:
        train_df = pd.read_csv('train.csv')
        dev_df = pd.read_csv('dev.csv')
        test_df = pd.read_csv('test.csv')
        tokenizer = Tokenizer()
        clf, type2index = sentiment_analysis(train_df, dev_df, test_df, tokenizer)
    """
    # Create type2index dictionary
    type2index = {token: idx for idx, token in enumerate(tokenizer.vocab)}
    
    # Tokenize texts
    train_tokenized = train_df['text'].progress_apply(lambda x: tokenizer.tokenize(x))
    dev_tokenized = dev_df['text'].progress_apply(lambda x: tokenizer.tokenize(x))
    test_tokenized = test_df['text'].progress_apply(lambda x: tokenizer.tokenize(x))
    
    # Create feature vectors
    X_train = one_hot_vectorize(train_tokenized, type2index)
    X_dev = one_hot_vectorize(dev_tokenized, type2index)
    X_test = one_hot_vectorize(test_tokenized, type2index)
    
    # Prepare labels
    y_train = (train_df['label'] == 'positive').astype(int)
    y_dev = (dev_df['label'] == 'positive').astype(int)
    y_test = (test_df['label'] == 'positive').astype(int)
    
    # Train classifier
    clf = train_classifier(X_train, y_train)
    
    # Evaluate model on dev set
    dev_accuracy, dev_report = evaluate_model(clf, X_dev, y_dev)
    print("Dev Set Results:")
    print(f"Accuracy: {dev_accuracy}")
    print("Classification Report:")
    print(dev_report)
    
    return clf, type2index

# Prepare dataset

In [45]:
# Set up paths
PROJECT_DIR = os.getcwd() + '/afrisent-semeval-2023'
language = 'hau'
DATA_DIR = f'{PROJECT_DIR}/data/{language}'

# Load data
print("Loading data...")
train_df = pd.read_csv(f'{DATA_DIR}/train.tsv', sep='\t', names=['text', 'label'], header=0)
dev_df = pd.read_csv(f'{DATA_DIR}/dev.tsv', sep='\t', names=['text', 'label'], header=0)
test_df = pd.read_csv(f'{DATA_DIR}/test.tsv', sep='\t', names=['text', 'label'], header=0)

train_df = train_df[train_df['label'] != 'neutral']
dev_df = dev_df[dev_df['label'] != 'neutral']
test_df = test_df[test_df['label'] != 'neutral']

# Preprocess data
for df in [train_df, dev_df, test_df]:
    tqdm.pandas(desc="Cleaning text")
    df['text'] = df['text'].progress_apply(clean)
    
# Prepare corpus for BPE
train_corpus = train_df['text'].tolist()
tokenized_train_corpus = whitespace_tokenize(train_corpus)

Loading data...


Cleaning text: 100%|██████████| 9260/9260 [00:00<00:00, 223216.68it/s]
Cleaning text: 100%|██████████| 1781/1781 [00:00<00:00, 232364.55it/s]
Cleaning text: 100%|██████████| 3514/3514 [00:00<00:00, 306241.36it/s]


In [46]:
# White space tokenizer
whitespace_tokenizer = whitespace_tokenizer(train_corpus)

# Run sentiment analysis
clf, type2index = sentiment_analysis(train_df, dev_df, test_df, whitespace_tokenizer)

Cleaning text: 100%|██████████| 9260/9260 [00:00<00:00, 428613.66it/s]
Cleaning text: 100%|██████████| 1781/1781 [00:00<00:00, 592250.49it/s]
Cleaning text: 100%|██████████| 3514/3514 [00:00<00:00, 407148.74it/s]
Creating one-hot vectors: 100%|██████████| 9260/9260 [00:00<00:00, 13787.70it/s]
Creating one-hot vectors: 100%|██████████| 1781/1781 [00:00<00:00, 13487.85it/s]
Creating one-hot vectors: 100%|██████████| 3514/3514 [00:00<00:00, 12302.43it/s]


Training classifier...
Evaluating model...
Dev Set Results:
Accuracy: 0.8674901740595171
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.91      0.87       894
           1       0.90      0.83      0.86       887

    accuracy                           0.87      1781
   macro avg       0.87      0.87      0.87      1781
weighted avg       0.87      0.87      0.87      1781



In [47]:
# Train BPE
bpe = BPETokenizer(tokenized_train_corpus, vocab_size=1000)
merges = bpe.train()

# Run sentiment analysis
clf, type2index = sentiment_analysis(train_df, dev_df, test_df, bpe)

Training BPE tokenizer...


Processing sentences: 100%|██████████| 9260/9260 [00:00<00:00, 57303.12it/s]
Learning merges: 100%|██████████| 507/507 [00:30<00:00, 16.81it/s]
Cleaning text: 100%|██████████| 9260/9260 [00:21<00:00, 425.07it/s]
Cleaning text: 100%|██████████| 1781/1781 [00:04<00:00, 369.52it/s]
Cleaning text: 100%|██████████| 3514/3514 [00:07<00:00, 450.21it/s]
Creating one-hot vectors: 100%|██████████| 9260/9260 [00:00<00:00, 99741.79it/s]
Creating one-hot vectors: 100%|██████████| 1781/1781 [00:00<00:00, 139954.20it/s]
Creating one-hot vectors: 100%|██████████| 3514/3514 [00:00<00:00, 119777.85it/s]


Training classifier...
Evaluating model...
Dev Set Results:
Accuracy: 0.8512071869736103
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.91      0.86       894
           1       0.90      0.79      0.84       887

    accuracy                           0.85      1781
   macro avg       0.86      0.85      0.85      1781
weighted avg       0.86      0.85      0.85      1781



In [48]:
for x in [1000, 2000, 5000, 10000]:
    # Train BPE
    bpe = BPETokenizer(tokenized_train_corpus, vocab_size=x)
    merges = bpe.train()

    # Run sentiment analysis
    clf, type2index = sentiment_analysis(train_df, dev_df, test_df, bpe)

Training BPE tokenizer...


Processing sentences: 100%|██████████| 9260/9260 [00:00<00:00, 61701.91it/s]
Learning merges: 100%|██████████| 507/507 [00:33<00:00, 15.33it/s]
Cleaning text: 100%|██████████| 9260/9260 [00:21<00:00, 430.34it/s]
Cleaning text: 100%|██████████| 1781/1781 [00:04<00:00, 391.47it/s]
Cleaning text: 100%|██████████| 3514/3514 [00:07<00:00, 475.37it/s]
Creating one-hot vectors: 100%|██████████| 9260/9260 [00:00<00:00, 112109.29it/s]
Creating one-hot vectors: 100%|██████████| 1781/1781 [00:00<00:00, 115635.53it/s]
Creating one-hot vectors: 100%|██████████| 3514/3514 [00:00<00:00, 132554.94it/s]


Training classifier...
Evaluating model...
Dev Set Results:
Accuracy: 0.8512071869736103
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.91      0.86       894
           1       0.90      0.79      0.84       887

    accuracy                           0.85      1781
   macro avg       0.86      0.85      0.85      1781
weighted avg       0.86      0.85      0.85      1781

Training BPE tokenizer...


Processing sentences: 100%|██████████| 9260/9260 [00:00<00:00, 48858.40it/s]
Learning merges: 100%|██████████| 1507/1507 [01:35<00:00, 15.71it/s]
Cleaning text: 100%|██████████| 9260/9260 [00:59<00:00, 154.49it/s]
Cleaning text: 100%|██████████| 1781/1781 [00:12<00:00, 142.62it/s]
Cleaning text: 100%|██████████| 3514/3514 [00:20<00:00, 170.92it/s]
Creating one-hot vectors: 100%|██████████| 9260/9260 [00:00<00:00, 84648.83it/s]
Creating one-hot vectors: 100%|██████████| 1781/1781 [00:00<00:00, 117147.94it/s]
Creating one-hot vectors: 100%|██████████| 3514/3514 [00:00<00:00, 129502.28it/s]


Training classifier...
Evaluating model...
Dev Set Results:
Accuracy: 0.8500842223469961
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.90      0.86       894
           1       0.89      0.80      0.84       887

    accuracy                           0.85      1781
   macro avg       0.85      0.85      0.85      1781
weighted avg       0.85      0.85      0.85      1781

Training BPE tokenizer...


Processing sentences: 100%|██████████| 9260/9260 [00:00<00:00, 65444.68it/s]
Learning merges: 100%|██████████| 4507/4507 [04:57<00:00, 15.17it/s]
Cleaning text: 100%|██████████| 9260/9260 [02:50<00:00, 54.28it/s]
Cleaning text: 100%|██████████| 1781/1781 [00:35<00:00, 49.90it/s]
Cleaning text: 100%|██████████| 3514/3514 [00:59<00:00, 59.55it/s]
Creating one-hot vectors: 100%|██████████| 9260/9260 [00:00<00:00, 49433.37it/s]
Creating one-hot vectors: 100%|██████████| 1781/1781 [00:00<00:00, 44741.86it/s]
Creating one-hot vectors: 100%|██████████| 3514/3514 [00:00<00:00, 57428.46it/s]


Training classifier...
Evaluating model...
Dev Set Results:
Accuracy: 0.8618753509264458
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.91      0.87       894
           1       0.90      0.81      0.85       887

    accuracy                           0.86      1781
   macro avg       0.87      0.86      0.86      1781
weighted avg       0.87      0.86      0.86      1781

Training BPE tokenizer...


Processing sentences: 100%|██████████| 9260/9260 [00:00<00:00, 36310.38it/s]
Learning merges: 100%|██████████| 9507/9507 [10:04<00:00, 15.72it/s]
Cleaning text: 100%|██████████| 9260/9260 [05:21<00:00, 28.76it/s]
Cleaning text: 100%|██████████| 1781/1781 [01:06<00:00, 26.83it/s]
Cleaning text: 100%|██████████| 3514/3514 [01:59<00:00, 29.47it/s]
Creating one-hot vectors: 100%|██████████| 9260/9260 [00:00<00:00, 32696.55it/s]
Creating one-hot vectors: 100%|██████████| 1781/1781 [00:00<00:00, 27581.06it/s]
Creating one-hot vectors: 100%|██████████| 3514/3514 [00:00<00:00, 32785.71it/s]


Training classifier...
Evaluating model...
Dev Set Results:
Accuracy: 0.8663672094329029
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.91      0.87       894
           1       0.90      0.82      0.86       887

    accuracy                           0.87      1781
   macro avg       0.87      0.87      0.87      1781
weighted avg       0.87      0.87      0.87      1781

