# NLP_ Assignment 1

In [None]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from nltk import bigrams
from itertools import count
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [None]:
#Loading the Dataset
dataset = load_dataset('sst', trust_remote_code=True)

sst.py:   0%|          | 0.00/9.13k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/6.68k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.37M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/790k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8544 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1101 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2210 [00:00<?, ? examples/s]

# Prepare dataset

In [3]:
def map_labels(df):
    df['label'] = pd.cut(
        df['label'],
        bins=[-float('inf'), 0.2, 0.4, 0.6, 0.8, float('inf')],
        labels=[0, 1, 2, 3, 4]
    ).astype(int)

In [4]:
# Converting dataset into dataframs
train_df = pd.DataFrame(dataset['train']).drop('tree', axis=1)
test_df = pd.DataFrame(dataset['test']).drop('tree', axis=1)
validation_df = pd.DataFrame(dataset['validation']).drop('tree', axis=1)

# Mapping labels from real values to 5 classes
map_labels(train_df)
map_labels(test_df)
map_labels(validation_df)

In [5]:
train_df.head()

Unnamed: 0,sentence,label,tokens
0,The Rock is destined to be the 21st Century 's...,3,The|Rock|is|destined|to|be|the|21st|Century|'s...
1,The gorgeously elaborate continuation of `` Th...,4,The|gorgeously|elaborate|continuation|of|``|Th...
2,Singer\/composer Bryan Adams contributes a sle...,3,Singer\/composer|Bryan|Adams|contributes|a|sle...
3,You 'd think by now America would have had eno...,2,You|'d|think|by|now|America|would|have|had|eno...
4,Yet the act is still charming here .,3,Yet|the|act|is|still|charming|here|.


# Part 1: Naïve Bayes

In this implementation we apply vocab filtering, to remove stopwords which doesnt have effect in context. Considered as noise reduction in the dataset.

Also removing words that rarely appear (Less than 3 times) as it doesnt contribute much to the classification.

In [6]:
STOPWORDS = set(["the", "is", "in", "and", "to", "of", "a", "that", "it", "on", "for", "with", "as", "by", "at", "from"])
MIN_FREQUENCY = 3

def get_filtered_vocab(df):
    all_words = []
    for text in df['sentence']:
        all_words.extend(text.split(" "))


    all_words = np.array(all_words)
    unique_words, counts = np.unique(all_words, return_counts=True)
    vocab = {word for word, count in zip(unique_words, counts) if count >= MIN_FREQUENCY and word not in STOPWORDS}

    return vocab



def get_words_count(df):
    word_count = {}
    for text in df['sentence']:
        for word in text.split(" "):
            if word in word_count:
                word_count[word] += 1
            else:
                word_count[word] = 1
    return word_count



def train_naive_bayes(train_df):
    n_doc = train_df.shape[0]
    vocab = get_filtered_vocab(train_df)
    log_prior = []
    log_likelihood = [{} for _ in range(5)]


    for c in range(5):
        # Calculate p(c)
        n_c = train_df[train_df['label'] == c].shape[0]
        log_prior.append(np.log(n_c / n_doc))

        # Calculate p(w | c)
        words_count_in_c = get_words_count(train_df[train_df['label'] == c])
        all_words_count = sum(words_count_in_c.values())

        for w in vocab:
            # Apply Laplace smoothing with a value of 0.6
            log_likelihood[c][w] = np.log((words_count_in_c.get(w, 0) + 0.6) / (all_words_count + len(vocab)))

    return log_prior, log_likelihood, vocab



def test_naive_bayes(test_doc, log_prior, log_likelihood, vocab):
    sum = []
    for c in range(5):
        sum.append(log_prior[c])
        for w in test_doc.split(" "):
            if w in vocab:
                sum[c] += log_likelihood[c][w]

    return np.argmax(sum)


In [7]:
log_prior, log_likelihood, vocab = train_naive_bayes(train_df)

print(test_naive_bayes("I love this movie", log_prior, log_likelihood, vocab))
print(test_naive_bayes("I hate this movie", log_prior, log_likelihood, vocab))

3
1


## Naive Bayes Accuracy

By tunning the smoothing parameter alpha and the MIN_FREQUENCY threshold we get accuracy exactly like Scikit-learn.

In [8]:
count = 0

for i in range(test_df.shape[0]):
  if test_naive_bayes(test_df.iloc[i]['sentence'], log_prior, log_likelihood, vocab) == test_df.iloc[i]['label']:
    count += 1

print("Accuracy: ", count / test_df.shape[0])

Accuracy:  0.4090497737556561


## Scikit Learn Accuracy for Naive Bayes

In [9]:
pipeline = Pipeline([

    ('vectorizer', CountVectorizer()),

    ('classifier', MultinomialNB())

])


pipeline.fit(train_df['sentence'], train_df['label'])
predictions = pipeline.predict(test_df['sentence'])
accuracy = accuracy_score(test_df['label'], predictions)

print("Accuracy:", accuracy)

Accuracy: 0.4090497737556561


# Part 2: Logistic Regression

In [10]:
training_set =  dataset['train']
validation_set = dataset['validation']
test_set = dataset['test']



training_labels = np.array(training_set['label'])
validation_labels = np.array(validation_set['label'])
test_labels = np.array(test_set['label'])

In [13]:
bigram_map = {}
index_counter = count(0)

for entry in training_set:
    sentence = entry['sentence']
    # Generate bigrams from the sentence
    for bigram in bigrams(sentence.split()):
        # If the bigram is not already in the map, add it with a new index
        if bigram not in bigram_map:
            bigram_map[bigram] = next(index_counter)

In [14]:
vec_size = len(bigram_map)
print(vec_size)

87249


In [15]:
def Features_extractor(sentence):
    vector = np.zeros(vec_size)
    for bigram in bigrams(sentence.split()):
        # If the bigram is not already in the map, add it with a new index
        if bigram in bigram_map:
            vector[bigram_map[bigram]] = 1

    return vector

def Data_Generator(dataset):
    vectorized_set = np.zeros((vec_size, len(dataset)))
    for i, entry in enumerate(dataset):
        sentence = entry['sentence']
        vector = Features_extractor(sentence)  # Get the feature vector
        vectorized_set[:, i] = vector  # Assign vector to the i-th column of vectorized_set

    return vectorized_set



def Labels_encoder(Y_labels, K):
    class_labels = np.digitize(Y_labels, [0.2, 0.4, 0.6, 0.8, 1.0]) - 1
    N = class_labels.shape[0]
    Y_one_hot = np.zeros((K, N))
    Y_one_hot[class_labels, np.arange(N)] = 1
    return Y_one_hot

In [16]:
vectorized_train = Data_Generator(training_set)
vectorized_validation = Data_Generator(validation_set)
vectorized_test = Data_Generator(test_set)

In [17]:
print(vectorized_train.shape)
print(vectorized_validation.shape)
print(vectorized_test.shape)

(87249, 8544)
(87249, 1101)
(87249, 2210)


In [18]:
one_hot_train_labels = Labels_encoder(training_labels, 5)
one_hot_validation_labels = Labels_encoder(validation_labels, 5)
one_hot_test_labels = Labels_encoder(test_labels, 5)

In [19]:
def Parameters_initializer(K, F):
    W = np.random.randn(K, F) * 0.01
    b = np.zeros(K)
    return W, b


def softmax(z):
    exp_z = np.exp(z - np.max(z, axis=0, keepdims=True))
    return exp_z / np.sum(exp_z, axis=0, keepdims=True)


def cross_entropy_loss(Y, Y_hat):
    return -np.mean(np.sum(Y * np.log(Y_hat + 1e-12), axis=0))

def accuracy(Y_pred, Y_true):
    return np.mean(Y_pred == Y_true) * 100

In [20]:
def Train(X, Y, epochs, K, F, lr, batch_size, X_val=None, Y_val=None):
    W, b = Parameters_initializer(K, F)
    N = X.shape[1]

    for epoch in range(epochs):
        indices = np.arange(N)
        np.random.shuffle(indices)
        X_shuffled = X[:, indices]
        Y_shuffled = Y[:, indices]

        epoch_loss = 0
        num_batches = 0

        for i in range(0, N, batch_size):
            batch_indices = indices[i:i + batch_size]
            X_batch = X_shuffled[:, batch_indices]
            Y_batch = Y_shuffled[:, batch_indices]

            # Forward propagation
            Z = np.dot(W, X_batch) + b[:, np.newaxis]
            Y_hat = softmax(Z)

            # Compute loss for the batch
            loss = cross_entropy_loss(Y_batch, Y_hat)
            epoch_loss += loss
            num_batches += 1

            # Backward propagation
            dZ = Y_hat - Y_batch
            dW = np.dot(dZ, X_batch.T) / batch_size
            db = np.mean(dZ, axis=1)

            # Update parameters
            W -= lr * dW
            b -= lr * db

        avg_epoch_loss = epoch_loss / num_batches

        train_pred = np.argmax(np.dot(W, X) + b[:, np.newaxis], axis=0)
        train_true = np.argmax(Y, axis=0)
        train_accuracy = accuracy(train_pred, train_true)

        print(f'Epoch {epoch + 1}/{epochs}, Average Loss: {avg_epoch_loss:.4f}, Training Accuracy: {train_accuracy:.2f}%')


        if X_val is not None and Y_val is not None:
            val_pred = np.argmax(np.dot(W, X_val) + b[:, np.newaxis], axis=0)
            val_true = np.argmax(Y_val, axis=0)
            val_accuracy = accuracy(val_pred, val_true)

            print(f'Validation Accuracy: {val_accuracy:.2f}%')

    return W, b

In [21]:
Wo, Bo = Train(vectorized_train, one_hot_train_labels, 120, 5, vec_size, 0.1, 256, vectorized_validation, one_hot_validation_labels)

Epoch 1/120, Average Loss: 1.5898, Training Accuracy: 29.12%
Validation Accuracy: 26.52%
Epoch 2/120, Average Loss: 1.5659, Training Accuracy: 30.70%
Validation Accuracy: 26.25%
Epoch 3/120, Average Loss: 1.5543, Training Accuracy: 32.21%
Validation Accuracy: 26.43%
Epoch 4/120, Average Loss: 1.5460, Training Accuracy: 33.42%
Validation Accuracy: 26.98%
Epoch 5/120, Average Loss: 1.5388, Training Accuracy: 35.67%
Validation Accuracy: 28.16%
Epoch 6/120, Average Loss: 1.5320, Training Accuracy: 38.00%
Validation Accuracy: 29.16%
Epoch 7/120, Average Loss: 1.5252, Training Accuracy: 39.41%
Validation Accuracy: 30.25%
Epoch 8/120, Average Loss: 1.5177, Training Accuracy: 40.82%
Validation Accuracy: 31.24%
Epoch 9/120, Average Loss: 1.5103, Training Accuracy: 41.47%
Validation Accuracy: 31.24%
Epoch 10/120, Average Loss: 1.5043, Training Accuracy: 42.32%
Validation Accuracy: 31.97%
Epoch 11/120, Average Loss: 1.4986, Training Accuracy: 42.65%
Validation Accuracy: 32.15%
Epoch 12/120, Avera

# Part 3: Confusion Matrix & Evaluation Metrics

In [22]:
def confusion_matrix(y_true, y_pred):
  matrix = np.zeros((5, 5), dtype=int)
  for i in range(len(y_true)):
      matrix[y_true[i]][y_pred[i]] += 1
  return matrix


def evaluation(confusion_matrix):
  precision = np.diag(confusion_matrix) / np.sum(confusion_matrix, axis=0)
  recall = np.diag(confusion_matrix) / np.sum(confusion_matrix, axis=1)
  f1_score = 2 * (precision * recall) / (precision + recall)

  # calculating the average across classes (macro) nanmean handles NaN values
  macro_precision = np.nanmean(precision)
  macro_recall = np.nanmean(recall)
  macro_f1 = np.nanmean(f1_score)

  return precision, recall, f1_score, macro_precision, macro_recall, macro_f1


## 3.1 Naive Bayes Evaluation and Comparison with Scikit Learn

In [23]:
# Naive bayes evaluation
true_labels = []
predicted_labels = []

for i in range(test_df.shape[0]):
    true_label = test_df.iloc[i]['label']
    test_doc = test_df.iloc[i]['sentence']
    predicted_label = test_naive_bayes(test_doc, log_prior, log_likelihood, vocab)


    true_labels.append(true_label)
    predicted_labels.append(predicted_label)


cm = confusion_matrix(true_labels, predicted_labels)
precision, recall, f1_score, macro_precision, macro_recall, macro_f1 = evaluation(cm)

print("Confusion Matrix using our implementation:\n")
print(cm)
print("\nPer-Class-Metrics\nPrecision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)
print("\nMacro-Averaged Metrics\nMacro Precision:", macro_precision)
print("Macro Recall:", macro_recall)
print("Macro F1 Score:", macro_f1)


# Comparing with Scikit learn
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print("\nConfusion Matrix using Scikit learn:\n")

sk_cm = confusion_matrix(true_labels, predicted_labels)
print(sk_cm)

# Calculate per-class precision, recall, and F1 score
per_class_precision = precision_score(true_labels, predicted_labels, average=None)
per_class_recall = recall_score(true_labels, predicted_labels, average=None)
per_class_f1 = f1_score(true_labels, predicted_labels, average=None)

# Calculate macro-averaged metrics
macro_precision = precision_score(true_labels, predicted_labels, average='macro')
macro_recall = recall_score(true_labels, predicted_labels, average='macro')
macro_f1 = f1_score(true_labels, predicted_labels, average='macro')

# Display results
print("\nPer-Class Metrics:")
print("Precision per class:", per_class_precision)
print("Recall per class:", per_class_recall)
print("F1 Score per class:", per_class_f1)
print("\nMacro-Averaged Metrics:")
print("Macro Precision:", macro_precision)
print("Macro Recall:", macro_recall)
print("Macro F1 Score:", macro_f1)

Confusion Matrix using our implementation:

[[ 62 151  29  33   4]
 [ 68 339 107 103  16]
 [ 22 141  89 120  17]
 [ 13  83  61 256  97]
 [  7  36  34 164 158]]

Per-Class-Metrics
Precision: [0.36046512 0.452      0.278125   0.37869822 0.54109589]
Recall: [0.22222222 0.53554502 0.22879177 0.50196078 0.39598997]
F1 Score: [0.27494457 0.49023861 0.25105783 0.4317032  0.45730825]

Macro-Averaged Metrics
Macro Precision: 0.4020768463084199
Macro Recall: 0.37690195578977875
Macro F1 Score: 0.38105049204593033

Confusion Matrix using Scikit learn:

[[ 62 151  29  33   4]
 [ 68 339 107 103  16]
 [ 22 141  89 120  17]
 [ 13  83  61 256  97]
 [  7  36  34 164 158]]

Per-Class Metrics:
Precision per class: [0.36046512 0.452      0.278125   0.37869822 0.54109589]
Recall per class: [0.22222222 0.53554502 0.22879177 0.50196078 0.39598997]
F1 Score per class: [0.27494457 0.49023861 0.25105783 0.4317032  0.45730825]

Macro-Averaged Metrics:
Macro Precision: 0.4020768463084199
Macro Recall: 0.376901955

## 3.2 Logistic Regression Evaluation and Comparison with Scikit Learn

In [24]:
#LR Evaluation

# Training set
X_train = vectorized_train
Y_train = np.argmax(one_hot_train_labels, axis=0)

# Test Set
X_test = vectorized_test
Y_test = np.argmax(one_hot_test_labels, axis=0)


sgd_model = SGDClassifier(loss='log_loss', max_iter=1000, tol=1e-3, random_state=42)
sgd_model.fit(X_train.T, Y_train)


logreg_model = LogisticRegression(max_iter=1000, solver='lbfgs', random_state=42)
logreg_model.fit(X_train.T, Y_train)

Y_pred_SGD = sgd_model.predict(X_test.T)
Y_pred_logreg = logreg_model.predict(X_test.T)
Y_pred_naive = np.argmax(np.dot(Wo, X_test) + Bo[:, np.newaxis], axis=0)


sgd_model = SGDClassifier(loss='log_loss', max_iter=1000, tol=1e-3, random_state=42)
logreg_model = LogisticRegression(max_iter=1000, solver='lbfgs', random_state=42)


cm = confusion_matrix(Y_test, Y_pred_naive)
precision, recall, f1_score, macro_precision, macro_recall, macro_f1 = evaluation(cm)

print("Confusion Matrix using our implementation:\n")
print(cm)
print("\nPer-Class-Metrics\nPrecision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)
print("\nMacro-Averaged Metrics\nMacro Precision:", macro_precision)
print("Macro Recall:", macro_recall)
print("Macro F1 Score:", macro_f1)


cm = confusion_matrix(Y_test, Y_pred_SGD)
precision, recall, f1_score, macro_precision, macro_recall, macro_f1 = evaluation(cm)
print("Confusion Matrix using SGDClassifier:\n")
print(cm)
print("\nPer-Class-Metrics\nPrecision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)
print("\nMacro-Averaged Metrics\nMacro Precision:", macro_precision)
print("Macro Recall:", macro_recall)
print("Macro F1 Score:", macro_f1)

cm = confusion_matrix(Y_test, Y_pred_logreg)
precision, recall, f1_score, macro_precision, macro_recall, macro_f1 = evaluation(cm)

print("Confusion Matrix using LogisticRegression:\n")
print(cm)
print("\nPer-Class-Metrics\nPrecision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)
print("\nMacro-Averaged Metrics\nMacro Precision:", macro_precision)
print("Macro Recall:", macro_recall)
print("Macro F1 Score:", macro_f1)

Confusion Matrix using our implementation:

[[370  38 213   5   7]
 [195  19 165   6   4]
 [142  28 323  16   1]
 [ 95  17 243  39   1]
 [173  12  82   7   9]]

Per-Class-Metrics
Precision: [0.37948718 0.16666667 0.31481481 0.53424658 0.40909091]
Recall: [0.58451817 0.04884319 0.63333333 0.09873418 0.03180212]
F1 Score: [0.460199   0.07554672 0.42057292 0.16666667 0.05901639]

Macro-Averaged Metrics
Macro Precision: 0.36086122908040713
Macro Recall: 0.27944619716141805
Macro F1 Score: 0.23640034028659782
Confusion Matrix using SGDClassifier:

[[326  94 167  19  27]
 [164  50 150  13  12]
 [114  62 280  47   7]
 [ 70  34 199  87   5]
 [136  28  66  20  33]]

Per-Class-Metrics
Precision: [0.40246914 0.18656716 0.32482599 0.46774194 0.39285714]
Recall: [0.5150079  0.1285347  0.54901961 0.22025316 0.11660777]
F1 Score: [0.45183645 0.152207   0.40816327 0.29948365 0.17983651]

Macro-Averaged Metrics
Macro Precision: 0.35489227288029473
Macro Recall: 0.30588462990320486
Macro F1 Score: 0.298

# Conclusion

On comparing the scikit-learn implementations of `SGDClassifier` and `LogisticRegression`, each has its advantages based on dataset size and intended use:



## SGDClassifier

- It uses stochastic gradient descent (SGD) directly, providing flexibility for:

  - **Online Learning**: Supports continuous updates with each batch.

  - **Mini-batch Training**: Effective in situations where data needs to be processed incrementally.

- Generally better for large datasets due to its incremental learning approach.



## LogisticRegression

- Relies on solvers like:

  - `liblinear`, `lbfgs`, `sag`, and `saga`, which are optimized for small to medium-sized datasets.

- More efficient for in-memory datasets, typically achieving faster convergence.

- More suited for:

  - Smaller datasets

  - Situations where highly accurate convergence is needed within a limited number of iterations.



### Which is Closer to Our Implementation?

Our implementation closely resembles `SGDClassifier` with `loss='log_loss'`. This similarity is due to:

- Use of mini-batch gradient descent with configurable batch sizes, aligning well with `SGDClassifier`'s SGD-based optimization strategy.

- A similar approach to parameter updates in an iterative manner.
