## Preprocessing for Classfication

In [1]:
!pip install datasets




- Import libraries

In [2]:
from datasets import load_dataset
import numpy as np
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

- Load dataset

In [3]:
dataset = load_dataset("stanfordnlp/sst", trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'tokens', 'tree'],
        num_rows: 8544
    })
    validation: Dataset({
        features: ['sentence', 'label', 'tokens', 'tree'],
        num_rows: 1101
    })
    test: Dataset({
        features: ['sentence', 'label', 'tokens', 'tree'],
        num_rows: 2210
    })
})


Converting dataset into numpy arrays

In [4]:
train_text = np.array(dataset["train"]["sentence"])
train_labels = np.array(dataset["train"]["label"])

valid_text = np.array(dataset["validation"]["sentence"])
valid_labels = np.array(dataset["validation"]["label"])

test_text = np.array(dataset["test"]["sentence"])
test_labels = np.array(dataset["test"]["label"])

In [6]:
print(train_labels[0])

0.6944400072097778


Converting labels to classes

In [5]:
bounds = np.array([0.2,0.4,0.6,0.8,1.0])
train_labels = np.digitize(train_labels,bounds,True)
valid_labels = np.digitize(valid_labels,bounds,True)
test_labels = np.digitize(test_labels,bounds,True)

In [6]:
print(train_labels)
print(train_labels[0])

[3 4 3 ... 3 0 1]
3


# Part 2.1: Naive Bayes Classification

# Part 2.2: Logistic Regression

## 2.2.1: Feauture Representation

- Function to extract bi-grams from a given sentence

In [7]:
def extract_bigrams(sentence):
    words = np.array(sentence.lower().split())  # Convert sentence to lowercase and split
    if len(words) < 2:
        return np.array([])  # Return empty if less than two words
    return np.column_stack((words[:-1], words[1:]))  # Create bi-grams # Remove duplicates and axis = 0 check uniqueness row-wise

# Test the function
sample_sentence = "I love this movie very much I love"
print("Bi-grams:", extract_bigrams(sample_sentence))

Bi-grams: [['i' 'love']
 ['love' 'this']
 ['this' 'movie']
 ['movie' 'very']
 ['very' 'much']
 ['much' 'i']
 ['i' 'love']]


- Construct a vocabulary of all unique bi-grams in the dataset.

In [8]:
# Extract all bi-grams from training sentences
all_bigrams = np.concatenate([extract_bigrams(sentence) for sentence in train_text])

# Get unique bi-grams
unique_bigrams, indices = np.unique(all_bigrams, axis=0, return_index=True)

# Create vocabulary using dictionary for easy indexing when creating vector
bigram_vocab = {tuple(unique_bigrams[i]): i for i in range(len(unique_bigrams))}

#print(bigram_vocab)
print(f"Total unique bi-grams in vocabulary: {len(bigram_vocab)}")

Total unique bi-grams in vocabulary: 84518


- Each sentence is then represented by a vector of length equal to the number of unique bi-grams, with a 1 if the bi-gram occurs in the sentence and a 0 otherwise (this is a sparse representation).

In [9]:
def sentence_to_vector(sentence, vocab):
    vector = np.zeros(len(vocab), dtype=np.int8)  # Initialize zero vector
    bigrams = extract_bigrams(sentence)

    if len(bigrams) == 0:
        return vector  # Return empty vector if no bi-grams

    for bigram in bigrams:
        bigram_tuple = tuple(bigram)
        if bigram_tuple in vocab:
            vector[vocab[bigram_tuple]] = 1  # Set 1 if bi-gram is in vocabulary
    return vector

# Test the function
test_sentence = "! '"
test_vector = sentence_to_vector(test_sentence, bigram_vocab)

print(f"Test Sentence: {test_sentence}")
print(f"Feature Vector: {test_vector}")

Test Sentence: ! '
Feature Vector: [1 0 0 ... 0 0 0]


In [10]:
# Convert the dataset
X_train = np.array([sentence_to_vector(sentence, bigram_vocab) for sentence in train_text], dtype=np.float32)
X_valid = np.array([sentence_to_vector(sentence, bigram_vocab) for sentence in valid_text], dtype=np.float32)
X_test = np.array([sentence_to_vector(sentence, bigram_vocab) for sentence in test_text], dtype=np.float32)

y_train = train_labels.astype(np.int8)
y_valid = valid_labels.astype(np.int8)
y_test = test_labels.astype(np.int8)

print(f"Train Feature Matrix Shape: {X_train.shape}")
print(f"Validation Feature Matrix Shape: {X_valid.shape}")
print(f"Test Feature Matrix Shape: {X_test.shape}")

Train Feature Matrix Shape: (8544, 84518)
Validation Feature Matrix Shape: (1101, 84518)
Test Feature Matrix Shape: (2210, 84518)


## 2.2.2: Algorithm Implementation (From Scratch)

1. **Sigmoid Function:**
   $$
   \sigma(z) = \frac{1}{1 + e^{-z}}
   $$
   where $z = W \cdot X + b$.

2. **Regularized Loss Function (Binary Cross-Entropy with L2 Regularization):**
   $$
   L_{reg} = -\frac{1}{m} \sum_{i=1}^{m} \left[ y_i \log(\hat{y}_i) + (1 - y_i) \log(1 - \hat{y}_i) \right] + \frac{\lambda}{2m} \sum_{j=1}^{n} W_j^2
   $$
   where:
   - $\hat{y}_i = \sigma(W \cdot X_i + b)$
   - $\lambda$ is the regularization strength.
   - $W_j^2$ penalizes large weights.

3. **Gradient Updates with L2 Regularization:**
   $$
   W = W - \alpha \left( \frac{\partial L}{\partial W} + \lambda W \right)
   $$
   $$
   b = b - \alpha \frac{\partial L}{\partial b}
   $$
   where:
   - $\frac{\partial L}{\partial W} = \frac{1}{m} X^T (\sigma(W \cdot X + b) - y)$
   - $\frac{\partial L}{\partial b} = \frac{1}{m} \sum (\sigma(W \cdot X + b) - y) $
   - $\alpha$ is the learning rate.


In [11]:
class LogisticRegressionSGD:
    def __init__(self, lr=0.01, epochs=100, batch_size=32, reg_lambda=0.01):
        self.lr = lr
        self.epochs = epochs
        self.batch_size = batch_size
        self.reg_lambda = reg_lambda
        self.weights = None
        self.bias = None

    def softmax(self, z):
        exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

    def loss(self, y, y_pred):
        m = y.shape[0]
        return -np.sum(y * np.log(y_pred + 1e-8)) / m + (self.reg_lambda / (2 * m)) * np.sum(self.weights ** 2)

    def train(self, X, y):
        n_samples, n_features = X.shape
        n_classes = len(np.unique(y))
        self.weights = np.zeros((n_features, n_classes), dtype=np.float32)
        self.bias = np.zeros(n_classes, dtype=np.float32)

        # One-hot encode labels
        y_one_hot = np.eye(n_classes, dtype=np.float32)[y]

        for epoch in range(self.epochs):
            indices = np.arange(n_samples)
            np.random.shuffle(indices)
            X, y_one_hot = X[indices], y_one_hot[indices]

            del indices  # Free memory

            for i in range(0, n_samples, self.batch_size):
                X_batch = X[i:i+self.batch_size]
                y_batch = y_one_hot[i:i+self.batch_size]

                y_pred = self.softmax(np.dot(X_batch, self.weights) + self.bias)

                dw = (1 / len(y_batch)) * np.dot(X_batch.T, (y_pred - y_batch)) + (self.reg_lambda / len(y_batch)) * self.weights
                db = np.mean(y_pred - y_batch, axis=0)

                self.weights -= self.lr * dw
                self.bias -= self.lr * db

            loss = self.loss(y_one_hot, self.softmax(np.dot(X, self.weights) + self.bias))
            print(f"Epoch {epoch}, Loss: {loss}")

    def predict(self, X):
        return np.argmax(self.softmax(np.dot(X, self.weights) + self.bias), axis=1)

model = LogisticRegressionSGD(lr=0.01, epochs=100, batch_size=32, reg_lambda=0.01)
model.train(X_train, train_labels)

y_pred = model.predict(X_test)
accuracy = np.mean(y_pred == test_labels)
print(f"Logistic Regression Accuracy: {accuracy*100}")

Epoch 0, Loss: 1.5747337902874414
Epoch 1, Loss: 1.5594941233395387
Epoch 2, Loss: 1.5504207395072782
Epoch 3, Loss: 1.5434409700497316
Epoch 4, Loss: 1.537163121796506
Epoch 5, Loss: 1.5312137831754506
Epoch 6, Loss: 1.5254564813005107
Epoch 7, Loss: 1.519808693507526
Epoch 8, Loss: 1.5142564750473233
Epoch 9, Loss: 1.5087877095267455
Epoch 10, Loss: 1.5033964535638633
Epoch 11, Loss: 1.498078820482594
Epoch 12, Loss: 1.4928305811372198
Epoch 13, Loss: 1.4876480776293393
Epoch 14, Loss: 1.4825295953777677
Epoch 15, Loss: 1.4774731909111143
Epoch 16, Loss: 1.4724745204810348
Epoch 17, Loss: 1.467530612005444
Epoch 18, Loss: 1.4626416940423435
Epoch 19, Loss: 1.457808223951343
Epoch 20, Loss: 1.4530225426494405
Epoch 21, Loss: 1.448287736698427
Epoch 22, Loss: 1.4435994624890638
Epoch 23, Loss: 1.4389582912417387
Epoch 24, Loss: 1.4343615935608913
Epoch 25, Loss: 1.4298076552621435
Epoch 26, Loss: 1.425294875971945
Epoch 27, Loss: 1.4208247411200838
Epoch 28, Loss: 1.4163967933380202
Ep

- Without Regularization

In [13]:
class LogisticRegressionSGD:
    def __init__(self, lr=0.01, epochs=100, batch_size=32):
        self.lr = lr
        self.epochs = epochs
        self.batch_size = batch_size
        self.weights = None
        self.bias = None

    def softmax(self, z):
        exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

    def loss(self, y, y_pred):
        m = y.shape[0]
        return -np.sum(y * np.log(y_pred + 1e-8)) / m

    def train(self, X, y):
        n_samples, n_features = X.shape
        n_classes = len(np.unique(y))
        self.weights = np.zeros((n_features, n_classes), dtype=np.float32)
        self.bias = np.zeros(n_classes, dtype=np.float32)

        # One-hot encode labels
        y_one_hot = np.eye(n_classes)[y]

        for epoch in range(self.epochs):
            indices = np.arange(n_samples)
            np.random.shuffle(indices)
            X, y_one_hot = X[indices], y_one_hot[indices]

            del indices

            for i in range(0, n_samples, self.batch_size):
                X_batch = X[i:i+self.batch_size]
                y_batch = y_one_hot[i:i+self.batch_size]

                y_pred = self.softmax(np.dot(X_batch, self.weights) + self.bias)

                dw = (1 / len(y_batch)) * np.dot(X_batch.T, (y_pred - y_batch))
                db = np.mean(y_pred - y_batch, axis=0)

                self.weights -= self.lr * dw
                self.bias -= self.lr * db

            loss = self.loss(y_one_hot, self.softmax(np.dot(X, self.weights) + self.bias))
            print(f"Epoch {epoch}, Loss: {loss}")

    def predict(self, X):
        return np.argmax(self.softmax(np.dot(X, self.weights) + self.bias), axis=1)

# Train model with Softmax and Cross-Entropy Loss
model = LogisticRegressionSGD(lr=0.01, epochs=100, batch_size=32)
model.train(X_train, train_labels)

y_pred = model.predict(X_test)
accuracy = np.mean(y_pred == test_labels)
print(f"Logistic Regression Accuracy: {accuracy*100}")

Epoch 0, Loss: 1.574819327209996
Epoch 1, Loss: 1.5596076986902438
Epoch 2, Loss: 1.5504468274529508
Epoch 3, Loss: 1.543409004277281
Epoch 4, Loss: 1.5371144132109618
Epoch 5, Loss: 1.5311484856719382
Epoch 6, Loss: 1.5253421623217926
Epoch 7, Loss: 1.5196614747427488
Epoch 8, Loss: 1.5140680398805981
Epoch 9, Loss: 1.5085544687127241
Epoch 10, Loss: 1.5031149267648043
Epoch 11, Loss: 1.4977461418018359
Epoch 12, Loss: 1.492445072194052
Epoch 13, Loss: 1.4872084564707253
Epoch 14, Loss: 1.4820280785947926
Epoch 15, Loss: 1.4769105395016153
Epoch 16, Loss: 1.4718455534065595
Epoch 17, Loss: 1.4668329759441623
Epoch 18, Loss: 1.4618741271298812
Epoch 19, Loss: 1.4569658991158678
Epoch 20, Loss: 1.4521022377054342
Epoch 21, Loss: 1.4472878900555413
Epoch 22, Loss: 1.4425129565839352
Epoch 23, Loss: 1.4377849592098249
Epoch 24, Loss: 1.4330988302966405
Epoch 25, Loss: 1.428452049064614
Epoch 26, Loss: 1.4238477860078607
Epoch 27, Loss: 1.419278154651938
Epoch 28, Loss: 1.4147495119745253


- ## Comparison with scikit-learn

Since **X_train** is a high-dimensional feature matrix (one-hot encoded bigrams).
Most values in this matrix are zero (because each sentence contains only a few bigrams out of a huge vocabulary).
- Dense storage wastes memory by keeping all the zeros.

- Sparse matrices solve this by only storing nonzero values and their positions.

In [16]:
from scipy.sparse import csr_matrix
X_train = csr_matrix(X_train)
X_test = csr_matrix(X_test)

bgd = LogisticRegression(max_iter=100, solver='lbfgs')
bgd.fit(X_train, train_labels)
bgd_accuracy = bgd.score(X_test, test_labels)

sgd = SGDClassifier(loss="log_loss", max_iter=100, learning_rate="optimal")
sgd.fit(X_train, train_labels)
sgd_accuracy = sgd.score(X_test, test_labels)

print(f"Logistic Regression Accuracy (Batch Gradient Descent): {bgd_accuracy*100}")
print(f"Logistic Regression Accuracy (Stochastic Gradient Descent): {sgd_accuracy*100}")

Logistic Regression Accuracy (Batch Gradient Descent): 36.10859728506787
Logistic Regression Accuracy (Stochastic Gradient Descent): 35.97285067873303


# Part 2.3: Confusion Matrix & Evaluation Metrics

- Implementation (from scratch)

In [12]:
def compute_confusion_matrix(y_true, y_pred, num_classes):
    matrix = np.zeros((num_classes, num_classes), dtype=int)
    for true, pred in zip(y_true, y_pred):
        matrix[true, pred] += 1
    return matrix

def compute_metrics(conf_matrix):
    precision = np.diag(conf_matrix) / (np.sum(conf_matrix, axis=0) + 1e-8)
    recall = np.diag(conf_matrix) / (np.sum(conf_matrix, axis=1) + 1e-8)
    f1 = 2 * (precision * recall) / (precision + recall + 1e-8)

    macro_precision = np.mean(precision)
    macro_recall = np.mean(recall)
    macro_f1 = np.mean(f1)

    return precision, recall, f1, macro_precision, macro_recall, macro_f1

In [13]:
# From scratch confusion matrix and metrics
num_classes = len(np.unique(test_labels))
conf_matrix = compute_confusion_matrix(test_labels, y_pred, num_classes)
precision, recall, f1, macro_precision, macro_recall, macro_f1 = compute_metrics(conf_matrix)

print("From scratch Confusion Matrix:")
print(conf_matrix)
print(f"Precision per class: {precision}")
print(f"Recall per class: {recall}")
print(f"F1-score per class: {f1}")
print(f"Macro Precision: {macro_precision}")
print(f"Macro Recall: {macro_recall}")
print(f"Macro F1-score: {macro_f1}")

From scratch Confusion Matrix:
[[  3 172   7  92   5]
 [  2 372  27 226   6]
 [  2 194  12 173   8]
 [  1 139  21 340   9]
 [  0  92   9 258  40]]
Precision per class: [0.375      0.38390093 0.15789474 0.31221304 0.58823529]
Recall per class: [0.01075269 0.58767773 0.03084833 0.66666667 0.10025063]
F1-score per class: [0.02090592 0.46441947 0.0516129  0.42526579 0.17130621]
Macro Precision: 0.3634487997310459
Macro Recall: 0.279239207109281
Macro F1-score: 0.22670205765689583


- Comparison with scikit-learn

In [14]:
# Compare with sklearn's metrics
sklearn_conf_matrix = confusion_matrix(test_labels, y_pred)
sklearn_precision = precision_score(test_labels, y_pred, average=None)
sklearn_recall = recall_score(test_labels, y_pred, average=None)
sklearn_f1 = f1_score(test_labels, y_pred, average=None)
sklearn_macro_precision = precision_score(test_labels, y_pred, average="macro")
sklearn_macro_recall = recall_score(test_labels, y_pred, average="macro")
sklearn_macro_f1 = f1_score(test_labels, y_pred, average="macro")

print("Scikit-learn Confusion Matrix:")
print(sklearn_conf_matrix)
print(f"Precision per class: {sklearn_precision}")
print(f"Recall per class: {sklearn_recall}")
print(f"F1-score per class: {sklearn_f1}")
print(f"Macro Precision: {sklearn_macro_precision}")
print(f"Macro Recall: {sklearn_macro_recall}")
print(f"Macro F1-score: {sklearn_macro_f1}")

Scikit-learn Confusion Matrix:
[[  3 172   7  92   5]
 [  2 372  27 226   6]
 [  2 194  12 173   8]
 [  1 139  21 340   9]
 [  0  92   9 258  40]]
Precision per class: [0.375      0.38390093 0.15789474 0.31221304 0.58823529]
Recall per class: [0.01075269 0.58767773 0.03084833 0.66666667 0.10025063]
F1-score per class: [0.02090592 0.46441948 0.0516129  0.42526579 0.17130621]
Macro Precision: 0.36344879984761774
Macro Recall: 0.27923920711449046
Macro F1-score: 0.22670206063914833
