In [72]:
## Importing libraries 

import numpy as np
from collections import defaultdict
import random 

### Extracting the Data

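Using the SST three-class sentiment dataset (`data/sst-sentiment-text-threeclass`). Each line has the form `label|||text`, with labels -1, 0 and 1.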

In [2]:
def extract_data(dirc: str) -> tuple[list[str], list[int]]:
    """Read a ``label|||text`` file into parallel lists of texts and labels.

    Args:
        dirc: Path of the file to read, one example per line.

    Returns:
        ``(input_X, output_y)``: the texts and their integer labels, in file
        order. The text is everything after the first ``|||`` (surrounding
        whitespace of the whole line is stripped, inner whitespace is kept).

    Raises:
        ValueError: If a non-blank line has no ``|||`` separator or its label
            is not an integer.
    """
    input_X = []
    output_y = []
    # Explicit encoding so the result does not depend on the platform default.
    with open(dirc, 'r', encoding='utf-8') as file:
        for line_no, line in enumerate(file, start=1):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) carry no example.
                continue
            # Split on the first separator only, so a '|||' that occurs inside
            # the text stays part of the text instead of breaking unpacking.
            label, sep, data = line.partition('|||')
            if not sep:
                raise ValueError(
                    f"{dirc}:{line_no}: expected 'label|||text', got {line!r}"
                )
            input_X.append(data)
            output_y.append(int(label))
    return input_X, output_y

# Load the train and test splits as (texts, labels) pairs.
# NOTE(review): both paths are absolute paths on the author's machine; point
# them at a local copy of the dataset before running this cell elsewhere.
X_train, Y_train = extract_data("/Users/varunmoparthi/Desktop/language-model/data/sst-sentiment-text-threeclass/train.txt")
X_test, Y_test = extract_data("/Users/varunmoparthi/Desktop/language-model/data/sst-sentiment-text-threeclass/test.txt")

In [3]:
# Sanity check: number of examples loaded for each split.
len(X_train), len(X_test)

(8544, 2210)

##### 1. Feature Extraction

In [51]:
def feature_extract(text: str,
                    good_words: tuple[str, ...] = ("good",),
                    bad_words: tuple[str, ...] = ("bad",)) -> defaultdict[str, int]:
    """Extract the rule-based features of one review.

    Args:
        text: The review text; it is tokenized on whitespace and matched
            case-sensitively.
        good_words: Manually chosen words counted as positive evidence.
        bad_words: Manually chosen words counted as negative evidence.

    Returns:
        A ``defaultdict(int)`` with:
          * ``"good_word_count"``: number of tokens found in ``good_words``
            (key only present when at least one was found),
          * ``"bad_word_count"``: number of tokens found in ``bad_words``
            (key only present when at least one was found),
          * ``"bias"``: always 1.
        Reading a missing count with ``feature[key]`` yields 0.
    """
    # defaultdict(int) so the object really behaves like the annotated type;
    # a factory-less defaultdict() is just a plain dict that raises KeyError.
    feature = defaultdict(int)

    # Sets give constant-time membership tests however long the lists grow.
    good_vocab = set(good_words)
    bad_vocab = set(bad_words)

    # split() with no argument handles tabs and repeated spaces and never
    # produces empty tokens, unlike split(' ').
    for word in text.split():
        if word in good_vocab:
            feature["good_word_count"] += 1
        if word in bad_vocab:
            feature["bad_word_count"] += 1
    feature["bias"] = 1

    return feature


Feature Weights (Fixed)

In [12]:
## These weights can be changed manually to tune the performance

# Correctly spelled name for the weight table.
feature_weights = { "good_word_count": 1.0,
                   "bad_word_count": -1.0, 
                   "bias": 0.5}

# The original, misspelled name is kept as an alias of the same dict so code
# that still refers to it keeps working. Tune the weights by mutating the dict
# in place (e.g. feature_weights["bias"] = 0.0); rebinding only one of the two
# names would make them diverge.
fetaure_weights = feature_weights

##### 2. Score Calculation

In [58]:
def score_calculation(weights: defaultdict, feature: defaultdict) -> float:
    """Return the dot product of the weights and the feature values.

    Score_X = Weights . X_feature

    Only keys present in ``weights`` contribute to the sum; a key that is
    missing from ``feature`` counts as 0.0.
    """
    # Start from 0.0 so the result is a float even when ``weights`` is empty.
    return sum(
        (weight * feature.get(name, 0.0) for name, weight in weights.items()),
        0.0,
    )

##### 3. Decision Function

In [60]:
def decision_function(x: str) -> int:
    """Classify a review as 1, -1 or 0 from the sign of its score.

    The review is turned into features with ``feature_extract`` and scored
    against the module-level ``fetaure_weights`` with ``score_calculation``.

    Returns:
        1 if the score is positive, -1 if it is negative, 0 otherwise.
    """
    review_score = score_calculation(fetaure_weights, feature_extract(x))

    if review_score > 0:
        return 1
    if review_score < 0:
        return -1
    return 0

##### 4. Accuracy Calculation

In [61]:
def calculate_accuracy(Y_pred: list[int], Y: list[int]) -> float:
    """Return the fraction of predictions that equal the true labels.

    Args:
        Y_pred: Predicted labels.
        Y: True labels, in the same order as ``Y_pred``.

    Returns:
        ``correct / total`` as a float; 0.0 when both inputs are empty
        (instead of failing with ZeroDivisionError).

    Raises:
        ValueError: If the two inputs have different lengths. ``zip`` would
            otherwise silently drop the unmatched tail and report an accuracy
            for only part of the data.
    """
    # Materialize first so any iterable is accepted and len() is available.
    Y_pred = list(Y_pred)
    Y = list(Y)
    if len(Y_pred) != len(Y):
        raise ValueError(
            f"length mismatch: {len(Y_pred)} predictions vs {len(Y)} labels"
        )
    if not Y:
        return 0.0
    correct = sum(1 for pred, orig in zip(Y_pred, Y) if pred == orig)
    return correct / len(Y)

##### Evaluating the dataset

In [62]:
def evaluate_dataset(data: list[int], type: str) -> None:
    """Print how often each label occurs in ``data``.

    Args:
        data: Labels to count.
        type: Name of the split, inserted into the printed message.
    """
    counts = {}
    for value in data:
        counts[value] = 1 + counts.get(value, 0)
    # Wrapped in a factory-less defaultdict so the printed representation
    # stays exactly "defaultdict(None, {...})".
    print(f"Dataset distribution {type}: ", defaultdict(None, counts))

In [63]:
# Print how many examples of each label the train and test splits contain.
evaluate_dataset(Y_train, "trainset")
evaluate_dataset(Y_test, "testset")

Dataset distribution trainset:  defaultdict(None, {1: 3610, 0: 1624, -1: 3310})
Dataset distribution testset:  defaultdict(None, {0: 389, 1: 909, -1: 912})


In [None]:
# Predict a label for every review in each split.
Y_train_pred = [decision_function(review) for review in X_train]
Y_test_pred = [decision_function(review) for review in X_test]

# Compare the predictions with the true labels.
print("Trainset accuracy: ", calculate_accuracy(Y_train_pred, Y_train))
print("Testset accuracy: ", calculate_accuracy(Y_test_pred, Y_test))

Trainset accuracy:  0.4327013108614232
Testset accuracy:  0.4239819004524887


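The above accuracy is only slightly better than a trivial baseline that predicts the positive class (1) for every review. Note that this is an always-positive baseline, not random guessing; its test-set accuracy is computed below.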

In [69]:
# Accuracy of always predicting label 1 on the test set, computed from the
# labels themselves rather than from counts copied out of the printed
# distribution (909 of the 2210 test labels are 1).
Y_test.count(1) / len(Y_test)

0.4113122171945701

Improve the accuracy : 
1. Add more good and bad words
2. Change the weights in feature_weights after looking at the mistakes (doing error analysis)

#### Error Analysis

In [78]:
def error_analysis(X: list[str], Y: list[int], n: int = 10, seed=None) -> None:
    """Print a random sample of misclassified texts with both labels.

    Every text in ``X`` is classified with ``decision_function``; up to ``n``
    distinct examples whose prediction differs from the label in ``Y`` are
    printed with their text, predicted label and original label.

    Args:
        X: Texts to classify.
        Y: True labels, in the same order as ``X``.
        n: Maximum number of misclassified examples to print (default 10).
        seed: Optional seed for a reproducible sample. When None, the global
            ``random`` state is used.
    """
    Y_pred = []
    error = []
    for i, (text, y) in enumerate(zip(X, Y)):
        pred = decision_function(text)
        Y_pred.append(pred)
        if pred != y:
            error.append(i)

    if not error:
        # random.choice / random.sample cannot draw from an empty list.
        print("No misclassified examples found.")
        return

    rng = random if seed is None else random.Random(seed)
    # Sample without replacement so the same example is never shown twice,
    # and never ask for more examples than there are errors.
    sample_size = min(max(n, 0), len(error))
    for idx in rng.sample(error, sample_size):
        print("Text: ", X[idx])
        print("Predicted: ", Y_pred[idx])
        print("Original: ", Y[idx], '\n')

In [79]:
# Show a random sample of misclassified test reviews.
error_analysis(X_test, Y_test)

Text:   An instant candidate for worst movie of the year .
Predicted:  1
Original:  -1 

Text:   Some decent actors inflict big damage upon their reputations .
Predicted:  1
Original:  -1 

Text:   All I can say is fuhgeddaboutit .
Predicted:  1
Original:  -1 

Text:   Not a cheap slasher flick , as the subject matter would suggest , but is a little like a nature film , showing a patient predator and his foolish prey .
Predicted:  1
Original:  0 

Text:   Adam Sandler is to Gary Cooper what a gnat is to a racehorse .
Predicted:  1
Original:  -1 

Text:   It does n't help that the director and cinematographer Stephen Kazmierski shoot on grungy video , giving the whole thing a dirty , tasteless feel .
Predicted:  1
Original:  -1 

Text:   Neither funny nor suspenseful nor particularly well-drawn .
Predicted:  1
Original:  -1 

Text:   But like Bruce Springsteen 's gone-to-pot Asbury Park , New Jersey , this sad-sack waste of a movie is a City of ruins .
Predicted:  1
Original:  -1 

Text