In [None]:
!nvidia-smi

## Import libraries


In [None]:
import os
import glob
from collections import defaultdict
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import mutual_info_classif
from sklearn.datasets import load_svmlight_file
import pandas as pd
import gc

## Load and preprocess the IMDb training data for logistic regression (change the paths if needed)

In [None]:

# Load the vocabulary (one word per line). The code below looks words up by
# feature-column number, i.e. words_list[column_index].
with open("drive/MyDrive/aclImdb/imdb.vocab", 'r', encoding="utf-8") as file:
    words_list = [line.strip() for line in file]
X, y = load_svmlight_file("drive/MyDrive/aclImdb/train/labeledBow.feat")

# Fraction of reviews in which each word occurs (a word occurs in a review if
# its count is non-zero). This is computed directly on the sparse matrix so the
# full reviews-by-vocabulary table never has to be held densely in memory.
occurrence_frequency = pd.Series(np.asarray((X != 0).sum(axis=0)).ravel() / X.shape[0])

# Drop words that are too rare (< 1% of reviews) or too common (> 50%).
words_to_remove = occurrence_frequency[(occurrence_frequency < 0.01) | (occurrence_frequency > 0.5)].index
filtered_indices_list = occurrence_frequency.index.difference(words_to_remove).to_list()

# Densify only the surviving columns. Rows are reviews, columns are labelled
# with the original word indices, and entries are word counts in the review.
df_1 = pd.DataFrame(X[:, filtered_indices_list].toarray(), columns=filtered_indices_list)
df_1_filtered = df_1
del X
gc.collect()

# Word <-> column-index lookups for the filtered vocabulary.
filtered_words_list = [words_list[i] for i in filtered_indices_list]
filtered_words_dict = dict(zip(filtered_words_list,filtered_indices_list))


print("Training data loaded and filtered.")

Training data loaded and filtered.


## Using simple linear regression, find the most positive/negative words to reduce vocabulary size

In [None]:
def find_coefficient(df, word_index, ratings):
    """Return the simple-linear-regression slope of ``ratings`` on one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose column ``word_index`` holds the predictor values.
    word_index : hashable
        Label of the column to regress on.
    ratings : array-like
        Response values, one per row of ``df``.

    Returns
    -------
    float
        ``sum((x - mean(x)) * (y - mean(y))) / sum((x - mean(x)) ** 2)``.
        When the column has no variance (the denominator is zero) the slope
        is undefined; ``0.0`` is returned instead of NaN so that callers can
        still sort the coefficients.
    """
    x = df.loc[:, word_index].to_numpy(dtype=float)
    y = np.asarray(ratings, dtype=float)

    x_centered = x - np.mean(x)
    denom = np.sum(x_centered ** 2)
    if denom == 0:
        # Constant (or empty) column: no slope can be estimated.
        return 0.0

    num = np.sum(x_centered * (y - np.mean(y)))
    return num / denom
def assign_coefficients(df, ratings):
    """Score every column of ``df`` against ``ratings`` and rank the results.

    Each column label is paired with the value ``find_coefficient`` returns
    for it. The result is a list of ``(column_label, coefficient)`` tuples
    sorted by coefficient in ascending order.
    """
    scored_columns = [
        (column, find_coefficient(df, column, ratings))
        for column in np.array(df.columns)
    ]
    scored_columns.sort(key=lambda pair: pair[1])
    return scored_columns
# Rank every remaining word by its regression coefficient, then build a
# column-index -> coefficient lookup (the first entry wins for a repeated index).
coefficient_tuples = assign_coefficients(df_1, y)
coefficients_dict = {}
for ind, coef in coefficient_tuples:
    if ind not in coefficients_dict:
        coefficients_dict[ind] = coef

def tokenize_df(str, _indices_list, _words_list, _words_dict):
    """Convert a string (e.g. a review) into a one-row DataFrame of word counts.

    Parameters
    ----------
    str : str
        Text to tokenize. (The name shadows the built-in ``str``; it is kept
        so that existing keyword callers continue to work.)
    _indices_list : sequence
        Column labels of the result, in output order.
    _words_list : sequence
        Vocabulary words; only its length is used, to size the zero-filled row.
    _words_dict : dict
        Maps each vocabulary word to its column label in ``_indices_list``.

    Returns
    -------
    pandas.DataFrame
        A single row whose columns are ``_indices_list`` and whose entries are
        how many times the corresponding word occurs in the text.
    """
    _row = dict(zip(_indices_list, [0] * len(_words_list)))
    for _raw_word in str.lower().split():
        # Keep only alphanumeric characters and apostrophes ("movie," -> "movie").
        _word = ''.join(_char for _char in _raw_word if (_char.isalnum() or _char == '\''))
        # Dict membership is O(1); scanning the vocabulary list for every
        # token would cost O(vocabulary size) each time.
        if _word in _words_dict:
            _row[_words_dict[_word]] += 1
    return pd.DataFrame([_row])

def tokenize(str, _indices_list, _words_list, _words_dict):
    """Convert a string (e.g. a review) into an array of word counts.

    Parameters
    ----------
    str : str
        Text to tokenize. (The name shadows the built-in ``str``; it is kept
        so that existing keyword callers continue to work.)
    _indices_list : sequence
        Column labels; they fix the order of the returned counts.
    _words_list : sequence
        Vocabulary words; only its length is used, to size the zero-filled row.
    _words_dict : dict
        Maps each vocabulary word to its label in ``_indices_list``.

    Returns
    -------
    numpy.ndarray
        Counts of each vocabulary word in the text, ordered like
        ``_indices_list``.
    """
    _row = dict(zip(_indices_list, [0] * len(_words_list)))
    for _raw_word in str.lower().split():
        # Keep only alphanumeric characters and apostrophes ("movie," -> "movie").
        _word = ''.join(_char for _char in _raw_word if (_char.isalnum() or _char == '\''))
        # Dict membership is O(1); scanning the vocabulary list for every
        # token would cost O(vocabulary size) each time.
        if _word in _words_dict:
            _row[_words_dict[_word]] += 1
    return np.array(list(_row.values()))

# coefficient_tuples is sorted ascending by coefficient, so its head holds the
# most negative words and its tail the most positive ones.
print("By simple LR, some of the most NEGATIVE words are: ", [words_list[idx] for idx, _coef in coefficient_tuples[:12]])
print("----------------------------------------------------------------")
print("By simple LR, some of the most POSITIVE words are: ", [words_list[idx] for idx, _coef in coefficient_tuples[-12:]])

By simple LR, some of the most NEGATIVE words are:  ['redeeming', 'waste', 'laughable', 'pointless', 'pathetic', 'garbage', 'worst', 'avoid', 'wasted', 'excuse', 'awful', 'whatsoever']
----------------------------------------------------------------
By simple LR, some of the most POSITIVE words are:  ['terrific', 'recommended', 'fantastic', 'gem', 'wonderful', 'delightful', 'finest', 'touching', 'outstanding', 'superb', 'beautifully', 'wonderfully']


## Choose the 150 most positive and 150 most negative words (300 in total) as the vocabulary

In [None]:
# The ranking is ascending, so the last 150 entries are the most positive words
# and the first 150 the most negative; positives are placed first.
chosen_tuples = [*coefficient_tuples[-150:], *coefficient_tuples[:150]]
# Keep only those columns of the training table, in the order chosen above.
df_1_chosen = df_1.loc[:, [idx for idx, _coef in chosen_tuples]]
chosen_indices_list = df_1_chosen.columns.to_list()
# Parallel word array and word -> column-index lookup for the chosen vocabulary.
chosen_words_list = np.array([words_list[idx] for idx in chosen_indices_list])
chosen_words_dict = dict(zip(chosen_words_list, chosen_indices_list))

## Define logistic regressor class

In [1]:
class LogisticRegressor:
    """Binary logistic-regression classifier trained with batch gradient descent.

    The model keeps one weight per entry of ``chosen_indices_list`` plus a
    scalar bias, and predicts ``sigmoid(x . weights + bias)``.
    """

    # Class-level placeholders; every instance overwrites them in __init__.
    X = None
    y = None
    bias = 0
    learning_rate = 0
    weights = {}

    def __init__(self, X, y, chosen_indices_list, learning_rate = 0.001, max_iter = 1000):
        """Store the training data and hyper-parameters and zero the parameters.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features (only stored here; ``fit`` receives its own copy).
        y : array-like
            Training labels (only stored here).
        chosen_indices_list : list
            Column labels used as features; fixes the number and order of weights.
        learning_rate : float
            Gradient-descent step size.
        max_iter : int
            Number of gradient-descent iterations performed by ``fit``.
        """
        self.X = X
        self.y = y
        self.chosen_indices_list = chosen_indices_list
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        # One weight per chosen feature, sized from the constructor argument
        # rather than from a notebook-level variable.
        self.weights = np.zeros(len(chosen_indices_list))
        self.bias = 0

    def predict(self,_word_count_list):
        """Return the sigmoid of the linear score for one row or a batch of rows.

        ``_word_count_list`` may be a 1-D feature vector (returns a scalar) or a
        2-D array with one row per sample (returns one probability per row).
        """
        _exponent = np.dot(_word_count_list,self.weights) + self.bias
        return 1 / (1 + (np.exp(-_exponent)))

    def predict_str(self,str):
        """Tokenize a raw string and return the predicted probability.

        NOTE: relies on the notebook-level ``tokenize`` function and on the
        ``chosen_words_list`` / ``chosen_words_dict`` variables being defined.
        """
        _tokenized_arr = tokenize(str,self.chosen_indices_list,chosen_words_list,chosen_words_dict)
        return self.predict(_tokenized_arr)

    def predict_bin(self, _word_count_list):
        """Return the hard 0/1 prediction using a 0.5 probability threshold.

        A single feature vector yields an ``int``; a 2-D batch yields an
        integer array with one label per row.
        """
        _probability = self.predict(_word_count_list)
        if np.ndim(_probability) == 0:
            return 1 if _probability > 0.5 else 0
        return (np.asarray(_probability) > 0.5).astype(int)

    def fit(self, X, y):
        """Run ``max_iter`` steps of batch gradient descent on the log-loss.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features; columns are re-ordered to ``chosen_indices_list``.
        y : array-like
            Binary labels, one per row of ``X``.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not have the same number of samples.
        """
        self.X = X
        self.y = y
        # Align the columns with the weight order.
        X_rearranged = X.reindex(columns = self.chosen_indices_list)
        X_matrix = X_rearranged.to_numpy()
        targets = np.asarray(y)
        r = len(X_matrix)
        if len(targets) != r:
            raise ValueError(
                "X and y must have the same number of samples "
                f"(got {r} and {len(targets)})"
            )
        for _ in range(self.max_iter):
            # One matrix-vector product scores every sample at once.
            predictions = self.predict(X_matrix)
            error = predictions - targets
            dw = (1 / r) * np.dot(X_matrix.T, error)
            db = (1 / r) * np.sum(error)
            self.weights = self.weights - self.learning_rate * dw
            self.bias -= self.learning_rate * db

## Train the logistic regressor, then load the IMDb test data and check accuracy

In [None]:
# Binarise the training ratings: a rating above 5 is positive (1), otherwise negative (0).
y_bin = (np.asarray(y) > 5).astype(int)
logTest = LogisticRegressor(df_1_chosen,y_bin,chosen_indices_list, learning_rate=0.004,max_iter=4000)
logTest.fit(df_1_chosen,y_bin)
# Optional spot checks on raw strings:
# print(logTest.predict_str("This is the most wonderful, moving, fantastic movie ever made."))
# print(logTest.predict_str("This is the most awful, pointless, pathetic movie ever made."))

X_test, y_test = load_svmlight_file("drive/MyDrive/aclImdb/test/labeledBow.feat")
y_test_bin = (y_test > 5).astype(int)
del y_test

# Select the chosen vocabulary columns while the matrix is still sparse, then
# densify only that slice; the full test matrix is never held densely in memory.
# Column order follows chosen_indices_list, i.e. the order of the model weights.
matrix_t1 = X_test[:, chosen_indices_list].toarray()
del X_test
gc.collect()

# Score all test reviews in one call and threshold the probabilities at 0.5.
predicted_arr = (logTest.predict(matrix_t1) > 0.5).astype(int)
accuracy = int(np.sum(predicted_arr == y_test_bin)) / len(y_test_bin)
print("Accuracy is:",  accuracy)

Accuracy is: 0.8124


# Loading the 20 Newsgroups dataset

In [None]:
# The five newsgroups used as classes.
categories = [
    'comp.graphics',
    'rec.sport.baseball',
    'sci.med',
    'talk.politics.guns',
    'misc.forsale',
]
# Fetch the train split and then the test split with identical settings:
# headers, footers and quotes are removed, and rows are shuffled with a fixed seed.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(
        subset=split,
        categories=categories,
        remove=('headers', 'footers', 'quotes'),
        shuffle=True,
        random_state=42,
    )
    for split in ('train', 'test')
)

# Preprocessing

In [None]:
# Filter the vocabulary and create the feature vectors, using the sklearn library this time.
# TfidfVectorizer combines CountVectorizer and TfidfTransformer.
vectorizer = TfidfVectorizer(max_features=500)  # Keep at most 500 features; change this number as needed
# Fit the vectorizer on the training documents only, and build their feature matrix.
X_train = vectorizer.fit_transform(newsgroups_train.data)
# Reuse the fitted vectorizer (no refitting) to transform the test documents.
X_test = vectorizer.transform(newsgroups_test.data)

# Select the top features using mutual information

In [None]:
def top_features_per_class(X_train, y_train, num_features_per_class=10):
    """Select the features with the highest mutual information for each class.

    For every distinct label in ``y_train`` a one-vs-rest binary target is
    built, ``mutual_info_classif`` scores each feature against it, and the
    ``num_features_per_class`` best-scoring feature indices are kept.

    Parameters
    ----------
    X_train : array-like or sparse matrix
        Feature matrix, one row per sample.
    y_train : array-like
        Class label of each sample; labels need not be ``0..K-1``.
    num_features_per_class : int
        How many top features to keep per class. Values ``<= 0`` select nothing.

    Returns
    -------
    list
        Sorted, de-duplicated feature indices (the union over all classes).
    """
    # Guard: a slice of [-0:] would return *every* feature, not none.
    if num_features_per_class <= 0:
        return []

    y_train = np.asarray(y_train)
    selected = set()

    # Iterate over the labels actually present instead of assuming 0..K-1.
    for _class in np.unique(y_train):
        # Create a binary label for the current class
        y_binary = (y_train == _class).astype(int)

        # Calculate the mutual information score for each feature
        mi_score = mutual_info_classif(X_train, y_binary, discrete_features='auto')

        # argsort is ascending, so the best-scoring features are at the end
        selected.update(np.argsort(mi_score)[-num_features_per_class:])

    # Sort so the result is consistent regardless of set iteration order
    return sorted(selected)

In [None]:
# Pick the 10 most informative features per newsgroup and look up their words.
selected_indices = top_features_per_class(
    X_train,
    newsgroups_train.target,
    num_features_per_class=10,
)
feature_names = np.array(vectorizer.get_feature_names_out())
top_words = feature_names[list(selected_indices)]

