## Import necessary libraries

In [54]:
import os
import glob
from collections import defaultdict
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import mutual_info_classif

## Loading the data from IMDB dataset

In [55]:
def load_data(dir, subset='train'):
    '''
    Load the data from the subset of given directory.
    dir: The path to the directory containing the data.
    subset: The subset of the data to load. Either 'train' or 'test'.

    Returns:
    reviews: A list of strings, each representing a review.
    labels: A list of integers, 1 for positive and 0 for negative.
    '''
    
    reviews = []
    labels = []

    # Path to the reviews
    positive_path = os.path.join(dir, subset, 'pos', '*.txt')
    negative_path = os.path.join(dir, subset, 'neg', '*.txt')

    for file in glob.glob(positive_path):
        with open(file, 'r', encoding='utf-8') as f:
            reviews.append(f.read())
            labels.append(1)
    
    for file in glob.glob(negative_path):
        with open(file, 'r', encoding='utf-8') as f:
            reviews.append(f.read())
            labels.append(0)
    
    return reviews, labels

In [56]:
def preprocess(text):
    text = text.lower()
    text = text.replace('<br />', ' ')

    # Split text into words based on whitespace
    words = text.split()
    return words

In [57]:
def filter_vocab(reviews, min_f=0.01, max_f=0.5):
    '''
    Using default dictionary to count the frequency of each word in the reviews.
    Default dictionary is a dictionary that has a default value for keys that haven't been added yet.
    '''
    frequency = defaultdict(int)
    num_reviews = len(reviews)

    for review in reviews:
        unique_words = set(review)
        for word in unique_words:
            frequency[word] += 1

    # Filtration
    filtered_words = {word for word, freq in frequency.items()
                      if (freq/num_reviews > min_f and (freq/num_reviews) < max_f)}
    
    return filtered_words

            

In [58]:
def feature_vector(reviews, vocab):
    '''
    Create a feature vector for each review.
    reviews: A list of strings, each representing a review.
    vocab: The set of words that we already filtered.

    Returns:
    A feature vector that each position in the vector corresponds
    to a word in the vocab, and the frequency of that word in the review.
    '''

    # Create a mapping of vocabulary words to their indices
    word_to_index = {word: i for i, word in enumerate(vocab)}
    
    # Initialize an array to hold the feature vectors
    feature_vectors = np.zeros((len(reviews), len(vocab)), dtype=np.float32)
    
    for i, review in enumerate(reviews):
        for word in review:
            if word in word_to_index:
                feature_vectors[i, word_to_index[word]] += 1

    return feature_vectors

In [59]:
# Load the data
path = 'aclImdb'
train_reviews, train_labels = load_data(path, subset='train')
test_reviews, test_labels = load_data(path, subset='test')

# Preprocess the reviews
train_reviews = [preprocess(review) for review in train_reviews]
test_reviews = [preprocess(review) for review in test_reviews]

# Filter the vocabularies
filtered_vocab = filter_vocab(train_reviews)

# Create the feature vectors
train_vectors = feature_vector(train_reviews, filtered_vocab)
test_vectors = feature_vector(test_reviews, filtered_vocab)

# Feature selection usign Linear Regression

In [60]:
class linear_regression:
    def __init__(self, X, y):
        self.X = X
        self.y = y
        return
    
    def fit(self):
        # Add a column of ones to the X matrix for the bias term
        X = np.hstack((np.ones((self.X.shape[0], 1)), self.X))
        # Calculate the weights
        self.w = np.linalg.inv(X.T @ X) @ X.T @ self.y
        return self.w
    
    def predict(self, X):
        # Add a column of ones to the X matrix for the bias term
        X = np.hstack((np.ones((X.shape[0], 1)), X))
        return X @ self.w

# Selecting features based on coefficients

In [61]:
'''
1. Identify the top D features with the hightest abosolute coefficients values.
2. Positive coefficients indicate that the word is associated with positive reviews, while negative coefficients indicate that the word is associated with negative reviews.
'''
def top_features(coefficients, vocab, D=100, positive=True):
    if (positive):
        top_indices = np.argsort(coefficients)[-D:]
    else:
        top_indices = np.argsort(coefficients)[:D]
    top_words = [word for i, word in enumerate(vocab) if i in top_indices]
    return top_words

In [62]:
# Train the linear regression model
model = linear_regression(train_vectors, train_labels)
coefficients = model.fit()

# Get the top positive features, if you want to get the top negative, change the Positve=False
top_words = top_features(coefficients, filtered_vocab, 100, True)

# Print the top features
print(top_words)

['apparently', 'nudity', 'social', 'enjoyed', 'listen', 'historical', 'difference', 'typical', 'accept', 'director', 'mad', 'directing', 'mood', 'seen', 'while', 'scenes,', 'so.', '.', 'perfectly', 'felt', 'out', 'special', 'reading', 'sick', 'music', 'asks', 'old', 'thing,', 'certain', 'all,', 'nasty', 'does', 'basically', 'war', 'filled', 'also,', 'aspects', 'dark', 'space', 'expecting', 'public', 'large', 'role,', 'flick.', 'better', 'avoid', 'place', 'light', "couldn't", 'room', 'no', 'right.', 'held', 'age', 'set', 'sequences', 'when', 'saying', 'act', 'great.', 'audiences', 'once', 'course', 'pace', 'watch.', 'miss', 'touching', 'kills', 'long', 'together.', 'awful.', 'shame', 'quickly', 'end,', 'three', 'that,', 'none', 'offers', 'big', 'tale', 'ever', 'confused', 'wife', 'basic', 'create', 'know', 'good', 'point.', 'had', 'has', 'that.', 'brilliant', 'moral', 'behind', 'keeps', 'itself.', 'knows', 'bad.', 'hands', 'films']


# Loading the dataset 20newsgroups

In [63]:
categories = ['comp.graphics', 'rec.sport.baseball', 'sci.med', 'talk.politics.guns', 'misc.forsale']
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'), shuffle=True, random_state=42)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories, remove=('headers', 'footers', 'quotes'), shuffle=True, random_state=42)

# Preprocessing

In [64]:
# Filter the vocabularies and Create the feature vectors using sklearn library this time
# TfidfVectorizer combines TfidfTransformer and CountVectorizer
vectorizer = TfidfVectorizer(max_features=500)  # Limit to 500 features, you can change it to any number you want
X_train = vectorizer.fit_transform(newsgroups_train.data)
X_test = vectorizer.transform(newsgroups_test.data)

# Select the top features using mutual information

In [84]:
def top_features_per_class(X_train, y_train, num_features_per_class=10):
    num_classes = np.unique(y_train).shape[0]
    top_features_per_class = set()

    for _class in range(num_classes):
        # Create a binary label for the current class
        y_binary = (y_train == _class).astype(int)

        # Calculate the mutual information score for each feature
        mi_score = mutual_info_classif(X_train, y_binary, discrete_features='auto')

        # Get the indices of the top features
        top_features = np.argsort(mi_score)[-num_features_per_class:]
        top_features_per_class.update(top_features)

    # Convert to a sorted list to maintain consistency
    selected_indices = sorted(list(top_features_per_class))
    
    return selected_indices

In [85]:
selected_indices = top_features_per_class(X_train, newsgroups_train.target, 10)
feature_names = np.array(vectorizer.get_feature_names_out())
top_words = feature_names[list(selected_indices)]

