# 1. Data Acquisition

### 1.1 Import libraries

In [None]:
import tarfile
import os
import re
import pandas as pd
import numpy as np

pd.set_option('display.max_colwidth', None)

# Pre-processing libraries
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Import Vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
!pip install gensim
from gensim.models import Word2Vec

# ML models
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# For evaluation of models
from sklearn.metrics import accuracy_score, classification_report

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.6/26.6 MB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[2K   [90m━━━━━━━━━━━

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
# Location of the compressed IMDB archive (read from the mounted Drive path)
folder_path = '/content/drive/MyDrive/Atomcamp/aclImdb_v1.tar.gz'

# Unpack the gzip-compressed tarball under /content/
with tarfile.open(folder_path, 'r:gz') as tar:
  # filter='data' refuses absolute paths, '..' traversal and special files,
  # and avoids the Python 3.12 DeprecationWarning for unfiltered extraction.
  tar.extractall(path='/content/', filter='data')

### 1.2 Extract and Load the dataset
The `train` and `test` folders each have subfolders `pos` and `neg`, and each subfolder contains one txt file per review. I will write a function that loads all the txt files in a folder so they can be placed in a pandas DataFrame.

In [None]:
def load_data(folder_path, label):
  """Read every review file in a folder and pair its text with a label.

  Args:
    folder_path: Directory that holds one text file per review.
    label: Value attached to every review read from this folder.

  Returns:
    A list of (review_text, label) tuples, ordered by file name.
  """
  data = []
  # os.listdir returns entries in arbitrary order; sorting keeps the row
  # order identical from run to run.
  for file_name in sorted(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, file_name)
    if not os.path.isfile(file_path):
      continue  # ignore sub-directories instead of failing on open()
    # Explicit UTF-8 so decoding does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file_content:
      data.append((file_content.read(), label))
  return data

# Train data
# Training split: negative reviews are labelled 0, positive reviews 1
neg_data, pos_data = (
  load_data('/content/aclImdb/train/neg', 0),
  load_data('/content/aclImdb/train/pos', 1),
)
train_data = [*neg_data, *pos_data]

# Test split: same layout; neg_data/pos_data are rebound to the test reviews
neg_data, pos_data = (
  load_data('/content/aclImdb/test/neg', 0),
  load_data('/content/aclImdb/test/pos', 1),
)
test_data = [*neg_data, *pos_data]

### 1.3 Clean HTML Tags and create DataFrame of the dataset
When I explored the data, there were HTML tags present in the reviews. I will use the `re` library to match the HTML tags and remove them.

In [None]:
# Making a DataFrame for train data
# Making a DataFrame for train data
df_train = pd.DataFrame(train_data, columns=['Review', 'Label'])
# Replace each HTML tag with a space rather than nothing: with an empty
# replacement, "end.<br /><br />Next" becomes "end.Next" and the two words
# are later merged into one token.
df_train['Review'] = df_train['Review'].str.replace(r'<.*?>', ' ', regex=True)

# Making a DataFrame for test data (same cleaning as above)
df_test = pd.DataFrame(test_data, columns=['Review', 'Label'])
df_test['Review'] = df_test['Review'].str.replace(r'<.*?>', ' ', regex=True)

In [None]:
# Preview the first two rows of the test DataFrame
df_test.head(2)


In [None]:
# Preview the first two rows of the train DataFrame
df_train.head(2)

# 2. Data Exploration

In [None]:
# Report the size and the per-class counts of each split
for heading, frame in (('Length of train data: ', df_train), ('\nLength of test data: ', df_test)):
  labels = frame['Label']
  print(heading, len(frame))
  print('Positive Classes: ', int((labels == 1).sum()))
  print('Negative Classes: ', int((labels == 0).sum()))

### Data splitting

The original data has equal numbers of samples in the train and test sets (25000 samples each). However, since it is required to split the data into train, validation and test sets in a 70:10:20 ratio, I'll first combine all the data and then split it according to this ratio.

In [None]:
# Pool the train and test frames so the 70:10:20 split can be drawn from all rows.
# NOTE: df_train/df_test are rebound below, so re-running this cell without
# re-running the loading cells would pool the already-split frames.
df_combined = pd.concat([df_train, df_test], ignore_index=True)

# Step 1: keep 70% for training, hold out 30% (stratified on the label)
df_train, df_val_test = train_test_split(df_combined, random_state=42, test_size=0.3, stratify=df_combined['Label'])
# Step 2: two thirds of the held-out 30% become test (20%), the rest validation (10%)
df_val, df_test = train_test_split(df_val_test, random_state=42, stratify=df_val_test['Label'], test_size=2/3)

# The first line previously printed the train size under the label "Test data length"
print('Train data length: ', len(df_train))
print('Validation data length: ', len(df_val))
print('Test data length: ', len(df_test))



# 3. Preprocessing the data

In [None]:
# Pre-processing function
# Lazily filled cache so the stop-word list and the stemmer are built once,
# not once per review.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
  """Return the (stop-word set, stemmer) pair, building it on first use."""
  if not _PREPROCESS_RESOURCES:
    _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    _PREPROCESS_RESOURCES['stemmer'] = PorterStemmer()
  return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['stemmer']


def preprocess_text(text):
  """Normalise one review into a space-joined string of stemmed tokens.

  Steps: lowercase, drop every character that is not a letter or whitespace,
  tokenize, remove English stop words, then Porter-stem the remaining tokens.

  Args:
    text: Raw review text.

  Returns:
    The stemmed tokens joined by single spaces.
  """
  stop_words, stemmer = _get_preprocess_resources()
  # Convert to lowercase
  text = text.lower()
  # Remove special characters and digits
  text = re.sub(r'[^a-zA-Z\s]', '', text)
  # Tokenize
  tokens = word_tokenize(text)
  # Stop-word removal and stemming in one pass
  stemmed_tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]

  return ' '.join(stemmed_tokens)

In [None]:
# Run the same text normalisation over the Review column of every split
for split_frame in (df_train, df_val, df_test):
  split_frame['Review'] = split_frame['Review'].apply(preprocess_text)

In [None]:
# Preview the first two rows of the train DataFrame after preprocessing
df_train.head(2)

The dataset is now clean. I will now proceed to the feature engineering and modeling part.

# 3. Feature Engineering

### 3.1 Extracting features with BoW and TF-IDF

I will extract features using Bag of Words and TF-IDF Vectorizers, considering both unigrams and bigrams, so that more meaningful features are extracted

In [None]:
# Representation in CountVectorizer
# Bag-of-words counts over unigrams and bigrams; vocabulary learned from train only
count_vec = CountVectorizer(ngram_range=(1,2))
X_train_count = count_vec.fit_transform(df_train['Review'])
X_val_count, X_test_count = (
  count_vec.transform(split['Review']) for split in (df_val, df_test)
)

# TF-IDF weights over the same n-gram range; also fitted on train only
tfidf_vec = TfidfVectorizer(ngram_range=(1,2))
X_train_tfidf = tfidf_vec.fit_transform(df_train['Review'])
X_val_tfidf, X_test_tfidf = (
  tfidf_vec.transform(split['Review']) for split in (df_val, df_test)
)

In [None]:
# (documents, features) for the count matrix, then for the TF-IDF matrix
print(X_train_count.shape, X_train_tfidf.shape, sep='\n')

By `print(X_train_count.shape)`, we got to know that 1722012 features were extracted from 25000 documents.

These features represent the vocabulary size (unigrams and/or n-grams) for vectorization.

Now, I will train a Word2Vec embedding on the reviews and use it to extract dense features.

### 3.2 Extracting features with Word Embeddings

First, I will train a Word2Vec model on the training set vocabulary (custom training), which means `w2v` will contain all the trained model parameters, the vocabulary it learned from the training data, and the word vectors of that vocabulary.

In [None]:
# Preprocessing train data for Word2Vec
# Word2Vec expects each document as a list of tokens; the preprocessed reviews
# are space-joined strings, so split() recovers the token lists.
preprocessed_train_set = [review.split() for review in df_train['Review']]

preprocessed_test_set = [review.split() for review in df_test['Review']]

# Training Word2Vec model on the training reviews only.
# `workers` is the number of training threads: with workers=0 no thread ever
# consumes the training jobs, so the vectors stay at their random
# initialisation. A single worker also keeps the run order deterministic.
w2v = Word2Vec(sentences=preprocessed_train_set, vector_size=5, window=2, min_count=1, workers=1)

# To inspect the learned vector of a word: w2v.wv['natural']

In [None]:
# Making a function for averaging vectors in a single doc to obtain document-level vector embeddings
def average_vectors_in_doc(doc, model, vector_size):
  """Collapse a tokenised document into one vector by averaging word vectors.

  Args:
    doc: Iterable of tokens.
    model: Object whose `wv` attribute supports `token in wv` and `wv[token]`.
    vector_size: Length of the zero vector returned when no token is known.

  Returns:
    The element-wise mean of the vectors of all tokens found in `model.wv`,
    or a zero vector of length `vector_size` if none were found.
  """
  embeddings = model.wv
  known_vectors = [embeddings[token] for token in doc if token in embeddings]
  # Guard clause: a review may be empty after preprocessing, or contain only
  # out-of-vocabulary tokens
  if not known_vectors:
    return np.zeros(vector_size)
  return np.mean(known_vectors, axis=0)

# Now passing every review to this function
# Now passing every review to this function.
# The fallback zero-vector length is read from the trained model instead of a
# hard-coded 5, so it cannot drift out of sync with Word2Vec's vector_size.
X_train_dense = np.array([average_vectors_in_doc(review, w2v, w2v.vector_size) for review in preprocessed_train_set])

X_test_dense = np.array([average_vectors_in_doc(review, w2v, w2v.vector_size) for review in preprocessed_test_set])

In [None]:
# (documents, embedding size) for the train matrix, then for the test matrix
print(X_train_dense.shape, X_test_dense.shape, sep='\n')

# 4. Multinomial Naive Bayes model


In [None]:
# Extracting the target labels from the dataframes
# Pull the target column out of each split in one pass
y_train, y_val, y_test = (split['Label'] for split in (df_train, df_val, df_test))

### 4.1 Train Naive Bayes on CountVectorizer

In [None]:
# Building the model
# Fit Naive Bayes on the bag-of-words counts (fit returns the estimator itself)
nb_classifier_count = MultinomialNB().fit(X_train_count, y_train)

# Score the fitted model on the test matrix
y_pred = nb_classifier_count.predict(X_test_count)
accuracy_nb_count = accuracy_score(y_test, y_pred)

print(accuracy_nb_count)
print(classification_report(y_true=y_test, y_pred=y_pred, target_names=['Negative', 'Positive']))


### 4.2 Train Naive Bayes on TF-IDF

In [None]:
# Building the model
# Building the model on the TF-IDF features.
# The original cell fitted and predicted on the CountVectorizer matrices, so
# it silently repeated the bag-of-words experiment under the TF-IDF name.
nb_classifier_tfidf = MultinomialNB()
nb_classifier_tfidf.fit(X_train_tfidf, y_train)

# Predicting on the test set (TF-IDF representation)
y_pred = nb_classifier_tfidf.predict(X_test_tfidf)
accuracy_nb_tfidf = accuracy_score(y_test, y_pred)

print(accuracy_nb_tfidf)
print(classification_report(y_test, y_pred, target_names=['Negative', 'Positive']))


# 5. LinearSVM Model

### 5.1 Train LinearSVM on CountVectorizer

In [None]:
# Fit a linear SVM on the bag-of-words counts (fit returns the estimator itself)
svm_count = LinearSVC().fit(X_train_count, y_train)

# Score the fitted model on the test matrix
y_pred = svm_count.predict(X_test_count)
accuracy_count_svm = accuracy_score(y_test, y_pred)

print("Linear SVM with CountVectorizer:")
print("Accuracy:", accuracy_count_svm)
print(classification_report(y_true=y_test, y_pred=y_pred, target_names=['Negative', 'Positive']))


### 5.2 Train LinearSVM on TF-IDF

In [None]:
# Fit a linear SVM on the TF-IDF features (fit returns the estimator itself)
svm_tfidf = LinearSVC().fit(X_train_tfidf, y_train)

# Score the fitted model on the test matrix
y_pred = svm_tfidf.predict(X_test_tfidf)
accuracy_tfidf_svm = accuracy_score(y_test, y_pred)

print("Linear SVM with TF-IDF:")
print("Accuracy:", accuracy_tfidf_svm)
print(classification_report(y_true=y_test, y_pred=y_pred, target_names=['Negative', 'Positive']))

### 5.3 Train LinearSVM with Word2Vec

In [None]:
# Fit a linear SVM on the averaged Word2Vec document vectors
svm_w2v = LinearSVC().fit(X_train_dense, y_train)

# Score the fitted model on the dense test matrix
y_pred_w2v_svm = svm_w2v.predict(X_test_dense)
accuracy_w2v_svm = accuracy_score(y_test, y_pred_w2v_svm)

print("Linear SVM with Word2Vec:")
print("Accuracy:", accuracy_w2v_svm)
print(classification_report(y_true=y_test, y_pred=y_pred_w2v_svm, target_names=['Negative', 'Positive']))


# 6. Models comparison

In [None]:
# One (feature representation, model, accuracy) row per experiment
experiment_rows = [
  ("Bag of Words", "Naive Bayes", accuracy_nb_count),
  ("TF-IDF", "Naive Bayes", accuracy_nb_tfidf),
  ("Bag of Words", "LinearSVM", accuracy_count_svm),
  ("TF-IDF", "LinearSVM", accuracy_tfidf_svm),
  ("Word Embeddings", "LinearSVM", accuracy_w2v_svm),
]

# Turn the rows into the column-oriented dict that the DataFrame is built from
feature_names, model_names, accuracy_scores = (list(column) for column in zip(*experiment_rows))
results = {
  "Feature Representation": feature_names,
  "Model": model_names,
  "Accuracy Score": accuracy_scores,
}

results_df = pd.DataFrame(results)
print(results_df)


# 7. Analysis and Discussion
**7.1 Compare generative vs. discriminative performance.**

Overall, LinearSVM (the discriminative model) has performed better than Naive Bayes (the generative model). Its accuracy is higher with the sparse representations. However, it has performed poorly with the dense vector representation.

**7.2 Discuss how N‑gram size and embedding choice affected results.**