In [5]:
# Use the %pip magic (not !pip) so the packages are installed into the environment
# of the running kernel rather than whichever `pip` happens to be first on the shell PATH.
%pip install datasets scikit-learn numpy nltk gensim

Collecting datasets
  Using cached datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m13.4 MB/s[0m eta [36m0:0

In [13]:
from datasets import load_dataset
import numpy as np
import nltk
nltk.download('punkt')

# Sampling configuration, kept in one place so the subset is easy to resize or reseed
SHUFFLE_SEED = 42
N_SAMPLES = 5000

# Load the full train split and shuffle it deterministically
dataset = load_dataset("imdb", split="train").shuffle(seed=SHUFFLE_SEED)

# Keep the first N_SAMPLES shuffled reviews (clamped to the dataset size).
# This is a plain random sample, not a stratified one, so the two classes come
# out only approximately balanced; the counts printed below show the real split.
dataset = dataset.select(range(min(N_SAMPLES, len(dataset))))
texts = np.array(dataset['text'])
labels = np.array(dataset['label'])

# Check overall label distribution in the subset
print("Overall classes:", np.unique(labels))
print("Overall class distribution:", np.bincount(labels))



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Overall classes: [0 1]
Overall class distribution: [2494 2506]


In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for testing; stratifying on the labels keeps the
# class proportions the same in the train and test portions
split_parts = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)
X_train, X_test, y_train, y_test = split_parts

# Report which classes appear in each portion and how many examples each class has
print("Training classes:", np.unique(y_train))
print("Training class distribution:", np.bincount(y_train))
print("Test classes:", np.unique(y_test))
print("Test class distribution:", np.bincount(y_test))


Training classes: [0 1]
Training class distribution: [1995 2005]
Test classes: [0 1]
Test class distribution: [499 501]


In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Bag-of-Words features: the vocabulary (at most 5000 terms, English stop words
# removed) is fitted on the training reviews only and then reused for the test set
vectorizer = CountVectorizer(stop_words='english', max_features=5000)
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# Fit the logistic regression classifier on the count features
lr_bow = LogisticRegression(max_iter=1000).fit(X_train_bow, y_train)

# Score the held-out reviews
y_pred_bow = lr_bow.predict(X_test_bow)
accuracy_bow = accuracy_score(y_true=y_test, y_pred=y_pred_bow)
print("BOW Model Accuracy:", accuracy_bow)
print(classification_report(y_test, y_pred_bow))


BOW Model Accuracy: 0.823
              precision    recall  f1-score   support

           0       0.84      0.80      0.82       499
           1       0.81      0.84      0.83       501

    accuracy                           0.82      1000
   macro avg       0.82      0.82      0.82      1000
weighted avg       0.82      0.82      0.82      1000



In [17]:
import gensim.downloader as api
import nltk
# Fetch the Punkt tokenizer data packages ('punkt' and 'punkt_tab') before tokenizing
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

from nltk.tokenize import word_tokenize

# Pretrained 100-dimensional GloVe vectors, fetched through the gensim downloader;
# the vector width is read back from the loaded model rather than hard-coded
GLOVE_MODEL_NAME = "glove-wiki-gigaword-100"
embedding_model = api.load(GLOVE_MODEL_NAME)
embedding_dim = embedding_model.vector_size

# Function to compute the average embedding for a document
def document_embedding(doc, model=None, tokenizer=None):
    """Return the mean word vector of a document.

    The document is lower-cased and tokenized; every token found in the
    embedding model contributes its vector, and the vectors are averaged
    element-wise. Tokens missing from the model are ignored.

    Parameters
    ----------
    doc : str
        Raw document text.
    model : optional
        Word-vector lookup supporting ``word in model``, ``model[word]`` and a
        ``vector_size`` attribute. Defaults to the notebook-level
        ``embedding_model``.
    tokenizer : callable, optional
        Function mapping a string to a sequence of tokens. Defaults to the
        ``word_tokenize`` function imported by the notebook.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.vector_size``. An all-zero float32 vector
        is returned when no token of the document is in the model.
    """
    # Resolve the defaults at call time so the notebook globals are only
    # required when the caller does not supply its own model / tokenizer.
    if model is None:
        model = embedding_model
    if tokenizer is None:
        tokenizer = word_tokenize

    tokens = tokenizer(doc.lower())
    vectors = [model[word] for word in tokens if word in model]
    if not vectors:
        # float32 matches the dtype gensim uses for loaded word vectors by
        # default, so a single out-of-vocabulary document no longer upcasts
        # the whole stacked feature matrix to float64.
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)

# Embed every review: each row of the resulting matrix is the averaged
# word vector of one document
X_train_cbow = np.array(list(map(document_embedding, X_train)))
X_test_cbow = np.array(list(map(document_embedding, X_test)))

# Sanity-check the feature matrix dimensions
print("Training CBOW features shape:", X_train_cbow.shape)
print("Test CBOW features shape:", X_test_cbow.shape)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Training CBOW features shape: (4000, 100)
Test CBOW features shape: (1000, 100)


In [18]:
# Fit a logistic regression classifier on the averaged-embedding features
lr_cbow = LogisticRegression(max_iter=1000).fit(X_train_cbow, y_train)

# Score the held-out reviews with the embedding-based model
y_pred_cbow = lr_cbow.predict(X_test_cbow)
accuracy_cbow = accuracy_score(y_true=y_test, y_pred=y_pred_cbow)
print("CBOW Model Accuracy:", accuracy_cbow)
print(classification_report(y_test, y_pred_cbow))


CBOW Model Accuracy: 0.785
              precision    recall  f1-score   support

           0       0.78      0.80      0.79       499
           1       0.79      0.77      0.78       501

    accuracy                           0.79      1000
   macro avg       0.79      0.79      0.78      1000
weighted avg       0.79      0.79      0.78      1000

