Import the necessary libraries

In [8]:
from datasets import load_dataset

# The dataset is pulled from the Hugging Face Hub. If access is refused,
# authenticate first (e.g. run `huggingface-cli login`) and re-run this cell.
ds = load_dataset(path="ajaykarthick/imdb-movie-reviews")

Information about the dataset
Display how many 1s and 0s are in the label column

In [9]:
# Show the available splits with their columns and row counts
print(ds)

# Count how many training reviews carry each label value (0 and 1)
train_labels = ds['train']['label']
num_zeros = sum(1 for label in train_labels if label == 0)
num_ones = sum(1 for label in train_labels if label == 1)

print(f"Number of 0s: {num_zeros}")
print(f"Number of 1s: {num_ones}")

DatasetDict({
    train: Dataset({
        features: ['review', 'label'],
        num_rows: 40000
    })
    test: Dataset({
        features: ['review', 'label'],
        num_rows: 10000
    })
})
Number of 0s: 20000
Number of 1s: 20000


Preprocessing for sentiment analysis

In [10]:
import re
# Remove HTML line-break tags.
# The pattern matches <br>, <br/> and <br /> in any letter case. It is compiled
# once here so it is not rebuilt for every row passed through Dataset.map.
_BR_TAG_RE = re.compile(r'<br\s*/?>', re.IGNORECASE)


def remove_html_tags(text):
    """Replace HTML line-break tags in the 'review' field with a single space.

    Only ``<br>`` variants are handled; any other markup is left untouched.
    A space (not an empty string) is substituted so that the words on either
    side of a tag do not get glued together.

    Args:
        text: A mapping representing one example; must contain a string under
            the key ``"review"``. Other keys are carried over unchanged.

    Returns:
        A new dict with the same keys as ``text`` and the cleaned review.
        The input mapping itself is not modified.
    """
    return {**text, "review": _BR_TAG_RE.sub(' ', text["review"])}

# Run the <br>-tag cleanup over every review in the training split
clear_output = ds['train'].map(function=remove_html_tags)

Print the first row to check that the preprocessing works

In [11]:
# Sanity check: look at the first training example after the <br> cleanup
first_example = clear_output[0]
print(first_example)

{'review': "Ms Aparna Sen, the maker of Mr & Mrs Iyer, directs this movie about a young girl's struggle to cope with her debilitating condition.  Meethi (Konkona Sen) has been an aloof kid ever since childhood and has shown signs of delusion, no one knows why. The dormant tendency however slips out of control, when the job assignment takes her to neighboring Bihar where she's raped by some political goons. The resulting trauma also leads to episodes of manic-depressive psychosis in addition to her schizophrenia. She careens out of control over the years, progressively getting worse and sinking deeper into her private 'world'.  The juxtaposition of an 'unsettled' (divorced) elder sister and how her domineering ways make an already bad situation worse, is indicative of what a fine line there is between abnormal and *seemingly normal*. Ms Sen also makes an excellent commentary on the social alienation of such individuals. Social rehab is standard therapy along with all the deadly mind-alt

Remove punctuation and convert everything to lowercase

In [12]:
def simple_text_cleaning(Review):
    """Lowercase the 'review' text and strip punctuation/special characters.

    After lowercasing, every character that is not ``a-z``, ``0-9`` or
    whitespace is deleted (not replaced by a space). As a consequence
    apostrophes and hyphens vanish ("it's" -> "its", "well-known" ->
    "wellknown") and non-ASCII letters are dropped as well.

    Args:
        Review: A mapping representing one example; must contain a string
            under the key ``"review"``. Other keys are carried over unchanged.

    Returns:
        A new dict with the same keys as ``Review`` and the cleaned review.
        The input mapping itself is not modified.
    """
    lowered = Review["review"].lower()

    # A single negated character class removes everything outside the
    # allowed set in one pass.
    cleaned = re.sub(r'[^a-z0-9\s]', '', lowered)

    return {**Review, "review": cleaned}

# Lowercase and strip punctuation from the <br>-free training reviews
cleaned_dataset = clear_output.map(function=simple_text_cleaning)
# Quick check of the first cleaned example (disabled to keep the output short):
# print(cleaned_dataset[0])

Now it is time for vectorization

In [27]:
# TF-IDF vectorization with unigrams + bigrams for Naive Bayes
from sklearn.feature_extraction.text import TfidfVectorizer

# Training texts/labels come from the already cleaned training split
# (HTML breaks removed, lowercased, punctuation stripped).
# Whole columns are read directly instead of iterating row by row, which
# avoids materialising a Python dict for every example.
train_texts = list(cleaned_dataset['review'])
y_train = list(cleaned_dataset['label'])

# Apply the same two preprocessing steps to the test split
cleaned_test = ds['test'].map(remove_html_tags).map(simple_text_cleaning)
test_texts = list(cleaned_test['review'])
y_test = list(cleaned_test['label'])

# Fit the vocabulary and IDF weights on the training texts only, then reuse
# them for the test texts so no test information leaks into the features.
# max_features caps the vocabulary size; tune as needed.
tfidf_vect = TfidfVectorizer(ngram_range=(1, 2), max_features=20000)
X_train_tfidf = tfidf_vect.fit_transform(train_texts)
X_test_tfidf = tfidf_vect.transform(test_texts)

print('TF-IDF train shape:', X_train_tfidf.shape)
print('TF-IDF test shape :', X_test_tfidf.shape)






TF-IDF train shape: (40000, 20000)
TF-IDF test shape : (10000, 20000)


In [None]:
# Multinomial Naive Bayes on the TF-IDF features
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np


# Fit the classifier (alpha is the additive smoothing parameter)
nb = MultinomialNB(alpha=1.0).fit(X_train_tfidf, y_train)

# Predict on both splits so train and test accuracy can be compared
y_pred = nb.predict(X_test_tfidf)
y_pred_train = nb.predict(X_train_tfidf)

test_acc = accuracy_score(y_test, y_pred)
train_acc = accuracy_score(y_train, y_pred_train)

# Detailed test-set metrics
print('\nClassification report (test):')
print(classification_report(y_test, y_pred))
print('\nConfusion matrix (test):')
print(confusion_matrix(y_test, y_pred))

# Headline accuracies; a large train/test gap would indicate overfitting
print(f'Naive Bayes Test accuracy: {test_acc:.4f}')
print(f'Naive Bayes Train accuracy: {train_acc:.4f}')



Classification report (test):
              precision    recall  f1-score   support

           0       0.86      0.88      0.87      5000
           1       0.88      0.86      0.87      5000

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000


Confusion matrix (test):
[[4421  579]
 [ 709 4291]]
Naive Bayes Test accuracy: 0.8712
Naive Bayes Train accuracy: 0.8919


Now we use Logistic Regression

In [None]:
# Train and evaluate a Logistic Regression classifier on Bag-of-Words counts
from sklearn.linear_model import LogisticRegression

from sklearn.feature_extraction.text import CountVectorizer
# Imported here as well so this cell does not depend on another cell's imports
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Training data from the cleaned training split. Columns are read directly
# rather than iterating row by row, which avoids building a dict per example.
train_texts = list(cleaned_dataset['review'])
y_train = list(cleaned_dataset['label'])

# Test data goes through the same two preprocessing steps
cleaned_test = ds['test'].map(remove_html_tags).map(simple_text_cleaning)
test_texts = list(cleaned_test['review'])
y_test = list(cleaned_test['label'])

# Vectorize using Bag-of-Words (CountVectorizer) with unigrams + bigrams.
# The vocabulary is fitted on the training texts only and reused for test.
vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=20000)
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)


print('Train matrix shape:', X_train.shape)
print('Test matrix shape:', X_test.shape)

# Train Logistic Regression (increase max_iter if convergence warnings appear)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Evaluate on test set
y_pred_test = model.predict(X_test)
test_acc = accuracy_score(y_test, y_pred_test)
print(f'Test accuracy: {test_acc:.4f}')
print('\nClassification report (test):')
print(classification_report(y_test, y_pred_test))
print('\nConfusion matrix (test):')
print(confusion_matrix(y_test, y_pred_test))

# Training-set accuracy: a large gap to the test accuracy signals overfitting
y_pred_train = model.predict(X_train)
train_acc = accuracy_score(y_train, y_pred_train)
print(f'Train accuracy: {train_acc:.4f}')



Train matrix shape: (40000, 20000)
Test matrix shape: (10000, 20000)
Test accuracy: 0.8909

Classification report (test):
              precision    recall  f1-score   support

           0       0.89      0.90      0.89      5000
           1       0.89      0.89      0.89      5000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000


Confusion matrix (test):
[[4475  525]
 [ 566 4434]]
Train accuracy: 0.9997


In [None]:
# Train a bidirectional LSTM classifier whose Embedding layer is initialised
# from Word2Vec vectors trained on the training reviews.
from gensim.models import Word2Vec
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Seed Python, NumPy and TensorFlow so repeated runs are comparable
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Texts and labels from the cleaned training split. Columns are read directly
# rather than iterating row by row, which avoids building a dict per example.
train_texts = list(cleaned_dataset['review'])
y_train = np.array(list(cleaned_dataset['label']))

# The test split goes through the same two preprocessing steps
cleaned_test = ds['test'].map(remove_html_tags).map(simple_text_cleaning)
test_texts = list(cleaned_test['review'])
y_test = np.array(list(cleaned_test['label']))

# Map words to integer ids with the Keras Tokenizer (fitted on training only)
max_words = 20000
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(train_texts)
# With num_words set, the tokenizer only emits ids strictly below max_words
# (id 0 is reserved for padding), so the embedding needs at most max_words
# rows. For a smaller vocabulary the ids run from 1 to len(word_index).
vocab_size = min(max_words, len(tokenizer.word_index) + 1)

# Convert texts to fixed-length id sequences (pad/truncate at the end)
train_seqs = tokenizer.texts_to_sequences(train_texts)
test_seqs = tokenizer.texts_to_sequences(test_texts)
maxlen = 200
X_train_seq = pad_sequences(train_seqs, maxlen=maxlen, padding='post', truncating='post')
X_test_seq = pad_sequences(test_seqs, maxlen=maxlen, padding='post', truncating='post')

# Train a Word2Vec model on the whitespace-tokenised training reviews.
# NOTE: with workers > 1 gensim training is not fully deterministic even
# with a fixed seed; use workers=1 if exact reproducibility is required.
train_tokens = [txt.split() for txt in train_texts]
w2v_params = dict(vector_size=100, window=5, min_count=2, workers=4, epochs=5, seed=SEED)
w2v = Word2Vec(sentences=train_tokens, **w2v_params)
embed_dim = w2v.vector_size

# Build the embedding matrix aligned with the tokenizer's word ids
embedding_matrix = np.zeros((vocab_size, embed_dim))
for word, i in tokenizer.word_index.items():
    if i >= vocab_size:
        # ids at or above vocab_size are never emitted by the tokenizer
        continue
    if word in w2v.wv.key_to_index:
        embedding_matrix[i] = w2v.wv[word]
    # otherwise the row stays all zeros (padding, '<OOV>', rare words)

# Build the model. An explicit Input layer fixes the sequence length, which
# replaces the deprecated `input_length` argument and makes sure the model is
# built so that summary() and set_weights() work straight away.
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embed_dim, trainable=True)
model = Sequential([
    tf.keras.Input(shape=(maxlen,), dtype='int32'),
    embedding_layer,
    Bidirectional(LSTM(128, return_sequences=False)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid'),
])
# Load the pretrained vectors; trainable=True lets them be fine-tuned.
# set_weights is used instead of the legacy `weights=[...]` constructor
# argument, which is not accepted by every Keras version.
embedding_layer.set_weights([embedding_matrix])
# NOTE(review): sequences are post-padded with id 0 and the Embedding does not
# use mask_zero, so the LSTM also reads the padding steps. Consider
# mask_zero=True; left unchanged here to keep the architecture as it was.

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Callbacks: stop when val_loss has not improved for 2 epochs, and keep the
# best model (by val_loss) on disk
es = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
mc = ModelCheckpoint('best_lstm_w2v.h5', monitor='val_loss', save_best_only=True)

# Train the model (small epochs by default; increase for better performance).
# validation_split holds out the last 10% of the training data for validation.
history = model.fit(X_train_seq, y_train, epochs=6, batch_size=64, validation_split=0.1, callbacks=[es, mc])

# Load the best checkpoint and evaluate on the test set
model.load_weights('best_lstm_w2v.h5')
y_pred_prob = model.predict(X_test_seq, batch_size=64).ravel()
# Threshold the sigmoid output at 0.5 to obtain hard 0/1 predictions
y_pred = (y_pred_prob >= 0.5).astype(int)

print(f'Test accuracy: {accuracy_score(y_test, y_pred):.4f}')
print('\nClassification report (test):')
print(classification_report(y_test, y_pred))
print('\nConfusion matrix (test):')
print(confusion_matrix(y_test, y_pred))



LSTM (Word2Vec embeddings) results

Test accuracy: 0.8862

Classification report (test):
              precision    recall  f1-score   support

           0       0.90      0.87      0.88      5000
           1       0.88      0.90      0.89      5000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000


Confusion matrix (test):
[[4359  641]
 [ 497 4503]]
Train accuracy: 0.9674
Test accuracy: 0.8862
