In [4]:
# Data handling
import pandas as pd
import numpy as np
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Model building
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, GRU, Dense, Dropout, Bidirectional

# Training support
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [5]:

CSV_PATH = "IMDB Dataset.csv"   # e.g., "/content/IMDB Dataset.csv"
df = pd.read_csv(CSV_PATH)
print("✅ CSV loaded:", df.shape)
print(df.head())


✅ CSV loaded: (50000, 2)
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [6]:


# 2) Minimal cleaning
# ----------------------------
def preprocessing(tweet):
  if not isinstance(tweet, str):
    return ""

  tweet = tweet.lower()
  nopun = "".join([char for char in tweet  if not char in string.punctuation ])

  STOPWORDS = stopwords.words("english") + [
        'u', 'ü', 'ur', '4', '2', 'im', 'dont', 'doin', 'ure'  # Social media specific
    ]

  words = [word for word in nopun.split() if word not in STOPWORDS]
  lemmatizer = WordNetLemmatizer()
  words = [lemmatizer.lemmatize(word) for word in words]
  return " ".join(words)

df['review_clean'] = df['review'].astype(str).apply(preprocessing)
df.head()


print("✅ Preprocessing done")
df.head()



✅ Preprocessing done


Unnamed: 0,review,sentiment,review_clean
0,One of the other reviewers has mentioned that ...,positive,one reviewer mentioned watching 1 oz episode y...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production br br filming tech...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically there family little boy jake think t...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love time money visually stunni...


In [7]:

binary_mapping = {'positive': 1 , 'negative':0}
df = df[df['sentiment'].isin(binary_mapping.keys())].copy()
df['label']= df['sentiment'].map(binary_mapping)

df_train_data = df["review_clean"]
df_test_lable = df["label"]
print("Class counts:", df['label'].value_counts().to_dict())
print("✅ Labels ready")


Class counts: {1: 25000, 0: 25000}
✅ Labels ready


In [8]:
X_train_text, X_val_text, y_train, y_val = train_test_split(
    df['review_clean'].values,
    df['label'].values,
    test_size=0.2,
    random_state=42,
    stratify=df['label'].values
)

In [9]:
# Replace TF-IDF with tokenization + padding
MAX_WORDS = 30000
MAX_LEN   = 200

tok = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tok.fit_on_texts(X_train_text)

def to_pad(texts):
    return pad_sequences(tok.texts_to_sequences(texts), maxlen=MAX_LEN, padding="post", truncating="post")

X_train = to_pad(X_train_text)
X_val   = to_pad(X_val_text)


In [10]:
# RNN (BiLSTM) model, compile, train, evaluate
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

EMB_DIM = 128
UNITS   = 128
DROPOUT = 0.3

model = Sequential([
    Embedding(input_dim=min(MAX_WORDS, len(tok.word_index)+1), output_dim=EMB_DIM, input_length=MAX_LEN),
    Bidirectional(LSTM(UNITS)),
    Dropout(DROPOUT),
    Dense(128, activation="relu"),
    Dropout(DROPOUT),
    Dense(1, activation="sigmoid")
])

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

cb = [
    EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True),
    ModelCheckpoint("best_model.keras", monitor="val_loss", save_best_only=True)
]

model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=8, batch_size=64, callbacks=cb, verbose=1)

val_loss, val_acc = model.evaluate(X_val, y_val, verbose=0)
print(f"val_acc: {val_acc:.4f}")


Epoch 1/8




[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m506s[0m 802ms/step - accuracy: 0.7603 - loss: 0.4758 - val_accuracy: 0.8790 - val_loss: 0.3012
Epoch 2/8
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m492s[0m 788ms/step - accuracy: 0.9320 - loss: 0.1969 - val_accuracy: 0.8766 - val_loss: 0.3079
Epoch 3/8
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m497s[0m 780ms/step - accuracy: 0.9571 - loss: 0.1246 - val_accuracy: 0.8838 - val_loss: 0.3573
val_acc: 0.8790


In [11]:
# 1) Save artifacts
import json
model.save("final_model.keras")

tok_json = tok.to_json()
with open("tokenizer.json","w") as f: f.write(tok_json)

classes = {0:"negative", 1:"positive"}
with open("label_map.json","w") as f: json.dump(classes, f)


In [12]:
# 2) Inference helper
from tensorflow.keras.preprocessing.sequence import pad_sequences

def predict_texts(texts, th=0.5):
    seq = tok.texts_to_sequences(texts)
    pad = pad_sequences(seq, maxlen=MAX_LEN, padding="post", truncating="post")
    prob1 = model.predict(pad).reshape(-1)
    pred  = (prob1 >= th).astype(int)
    labs  = [classes[int(i)] for i in pred]
    conf  = [p if y==1 else 1-p for p,y in zip(prob1,pred)]
    return list(zip(texts, labs, conf))
# Example:
# print(predict_texts(["I loved it!", "Terrible movie."]))


In [13]:
# 3) (Optional) Export predictions for all rows
all_probs = model.predict(X_val).reshape(-1)
all_pred  = (all_probs >= 0.5).astype(int)
out = pd.DataFrame({
    "review": X_val_text,
    "true":   [classes[int(i)] for i in y_val],
    "pred":   [classes[int(i)] for i in all_pred],
    "conf":   [p if y==1 else 1-p for p,y in zip(all_probs, all_pred)]
})
out.to_csv("val_predictions.csv", index=False)
print("Saved: final_model.keras, tokenizer.json, label_map.json, val_predictions.csv")


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 162ms/step
Saved: final_model.keras, tokenizer.json, label_map.json, val_predictions.csv
