In [1]:
# ----------------------------
# Step 0: Install Requirements
# ----------------------------
# Run this in your environment if needed (covers every third-party import below):
# %pip install nltk tensorflow pandas numpy scikit-learn

# ----------------------------
# Step 1: Import Libraries
# ----------------------------
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

# ----------------------------
# Step 2: Load Dataset
# ----------------------------
# Path is resolved relative to the notebook's working directory. The later
# steps read the "review" and "sentiment" columns from this frame.
dataset_path = "dataset1.csv"
df = pd.read_csv(dataset_path)

# ----------------------------
# Step 3: Text Preprocessing
# ----------------------------
def preprocess_text(text):
    """Normalize one raw review into a lowercase, letters-only token string.

    The text is lowercased, every character that is not an ASCII letter or
    whitespace is removed, the remainder is tokenized with NLTK's
    ``word_tokenize`` and the tokens are re-joined with single spaces.

    Args:
        text: The raw review. Any non-string value (``None``, ``NaN``,
            numbers, ...) is treated as an empty review instead of raising.

    Returns:
        str: The cleaned review, or ``""`` when the input is not a string.
    """
    # Generalizes the original float-only (NaN) check: None or any other
    # non-string cell would otherwise fail on .lower() below.
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # Punctuation and digits are deleted without inserting a space, so
    # "don't" becomes "dont".
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = word_tokenize(text)
    return " ".join(words)

# fillna("") replaces missing reviews before the per-row cleaner runs.
df["cleaned_review"] = df["review"].fillna("").apply(preprocess_text)
# Drop reviews that are empty after cleaning. The explicit .copy() makes the
# filtered frame independent of the original, so later column assignments on
# `df` (e.g. the label-encoding step) do not raise SettingWithCopyWarning.
df = df[df["cleaned_review"].str.strip() != ""].copy()

# Debug
print("✅ Number of reviews:", len(df))
print("✅ Sample cleaned reviews:\n", df["cleaned_review"].head())

# ----------------------------
# Step 4: Label Encoding
# ----------------------------
# Normalize surrounding whitespace and letter case so variants such as
# " positive" or "NEGATIVE" still match the mapping keys below.
sentiment_text = df["sentiment"].str.strip().str.capitalize()
# assign() builds a new frame instead of writing into `df` in place, which
# avoids SettingWithCopyWarning when `df` is a filtered slice of the raw data.
df = df.assign(sentiment=sentiment_text.map({"Positive": 1, "Negative": 0}))
# Any value that is neither label (including missing values) maps to NaN
# and is removed; report the count so the loss is not silent.
rows_before = len(df)
df = df.dropna(subset=["sentiment"])
if len(df) < rows_before:
    print(f"Dropped {rows_before - len(df)} rows with unrecognized sentiment labels")
y = df["sentiment"].astype(int)

# ----------------------------
# Step 5: Tokenization & Padding
# ----------------------------
# Features are the cleaned review strings; 20% of the rows are held out
# for testing with a fixed random_state.
X = df["cleaned_review"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fixed sequence length fed to the network (adjust to the average review length).
maxlen = 100

# The vocabulary is fitted on the training split only.
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Text -> integer id sequences, same tokenizer for both splits.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_test)
)

# Bring every sequence to `maxlen` ids, padding at the end.
X_train_pad, X_test_pad = (
    pad_sequences(seq, maxlen=maxlen, padding='post')
    for seq in (X_train_seq, X_test_seq)
)

# ----------------------------
# Step 6: Build CNN Model
# ----------------------------
# The tokenizer never emits an index >= tokenizer.num_words (rarer words are
# replaced by the OOV index), so the embedding table only needs that many
# rows. Sizing it from the full word_index would allocate unused weights.
word_count = len(tokenizer.word_index) + 1  # +1 for the padding index 0
vocab_size = min(tokenizer.num_words, word_count) if tokenizer.num_words else word_count

model = Sequential([
    # `input_length` is deprecated in Keras 3; the input shape is supplied
    # through model.build() below instead.
    Embedding(input_dim=vocab_size, output_dim=128),
    Conv1D(128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Build explicitly so summary() can report output shapes and parameter
# counts before training starts.
model.build(input_shape=(None, maxlen))
model.summary()

# ----------------------------
# Step 7: Train the CNN Model
# ----------------------------
# Train for 5 epochs in mini-batches of 32; 10% of the training rows are
# held back for per-epoch validation.
# NOTE(review): in the run saved with this notebook, val_loss rises after
# epoch 2 while the training loss keeps falling — consider early stopping.
model.fit(
    X_train_pad,
    y_train,
    epochs=5,
    batch_size=32,
    validation_split=0.1,
)

# ----------------------------
# Step 8: Evaluate the Model
# ----------------------------
# The sigmoid output is thresholded at 0.5 to obtain hard 0/1 predictions,
# flattened to one dimension to line up with y_test.
y_pred_prob = model.predict(X_test_pad)
y_pred = (y_pred_prob.ravel() > 0.5).astype(int)

acc = accuracy_score(y_test, y_pred)
print(f"\n🔹 CNN Model Accuracy: {acc:.4f}")
report = classification_report(y_test, y_pred)
print("\n🔹 Classification Report:\n", report)


[nltk_data] Downloading package punkt to C:\Users\CHRISTIN
[nltk_data]     SANTHOSH\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


✅ Number of reviews: 10673
✅ Sample cleaned reviews:
 0    too shity it is incredible that this fucking s...
1    w game but bad on steam version nonsteam bette...
2    why is this game so boring there are no bots i...
3    games peace is too slow and weapons are too in...
5    the game is great but standstill crouching wit...
Name: cleaned_review, dtype: object




Epoch 1/5
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 31ms/step - accuracy: 0.8502 - loss: 0.4309 - val_accuracy: 0.8876 - val_loss: 0.2911
Epoch 2/5
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 28ms/step - accuracy: 0.8941 - loss: 0.2742 - val_accuracy: 0.8923 - val_loss: 0.2641
Epoch 3/5
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 29ms/step - accuracy: 0.9309 - loss: 0.1915 - val_accuracy: 0.8899 - val_loss: 0.2732
Epoch 4/5
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 26ms/step - accuracy: 0.9478 - loss: 0.1521 - val_accuracy: 0.8899 - val_loss: 0.2931
Epoch 5/5
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 28ms/step - accuracy: 0.9584 - loss: 0.1228 - val_accuracy: 0.8876 - val_loss: 0.3707
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step  

🔹 CNN Model Accuracy: 0.8848

🔹 Classification Report:
               precision    recall  f1-score   support

 