In [4]:
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed

In [6]:
# Load the review dataset from the working directory into a DataFrame.
DATA_PATH = 'data.csv'
df = pd.read_csv(DATA_PATH)

In [7]:
# Preview the first ten rows to eyeball the loaded columns.
df.head(n=10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [8]:
# Display basic info
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" line.
df.info()
print("\nFirst 5 Rows:")
print(df.head())
print("\nColumn Names:")
print(df.columns.tolist())



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB
None

First 5 Rows:
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive

Column Names:
['review', 'sentiment']


In [9]:
# Check sentiment distribution
print("\nSentiment Distribution:")
if 'sentiment' in df.columns:
    # Number of reviews per label value.
    label_counts = df['sentiment'].value_counts()
    print(label_counts)
else:
    # The label column is absent: list what is available so the name can be corrected.
    print("Error: 'sentiment' column not found.")
    print("Available columns:", df.columns.tolist())
    print("Please update the code to use the correct column name for sentiment (e.g., 'label').")


Sentiment Distribution:
sentiment
positive    25000
negative    25000
Name: count, dtype: int64


In [10]:
# Check for missing values
print("\nMissing Values:")
# Per-column count of null entries.
missing_per_column = df.isna().sum()
print(missing_per_column)


Missing Values:
review       0
sentiment    0
dtype: int64


In [11]:
# Basic statistics for text length (to understand review lengths)
def _word_count(value):
    """Return the number of whitespace-separated tokens in `value`; non-strings count as 0."""
    if not isinstance(value, str):
        return 0
    return len(value.split())

print("\nReview Length Statistics:")
df['review_length'] = df['review'].apply(_word_count)
length_summary = df['review_length'].describe()
print(length_summary)


Review Length Statistics:
count    50000.000000
mean       231.156940
std        171.343997
min          4.000000
25%        126.000000
50%        173.000000
75%        280.000000
max       2470.000000
Name: review_length, dtype: float64


In [14]:
# Download NLTK data
# Resources fetched: tokenizer models ('punkt', plus 'punkt_tab' added for the
# tokenizer), the stop-word list, and WordNet.
for resource_name in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource_name, quiet=True)

# Convert sentiment to binary (1=positive, 0=negative)
def convert_sentiment(label):
    """Map a raw sentiment label to a binary class.

    Args:
        label: Raw label of any type; it is compared by its string form after
            trimming surrounding whitespace and lower-casing.

    Returns:
        1 when the label reads as 'positive', 'pos', '1' or '1.0'
        (the last covers labels that were loaded as floats); 0 for every
        other value, including missing ones.
    """
    positive_labels = ('positive', 'pos', '1', '1.0')
    # strip() so labels such as ' positive ' are not silently treated as negative.
    normalized = str(label).strip().lower()
    return 1 if normalized in positive_labels else 0

# Clean review text
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalise one review into a space-joined string of lemmatised tokens.

    Steps: lower-case, replace HTML-like tags with a space, drop every
    character that is not a-z or whitespace, tokenise, remove English stop
    words, lemmatise each remaining token.

    Args:
        text: Raw review; coerced with str() so non-string values do not raise.

    Returns:
        The cleaned tokens joined by single spaces (may be an empty string).
    """
    text = str(text).lower()
    # Tags become a space, not '': in text like "end.<br /><br />the" deleting
    # the tag outright would glue the neighbours into one bogus token "endthe".
    text = re.sub(r'<.*?>', ' ', text)
    # Remaining punctuation/digits are removed; whitespace is kept as the separator.
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = [lemmatizer.lemmatize(w) for w in word_tokenize(text) if w not in stop_words]
    return ' '.join(tokens)

# Process data
print("Processing data...")
# Element-wise conversion of labels and review text using the helpers above.
df['sentiment'] = df['sentiment'].map(convert_sentiment)
df['cleaned_review'] = df['review'].map(clean_text)

# Check results
print("\nSentiment Distribution:")
print(df['sentiment'].value_counts())
print("\nSample Raw vs. Cleaned:")
print(df[['review', 'cleaned_review']].head())
# Reviews that ended up with no tokens at all after cleaning.
empty_cleaned = df['cleaned_review'].eq('').sum()
print(f"\nEmpty Cleaned Reviews: {empty_cleaned}")

# Save sample
sample_columns = ['review', 'cleaned_review', 'sentiment']
df[sample_columns].head(5).to_csv('cleaned_sample.csv', index=False)
print("Sample saved to 'cleaned_sample.csv'")

Processing data...

Sentiment Distribution:
sentiment
1    25000
0    25000
Name: count, dtype: int64

Sample Raw vs. Cleaned:
                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                      cleaned_review  
0  one reviewer mentioned watching oz episode you...  
1  wonderful little production filming technique ...  
2  thought wonderful way spend time hot summer we...  
3  basically there family little boy jake think t...  
4  petter matteis love time money visually stunni...  

Empty Cleaned Reviews: 0
Sample saved to 'cleaned_sample.csv'


In [15]:
# Save full cleaned dataset
df.to_csv('cleaned_imdb_reviews.csv', index=False)
print("Full cleaned dataset saved to 'cleaned_imdb_reviews.csv'")

# Preprocessing parameters
max_features = 10000  # Vocabulary size
maxlen = 200  # Max review length

y = df['sentiment'].values

# Train/test split
# The split is decided on row positions BEFORE the tokenizer is fitted, so the
# vocabulary can be learned from training reviews only (no test-set leakage).
row_positions = np.arange(len(df))
if 'split' in df.columns:
    train_idx = row_positions[(df['split'] == 'train').values]
    test_idx = row_positions[(df['split'] == 'test').values]
else:
    train_idx, test_idx = train_test_split(
        row_positions, test_size=0.2, stratify=y, random_state=42
    )

# Tokenize and convert to sequences
tokenizer = Tokenizer(num_words=max_features)
# Fit on the training rows only; test reviews are encoded with that vocabulary.
tokenizer.fit_on_texts(df['cleaned_review'].iloc[train_idx])
sequences = tokenizer.texts_to_sequences(df['cleaned_review'])

# Pad sequences
X = pad_sequences(sequences, maxlen=maxlen)

# Index arrays (rather than boolean masks) keep the shuffled row order that
# train_test_split produced.
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# Check shapes
print("Training set shape:", X_train.shape, y_train.shape)
print("Test set shape:", X_test.shape, y_test.shape)
print("Training label distribution:", pd.Series(y_train).value_counts().to_dict())
print("Test label distribution:", pd.Series(y_test).value_counts().to_dict())

Full cleaned dataset saved to 'cleaned_imdb_reviews.csv'
Training set shape: (40000, 200) (40000,)
Test set shape: (10000, 200) (10000,)
Training label distribution: {1: 20000, 0: 20000}
Test label distribution: {0: 5000, 1: 5000}


In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

In [18]:
# Build model
# Seed the random number generators before any weights are created so that
# re-running the notebook gives comparable results.
# NOTE(review): some GPU kernels may still be nondeterministic -- confirm if exact
# reproducibility is required.
set_random_seed(42)

model = Sequential([
    # input_dim comes from max_features so it always matches Tokenizer(num_words=...).
    Embedding(max_features, 128),  # Removed input_length
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # single sigmoid unit: probability of the positive class
])
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train model
print("Training model...")
model.fit(X_train, y_train, epochs=3, batch_size=64, validation_split=0.2, verbose=1)

# Evaluate
print("\nEvaluating model...")
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {test_acc:.4f}")

# Sample prediction
sample_pred = model.predict(X_test[:5], verbose=0)
print("\nSample Predictions (0=neg, 1=pos):")
# Threshold the sigmoid outputs at 0.5 to obtain hard class labels.
print((sample_pred > 0.5).astype(int).flatten())
print("Actual Labels:", y_test[:5])

Training model...
Epoch 1/3
500/500 ━━━━━━━━━━━━━━━━━━━━ 57s 112ms/step - accuracy: 0.7641 - loss: 0.4803 - val_accuracy: 0.8763 - val_loss: 0.3206
Epoch 2/3
500/500 ━━━━━━━━━━━━━━━━━━━━ 49s 98ms/step - accuracy: 0.9137 - loss: 0.2346 - val_accuracy: 0.8816 - val_loss: 0.3098
Epoch 3/3
500/500 ━━━━━━━━━━━━━━━━━━━━ 51s 102ms/step - accuracy: 0.9400 - loss: 0.1638 - val_accuracy: 0.8763 - val_loss: 0.3306

Evaluating model...
Test Accuracy: 0.8776

Sample Predictions (0=neg, 1=pos):
[0 0 1 0 0]
Actual Labels: [0 0 1 0 0]
