In [50]:
import pandas as pd
import numpy as np 
import re

In [51]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Download NLTK data
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Initialize lemmatizer and stopwords list
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # text = re.sub('[^a-zA-Z0-9]',' ', messages['message'][i])
    # Convert text to lowercase
    text = text.lower()
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords and perform lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [52]:
from joblib import Parallel, delayed
df = pd.read_csv('IMDB Dataset.csv')
df['review'] = df['review'].apply(lambda x: x.encode('utf-8', errors='ignore').decode('utf-8'))

# Rename columns and map sentiments to integers
# df.columns = ['review', 'sentiment']
# df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['sentiment_encoded'] = le.fit_transform(df['sentiment'])

# Sample a smaller subset for quick testing (optional)
df_sample = df.sample(n=1000, random_state=42)

# Apply preprocessing in parallel
df_sample['review'] = Parallel(n_jobs=-1)(delayed(preprocess_text)(text) for text in df_sample['review'])

In [53]:

print(df['sentiment_encoded'].unique())


[1 0]


In [54]:
# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment_encoded'], test_size=0.2, random_state=42)

# Vectorization using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Model Training
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_tfidf, y_train)

# Predictions
y_pred = clf.predict(X_test_tfidf)

# Evaluation
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.86      0.85      4961
           1       0.86      0.84      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

