In [36]:
import random
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv("imdb_dataset.csv")  # Update with actual filename
print (len(df))

50000


In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
print (df.isnull().sum())

review       0
sentiment    0
dtype: int64


In [38]:
reviews = list(df["review"])
sentiments = list(df["sentiment"])

In [69]:
positive_reviews = [r for r, s in zip(reviews, sentiments) if s == "positive"]
negative_reviews = [r for r, s in zip(reviews, sentiments) if s == "negative"]

pos_sample_size = int(0.40 * len(positive_reviews))
neg_sample_size = int(0.40 * len(negative_reviews))

sampled_positive = random.sample(positive_reviews, pos_sample_size)
sampled_negative = random.sample(negative_reviews, neg_sample_size)

balanced_reviews = sampled_positive + sampled_negative
balanced_sentiments = ["positive"] * pos_sample_size + ["negative"] * neg_sample_size

combined = list(zip(balanced_reviews, balanced_sentiments))
random.shuffle(combined)
balanced_reviews, balanced_sentiments = zip(*combined)

balanced_reviews = list(balanced_reviews)
balanced_sentiments = list(balanced_sentiments)

print (len(balanced_reviews))
print (len(balanced_sentiments))

20000
20000


In [70]:
X_train, X_temp, y_train, y_temp = train_test_split(balanced_reviews, balanced_sentiments, test_size=0.5, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
print (len(X_train), len(y_train))
print (len(X_val), len(y_val))
print (len(X_test), len(y_test))

10000 10000
5000 5000
5000 5000


In [71]:
vectorizer = TfidfVectorizer(stop_words="english")
X_train_tfidf = vectorizer.fit_transform(X_train)
print (len(vectorizer.vocabulary_))

52662


In [105]:
vectorizer = TfidfVectorizer(stop_words="english", max_df=0.95, min_df=0.005)
X_train_tfidf = vectorizer.fit_transform(X_train)
print (len(vectorizer.vocabulary_))

2963


In [128]:
vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1,2), max_df=0.95, min_df=0.005)
X_train_tfidf = vectorizer.fit_transform(X_train, )
print (len(vectorizer.vocabulary_))

3389


In [73]:
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
print (len(vectorizer.vocabulary_))

5000


In [74]:
#vectorizer.vocabulary_

In [129]:
X_val_tfidf = vectorizer.transform(X_val)
X_test_tfidf = vectorizer.transform(X_test)

In [130]:
X_val_tfidf

<5000x3389 sparse matrix of type '<class 'numpy.float64'>'
	with 337397 stored elements in Compressed Sparse Row format>

In [131]:
#X_val_tfidf[9999].data

In [132]:
model = LogisticRegression(C=1, max_iter=150, class_weight='balanced')
model.fit(X_train_tfidf, y_train)

LogisticRegression(C=1, class_weight='balanced', max_iter=150)

In [133]:
y_train_pred = model.predict(X_train_tfidf)
val_accuracy = accuracy_score(y_train, y_train_pred)
print("Trainn Accuracy:", val_accuracy)

Trainn Accuracy: 0.9179


In [134]:
y_val_pred = model.predict(X_val_tfidf)
val_accuracy = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy:", val_accuracy)

Validation Accuracy: 0.8756


In [135]:
y_test_pred = model.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.8716


In [136]:
model = RandomForestClassifier(n_estimators=30)
model.fit(X_train_tfidf, y_train)

RandomForestClassifier(n_estimators=30)

In [137]:
y_train_pred = model.predict(X_train_tfidf)
val_accuracy = accuracy_score(y_train, y_train_pred)
print("Trainn Accuracy:", val_accuracy)
y_val_pred = model.predict(X_val_tfidf)
val_accuracy = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy:", val_accuracy)
y_test_pred = model.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", test_accuracy)

Trainn Accuracy: 0.9998
Validation Accuracy: 0.815
Test Accuracy: 0.8176


In [167]:
model = RandomForestClassifier(n_estimators=1)
model.fit(X_train_tfidf, y_train)

RandomForestClassifier(n_estimators=1)

In [168]:
y_train_pred = model.predict(X_train_tfidf)
val_accuracy = accuracy_score(y_train, y_train_pred)
print("Trainn Accuracy:", val_accuracy)
y_val_pred = model.predict(X_val_tfidf)
val_accuracy = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy:", val_accuracy)
y_test_pred = model.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", test_accuracy)

Trainn Accuracy: 0.8692
Validation Accuracy: 0.6492
Test Accuracy: 0.6588
