In [1]:
# Reading the CSV file
import pandas as pd
df = pd.read_csv('IMDB_Dataset.csv')
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [2]:
# Checking for Null values
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [3]:
df.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
import nltk
import re
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Step 1: Remove HTML tags
def remove_html_tags(text):
    return BeautifulSoup(text, "html.parser").get_text()

# Step 2: Convert to Lowercase
def to_lowercase(text):
    return text.lower()

# Step 3: Remove special characters, punctuation, and digits
def remove_special_chars(text):
    return re.sub(r'[^a-zA-Z\s]', '', text)

# Step 4: Tokenization
nltk.download('punkt')
def tokenize(text):
    return nltk.word_tokenize(text)

# Step 5: Remove stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    return [word for word in tokens if word not in stop_words]

# Step 6: Lemmatization
nltk.download('wordnet')
nltk.download('omw-1.4')
lemmatizer = WordNetLemmatizer()

def lemmatize(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]


# Full Cleaning Pipeline
def preprocess_text(text):
    text = remove_html_tags(text)
    text = to_lowercase(text)
    text = remove_special_chars(text)
    tokens = tokenize(text)
    tokens = remove_stopwords(tokens)
    tokens = lemmatize(tokens)
    return ' '.join(tokens)

df['review'] = df['review'].apply(preprocess_text)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/akhilenderk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/akhilenderk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/akhilenderk/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/akhilenderk/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
  return BeautifulSoup(text, "html.parser").get_text()


In [5]:
df.head(5)

Unnamed: 0,review,sentiment
0,one reviewer mentioned watching oz episode you...,positive
1,wonderful little production filming technique ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically there family little boy jake think t...,negative
4,petter matteis love time money visually stunni...,positive


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,  # Limit to top 5000 features
    stop_words='english'  # Extra stopword filtering
)

# Fit and transform the cleaned text
tfidf_matrix = tfidf_vectorizer.fit_transform(df['review'])

# Display the TF-IDF matrix shape (documents, features)
print(f"TF-IDF Matrix Shape: {tfidf_matrix.shape}")


TF-IDF Matrix Shape: (50000, 5000)


In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

X=tfidf_matrix
y=df['sentiment'].map({'positive': 1, 'negative': 0})

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Logistic Regression model
logistic_model = LogisticRegression()

# Define hyperparameter grid
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10],  # Regularization strength
    'solver': ['liblinear', 'saga'],  # Solvers that support regularization
    'max_iter': [100, 200, 300]  # Number of iterations
}

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=logistic_model,
    param_grid=param_grid,
    cv=3,  # 3-fold cross-validation
    scoring='accuracy',  # Use accuracy to evaluate model performance
    n_jobs=-1,  # Use all available CPUs for parallel processing
    verbose=1  # Display progress
)

# Step 1: Train with GridSearchCV
grid_search.fit(X_train, y_train)

# Step 2: Best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Step 3: Best Model Performance
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)

# Step 4: Evaluate the best model
accuracy_best = accuracy_score(y_test, y_pred_best)
print(f"Best Model Accuracy: {accuracy_best:.4f}")

# Print Classification Report for the best model
print("\nBest Model Classification Report:")
print(classification_report(y_test, y_pred_best, target_names=['negative', 'positive']))


Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best Hyperparameters: {'C': 1, 'max_iter': 300, 'solver': 'saga'}
Best Model Accuracy: 0.8820

Best Model Classification Report:
              precision    recall  f1-score   support

    negative       0.89      0.87      0.88      4961
    positive       0.87      0.90      0.88      5039

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000

