In [2]:
# Importing libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/annalysegill/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Loading dataset
imdb_df = pd.read_csv('IMDB Dataset.csv')
print("\nDataFrame head:")
print(imdb_df.head())


DataFrame head:
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [4]:
# EDA 
imdb_df.info()
print(imdb_df['sentiment'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB
sentiment
positive    25000
negative    25000
Name: count, dtype: int64


In [8]:
import nltk
nltk.download('punkt_tab')
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Robustly download necessary NLTK data
try:
    stopwords.words('english')
except LookupError:
    print("Downloading 'stopwords' package...")
    nltk.download('stopwords', quiet=True)

try:
    word_tokenize("test sentence")
except LookupError:
    print("Downloading 'punkt' package...")
    nltk.download('punkt', quiet=True)

# Define function and apply it

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    words = word_tokenize(text)
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

# Apply the function and print the result

imdb_df['cleaned_review'] = imdb_df['review'].apply(preprocess_text)
print("\nBefore and After Cleaning")
print(imdb_df[['review', 'cleaned_review']].head())

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/annalysegill/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!



Before and After Cleaning
                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                      cleaned_review  
0  one reviewers mentioned watching oz episode yo...  
1  wonderful little production filming technique ...  
2  thought wonderful way spend time hot summer we...  
3  basically theres family little boy jake thinks...  
4  petter matteis love time money visually stunni...  


In [9]:
# Vectorization
vectorizer = TfidfVectorizer(max_features=5000)

X = vectorizer.fit_transform(imdb_df['cleaned_review']).toarray()
y = imdb_df['sentiment']

print("Feature Extraction Complete")
print(f"Shape of the feature matrix (X): {X.shape}")
print(f"Shape of the target vector (y): {y.shape}")

Feature Extraction Complete
Shape of the feature matrix (X): (50000, 5000)
Shape of the target vector (y): (50000,)


In [10]:
# Split and train classification model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=11)

model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Model Accuracy: 0.8880

Classification Report:
              precision    recall  f1-score   support

    negative       0.89      0.89      0.89      5000
    positive       0.89      0.89      0.89      5000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [11]:
# Initialize and train the MNB model
mnb_model = MultinomialNB()
mnb_model.fit(X_train, y_train)

In [12]:
# Make predictions and evaluate
print("Multinomial Naive Bayes Evaluation")
y_pred_mnb = mnb_model.predict(X_test)
accuracy_mnb = accuracy_score(y_test, y_pred_mnb)

print(f"Model Accuracy: {accuracy_mnb:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_mnb))

Multinomial Naive Bayes Evaluation
Model Accuracy: 0.8514

Classification Report:
              precision    recall  f1-score   support

    negative       0.85      0.86      0.85      5000
    positive       0.85      0.85      0.85      5000

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [13]:
# LGBM: Initialize and train the LGBM model
lgbm_model = LGBMClassifier(random_state=11)
lgbm_model.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 20000, number of negative: 20000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.229348 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 582063
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 5000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [14]:
# Make predictions and evaluate
print("\nLightGBM Classifier Evaluation")
y_pred_lgbm = lgbm_model.predict(X_test)
accuracy_lgbm = accuracy_score(y_test, y_pred_lgbm)

print(f"Model Accuracy: {accuracy_lgbm:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_lgbm))


LightGBM Classifier Evaluation
Model Accuracy: 0.8602

Classification Report:
              precision    recall  f1-score   support

    negative       0.87      0.85      0.86      5000
    positive       0.85      0.87      0.86      5000

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



In [15]:
# Define parameters
param_dist = {
    'n_estimators': randint(50, 500),
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'num_leaves': randint(20, 100),
    'max_depth': [-1, 10, 20, 30],
    'reg_alpha': [0.1, 0.5, 1],
    'reg_lambda': [0.1, 0.5, 1]
}

In [38]:
# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    lgbm_model,
    param_distributions=param_dist,
    n_iter=25,
    cv=3,
    scoring='accuracy',
    random_state=11,
    n_jobs=-1,
    verbose=2
)

In [34]:
# Print the best parameters and score
print(f"\nBest Parameters Found: {random_search.best_params_}")
print(f"Best Cross-validation Accuracy: {random_search.best_score_:.4f}")


Best Parameters Found: {'learning_rate': 0.2, 'max_depth': 30, 'n_estimators': 376, 'num_leaves': 30, 'reg_alpha': 0.5, 'reg_lambda': 0.1}
Best Cross-validation Accuracy: 0.8758


In [35]:
# Evaluate the best model on the test set
print("\nTuned LightGBM Classifier Evaluation")
best_model = random_search.best_estimator_
y_pred_tuned = best_model.predict(X_test)
accuracy_tuned = accuracy_score(y_test, y_pred_tuned)

print(f"Test Set Accuracy: {accuracy_tuned:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_tuned))



Tuned LightGBM Classifier Evaluation
Test Set Accuracy: 0.8778

Classification Report:
              precision    recall  f1-score   support

    negative       0.88      0.88      0.88      5000
    positive       0.88      0.88      0.88      5000

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000

