Import Required Libraries

In [79]:
import re
import string

import nltk
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.svm import LinearSVC


Load IMDb Dataset

In [64]:
# NOTE(review): load_files is not referenced by any cell visible here; kept in
# case a later cell relies on it.
from sklearn.datasets import load_files

# Location of the IMDb reviews CSV, named once so it is easy to repoint.
IMDB_CSV_PATH = "/content/IMDB Dataset.csv"

# Read the CSV into a DataFrame and preview its first rows.
reviews_df = pd.read_csv(IMDB_CSV_PATH)
reviews_df.head()



Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


Convert Sentiment Labels to Numeric Format


In [65]:
# Convert 'positive' to 1 and 'negative' to 0
reviews_df['review'] = reviews_df['sentiment'].map({'positive': 1, 'negative': 0})

# Drop original sentiment column
reviews_df.drop(columns=['sentiment'], inplace=True)

# Show updated dataset
reviews_df.head()


Unnamed: 0,review
0,1
1,1
2,1
3,0
4,1


Text Preprocessing

In [66]:
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(f"[{string.punctuation}]", "", text)  # Remove punctuation
    text = re.sub("\d+", "", text)  # Remove numbers
    return text

# Apply preprocessing
reviews_df['cleaned_text'] = reviews_df['review']  # Use the actual column name


# Display processed data
reviews_df.head()


Unnamed: 0,review,cleaned_text
0,1,1
1,1,1
2,1,1
3,0,0
4,1,1


Split Data into Training and Testing Sets

In [67]:
X_train, X_test, y_train, y_test = train_test_split(reviews_df['cleaned_text'], reviews_df['review'], test_size=0.2, random_state=42)

# Print dataset sizes
print(f"Training size: {len(X_train)}, Testing size: {len(X_test)}")


Training size: 40000, Testing size: 10000


In [68]:
print(X_train.head())  # Show first few rows
print(X_train.shape)   # Check if it has any data


39087    0
30893    0
45278    1
16398    0
13653    0
Name: cleaned_text, dtype: int64
(40000,)


In [69]:
X_train = X_train.fillna("").astype(str)
X_test = X_test.fillna("").astype(str)


 Convert Text Data into Numerical Features

In [70]:
print(X_train.sample(5))  # Check some samples


28172    1
24572    1
20244    0
39050    1
24428    1
Name: cleaned_text, dtype: object


In [71]:
X_train = X_train.fillna("").astype(str)
X_test = X_test.fillna("").astype(str)


In [72]:
print(X_train)

39087    0
30893    0
45278    1
16398    0
13653    0
        ..
11284    1
44732    1
38158    0
860      1
15795    1
Name: cleaned_text, Length: 40000, dtype: object


In [73]:
import pandas as pd
from sklearn.model_selection import train_test_split

X_train = X_train.astype(str)  # Ensure data is strings
X_test = X_test.astype(str)

# One-hot encode the categorical data
X_train_encoded = pd.get_dummies(X_train)
X_test_encoded = pd.get_dummies(X_test)

# Now use X_train_encoded and X_test_encoded for your model training

In [74]:
print(X_train.dtypes)  # Check data types of each column
print(X_train.head())  # Show first few rows


object
39087    0
30893    0
45278    1
16398    0
13653    0
Name: cleaned_text, dtype: object


In [75]:
X_train = X_train.astype(str)
X_test = X_test.astype(str)


Train Models and Evaluate

In [84]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": MultinomialNB(),
    "SVM": SVC()
}

for name, model in models.items():
    model.fit(X_train_encoded, y_train)  # Train the model
    y_pred = model.predict(X_test_encoded)  # Predict on test data

    # Calculate performance metrics
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print(f"[{name}] Accuracy: {acc:.4f}, F1 score: {f1:.4f}")

[Logistic Regression] Accuracy: 1.0000, F1 score: 1.0000
[Naive Bayes] Accuracy: 1.0000, F1 score: 1.0000
[SVM] Accuracy: 1.0000, F1 score: 1.0000


Create a Simple Interface

In [88]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
from ipywidgets import interact, Dropdown, Output, fixed
import numpy as np

# Assuming X_train, X_test, y_train, y_test are your original data

# Ensure data is strings
X_train = X_train.astype(str)
X_test = X_test.astype(str)

# One-hot encode the categorical data
X_train_encoded = pd.get_dummies(X_train)
X_test_encoded = pd.get_dummies(X_test)

# Reduce training data size
train_size = int(len(X_train_encoded) * 0.5)  # Use 50% of the training data
X_train_reduced = X_train_encoded[:train_size]
y_train_reduced = y_train[:train_size]

def train_and_evaluate(model_name):
    models = {
        "Logistic Regression": LogisticRegression(),
        "Naive Bayes": MultinomialNB(),
        "SVM": SVC()
    }

    model = models[model_name]
    model.fit(X_train_reduced, y_train_reduced)  # Train on reduced data
    y_pred = model.predict(X_test_encoded)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    output.clear_output()
    with output:
        print(f"[{model_name}] Accuracy: {acc:.4f}, F1 score: {f1:.4f}")

model_dropdown = Dropdown(options=["Logistic Regression", "Naive Bayes", "SVM"], description="Select Model:")
output = Output()
interact(train_and_evaluate, model_name=model_dropdown)
output

interactive(children=(Dropdown(description='Select Model:', options=('Logistic Regression', 'Naive Bayes', 'SV…

Output()