# Loading and Analysing dataset

In [1]:
import pandas as pd
import numpy as np
data = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')  


print("Dataset Info:")
print(data.info())

# Display sentiment distribution
print("\nSentiment Counts:")
print(data['sentiment'].value_counts())

# Preview the first few rows
print("\nFirst 5 rows:")
print(data.head())

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB
None

Sentiment Counts:
sentiment
positive    25000
negative    25000
Name: count, dtype: int64

First 5 rows:
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [2]:
# Import necessary libraries for data handling, text preprocessing, and modeling
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

# Download NLTK resources (stopwords and tokenizer)
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# preprosessing 

In [3]:
# Define a function to preprocess text
def preprocess_text(text):
    text = text.lower()
    
    # Remove HTML tags (common in IMDb reviews)
    text = re.sub(r'<.*?>', '', text)
    
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Join tokens back into a single string
    cleaned_text = ' '.join(tokens)
    return cleaned_text

# Apply preprocessing to the 'review' column
data['cleaned_review'] = data['review'].apply(preprocess_text)

# Display a sample of original vs cleaned text
print("Sample of original vs cleaned text:")
print(data[['review', 'cleaned_review']].head())

Sample of original vs cleaned text:
                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                      cleaned_review  
0  one reviewers mentioned watching oz episode yo...  
1  wonderful little production filming technique ...  
2  thought wonderful way spend time hot summer we...  
3  basically theres family little boy jake thinks...  
4  petter matteis love time money visually stunni...  


# Splitting

In [4]:
# Define features (X) and target (y)
X = data['cleaned_review']
y = data['sentiment']

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the sizes of the splits
print("Training set size:", X_train.shape[0])
print("Testing set size:", X_test.shape[0])

Training set size: 40000
Testing set size: 10000


# Vectorize the Text Using TF-IDF

In [5]:
# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limit to top 5000 features for efficiency

# Fit and transform the training data, transform the test data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Display the shape of the vectorized data
print("Training data shape after TF-IDF:", X_train_tfidf.shape)
print("Testing data shape after TF-IDF:", X_test_tfidf.shape)

Training data shape after TF-IDF: (40000, 5000)
Testing data shape after TF-IDF: (10000, 5000)


# Training the Naïve Bayes Model

In [6]:
# Initialize the Multinomial Naïve Bayes model
nb_model = MultinomialNB()

# Train the model on the TF-IDF transformed training data
nb_model.fit(X_train_tfidf, y_train)

print("Naïve Bayes model training completed!")

Naïve Bayes model training completed!


# Evaluation 

In [7]:
# Make predictions on the test set
y_pred = nb_model.predict(X_test_tfidf)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Calculate F1-score
f1 = f1_score(y_test, y_pred, pos_label='positive')  # Specify 'positive' as the positive label
print("F1-Score:", f1)

# Display a detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8497
F1-Score: 0.8511733835033172

Classification Report:
              precision    recall  f1-score   support

    negative       0.85      0.85      0.85      4961
    positive       0.85      0.85      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



# Tuning Naïve Bayes with GridSearchCV

In [8]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid for alpha
param_grid = {'alpha': [0.01, 0.1, 0.5, 1.0, 2.0, 5.0]}

# Initialize Naïve Bayes model
nb_model = MultinomialNB()

# Perform grid search with cross-validation
grid_search = GridSearchCV(nb_model, param_grid, cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(X_train_tfidf, y_train)

# Print best parameters and score
print("Best alpha:", grid_search.best_params_)
print("Best F1-score:", grid_search.best_score_)

# Use the best model
best_nb_model = grid_search.best_estimator_
y_pred = best_nb_model.predict(X_test_tfidf)
print("Updated Accuracy:", accuracy_score(y_test, y_pred))
print("Updated F1-Score:", f1_score(y_test, y_pred, pos_label='positive'))



Best alpha: {'alpha': 0.01}
Best F1-score: nan
Updated Accuracy: 0.8484
Updated F1-Score: 0.850345508390918


# Saving model

In [12]:
import pickle

# Save the tuned model using pickle
try:
    with open('/kaggle/working/best_nb_model.pkl', 'wb') as f:
        pickle.dump(best_nb_model, f)
    print("Tuned Naïve Bayes model saved as '/kaggle/working/best_nb_model.pkl'")
    with open('/kaggle/working/tfidf_vectorizer.pkl', 'wb') as f:
        pickle.dump(tfidf_vectorizer, f)
    print("TF-IDF vectorizer saved as '/kaggle/working/tfidf_vectorizer.pkl'")
except Exception as e:
    print(f"Error saving: {e}")

Tuned Naïve Bayes model saved as '/kaggle/working/best_nb_model.pkl'
TF-IDF vectorizer saved as '/kaggle/working/tfidf_vectorizer.pkl'


# Simple Prediction Function for User Input

In [14]:
import pickle

# Load the saved model and vectorizer using pickle
try:
    with open('/kaggle/working/best_nb_model.pkl', 'rb') as f:
        best_nb_model = pickle.load(f)
    print("Loaded the tuned Naïve Bayes model from '/kaggle/working/best_nb_model.pkl'")
except Exception as e:
    print(f"Error loading model: {e}")
    raise

try:
    with open('/kaggle/working/tfidf_vectorizer.pkl', 'rb') as f:
        tfidf_vectorizer = pickle.load(f)
    print("Loaded the TF-IDF vectorizer from '/kaggle/working/tfidf_vectorizer.pkl'")
except Exception as e:
    print(f"Error loading vectorizer: {e}")
    raise

# Define a function to predict sentiment for a new review
def predict_sentiment(review):
    # Preprocess the input review
    cleaned_review = preprocess_text(review)
    
    # Transform the review using the trained TF-IDF vectorizer
    review_tfidf = tfidf_vectorizer.transform([cleaned_review])
    
    # Predict sentiment using the tuned model
    prediction = best_nb_model.predict(review_tfidf)[0]
    
    return prediction

# Test the function with an example
sample_review = "This movie was absolutely amazing and I loved every second of it!"
predicted_sentiment = predict_sentiment(sample_review)
print(f"Sample Review: '{sample_review}'")
print(f"Predicted Sentiment: {predicted_sentiment}")

# Another test with a negative review
sample_review2 = "Terrible waste of time, I hated this film."
predicted_sentiment2 = predict_sentiment(sample_review2)
print(f"\nSample Review: '{sample_review2}'")
print(f"Predicted Sentiment: {predicted_sentiment2}")

Loaded the tuned Naïve Bayes model from '/kaggle/working/best_nb_model.pkl'
Loaded the TF-IDF vectorizer from '/kaggle/working/tfidf_vectorizer.pkl'
Sample Review: 'This movie was absolutely amazing and I loved every second of it!'
Predicted Sentiment: positive

Sample Review: 'Terrible waste of time, I hated this film.'
Predicted Sentiment: negative
