In [24]:
import mlflow
import pandas as pd 
import mlflow.sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import re 
import string 
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
import numpy as np

## DATASET DOWNLOAD

LINK TO DATASET: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

*NOTE: Put downloaded csv file into notebooks directory*

In [None]:
df = pd.read_csv("IMDB.csv")
df.head()

In [26]:
df.shape

(50000, 2)

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [28]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [29]:
df["sentiment"].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [30]:
int(df.duplicated().sum())

418

In [31]:
df.drop_duplicates(inplace=True)

In [32]:
df.shape

(49582, 2)

In [33]:
print(df.sample(1)["review"].values)

['Alright, I\'m 12, so this is where you get to see the movie from a pre-teen\'s point of view. I\'ve also commented on Magnolia and Bicentennial Man, both great movies, if you want to check it out. Alright, Here on Earth was a beautiful movie with astounding scenes and images, very pleasing to the eye. The writer (I don\'t know who it was, check IMDB) either worked very hard or has a good appreciation for love, poetry, and drama. I cried 4 times throughout this movie, once for over 30 minutes. It was really sad, really beautiful, really meaningful. IT\'s a great movie for anyone, say, 11 and up who isn\'t a romantic-comedy freak. Yeah, it\'s romantic, yeah, it\'s comedic, but (in my opinion), it\'s better than "She\'s All That" or "Whatever it Takes". I never cry! It\'s a tender story. Go rent it and tape it :).']


### Data Preprocessing

In [34]:
def lemmatization(text):
    """Lemmatize the text"""
    lemmatizer = WordNetLemmatizer()
    text = text.split()
    text = [lemmatizer.lemmatize(word) for word in text]
    return " ".join(text)

def remove_stop_words(text):
    """Remove stop words from the text"""
    stop_words = set(stopwords.words("english"))
    text = [word for word in str(text).split() if word not in stop_words]
    return " ".join(text)

def remove_numbers(text):
    """Remove numbers from the text"""
    text = ''.join([char for char in text if not char.isdigit()])
    return text 

def lower_case(text):
    """Convert text to lower case"""
    text = text.split()
    text = [word.lower() for word in text]
    return " ".join(text)

def remove_punctuation(text):
    """Remove punctuation from text"""
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = text.replace(':', '')
    text = re.sub('\s+', ' ', text).strip()
    return text 

def remove_urls(text):
    """Remove URLS from text"""
    url_pattern = re.compile(r"https?://\s+|www\.\s+")
    return url_pattern.sub(r'', text)

def normalize_text(df):
    "Normalize the text data"
    try:
        df["review"] = df["review"].apply(lower_case)
        df["review"] = df["review"].apply(remove_stop_words)
        df["review"] = df["review"].apply(remove_numbers)
        df["review"] = df["review"].apply(remove_punctuation)
        df["review"] = df["review"].apply(remove_urls)
        df["review"] = df["review"].apply(lemmatization)
        return df 
    except Exception as e:
        print(f"Error during text normalization: {e}")
        raise



In [35]:
df = normalize_text(df)
df.head()

Unnamed: 0,review,sentiment
0,one reviewer mentioned watching oz episode hoo...,positive
1,wonderful little production br br the filming ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically there s family little boy jake think...,negative
4,petter mattei s love time money visually stunn...,positive


In [36]:
# Map sentiment column to integers
df["sentiment"] = df["sentiment"].map({"positive": 1, "negative": 0})
df.head()

Unnamed: 0,review,sentiment
0,one reviewer mentioned watching oz episode hoo...,1
1,wonderful little production br br the filming ...,1
2,thought wonderful way spend time hot summer we...,1
3,basically there s family little boy jake think...,0
4,petter mattei s love time money visually stunn...,1


In [37]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [38]:
vectorizer = CountVectorizer(max_features=1000)
X = vectorizer.fit_transform(df["review"])
y = df["sentiment"]

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
mlflow.set_tracking_uri("Enter your MLFLOW TRACKING URI HERE")

In [21]:
import dagshub
dagshub.init(repo_owner='AmmanSajid1', repo_name='End-to-End-Movie-Review-Sentiment-Analysis', mlflow=True)



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=32c2a4b2-50e7-4fbc-b2a8-2faacd0f33bc&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=b94622e0d558481d3878f689a56af338aa9e0e32f0ba0e5a9bf88ba5cc59969b




Output()

In [22]:
mlflow.set_experiment("Logistic Regression Baseline")

2025/04/26 15:58:29 INFO mlflow.tracking.fluent: Experiment with name 'Logistic Regression Baseline' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/9427893c19f942ac8b7d578079d78ab5', creation_time=1745679509467, experiment_id='0', last_update_time=1745679509467, lifecycle_stage='active', name='Logistic Regression Baseline', tags={}>

In [44]:
import mlflow
import logging
import os 
import time 

# COnfigure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

logging.info("Starting MLFlow run...")

with mlflow.start_run():
    start_time = time.time()

    try:
        logging.info("Logging preprocessing parameters...")
        mlflow.log_param("vectorizer", "Bag of Words")
        mlflow.log_param("max_features", 100)
        mlflow.log_param("test_size", 0.2)

        logging.info("Initializing Logistic Regression Model")
        model = LogisticRegression(max_iter=1000)

        logging.info("Fitting the model")
        model.fit(X_train, y_train)
        logging.info("Model training complete.")

        logging.info("Logging model paramaters...")
        mlflow.log_param("model", "Logisitic Regression")

        logging.info("Making predictions...")
        y_pred = model.predict(X_test)

        logging.info("Calculating evaluation metrics")
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        logging.info("Logging evaluation metrics")
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)

        logging.info("Saving and logging the model...")
        mlflow.sklearn.log_model(model, "model")

        # Log execution time
        end_time = time.time()
        logging.info(f"Model training and logging completed in {end_time - start_time:.2f} seconds")

        logging.info(f"Accuracy: {accuracy}")
        logging.info(f"Precision: {precision}")
        logging.info(f"Recall: {recall}")
        logging.info(f"F1 Score: {f1}")

    except Exception as e:
        logging.error(f"An error occurred: {e}", exc_info=True)



2025-04-26 16:17:03,474 - INFO - Starting MLFlow run...
2025-04-26 16:17:03,669 - INFO - Logging preprocessing parameters...
2025-04-26 16:17:04,187 - INFO - Initializing Logistic Regression Model
2025-04-26 16:17:04,188 - INFO - Fitting the model
2025-04-26 16:17:05,245 - INFO - Model training complete.
2025-04-26 16:17:05,246 - INFO - Logging model paramaters...
2025-04-26 16:17:05,403 - INFO - Making predictions...
2025-04-26 16:17:05,407 - INFO - Calculating evaluation metrics
2025-04-26 16:17:05,420 - INFO - Logging evaluation metrics
2025-04-26 16:17:06,103 - INFO - Saving and logging the model...
2025-04-26 16:17:13,970 - INFO - Model training and logging completed in 10.30 seconds
2025-04-26 16:17:13,971 - INFO - Accuracy: 0.855803166280125
2025-04-26 16:17:13,971 - INFO - Precision: 0.845539540319439
2025-04-26 16:17:13,972 - INFO - Recall: 0.8720369626355966
2025-04-26 16:17:13,973 - INFO - F1 Score: 0.8585838607594937


🏃 View run funny-tern-695 at: https://dagshub.com/AmmanSajid1/End-to-End-Movie-Review-Sentiment-Analysis.mlflow/#/experiments/0/runs/daba143924ea45b9aa02b3c4cb4b5db0
🧪 View experiment at: https://dagshub.com/AmmanSajid1/End-to-End-Movie-Review-Sentiment-Analysis.mlflow/#/experiments/0
