In [None]:
cd ..

In [None]:
import pandas as pd
pd.set_option('future.no_silent_downcasting', True)
import logging

import numpy as np
import mlflow
import dagshub
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from text_prettifier import TextPrettifier
import joblib

import warnings
warnings.simplefilter("ignore", UserWarning)
warnings.filterwarnings("ignore")

In [None]:
logging.basicConfig(level=logging.INFO,format="%(asctime)s - %(levelname)s -  %(message)s")
config={
    'data_path':'notebooks/IMDB.csv',
    'test_size':0.2
    
}

In [37]:
prettifier = TextPrettifier()

def text_cleaner(text):
    text = text.lower()
    text = prettifier.remove_contractions(text)
    text = prettifier.remove_emojis(text)
    text = prettifier.remove_html_tags(text)
    text = prettifier.remove_urls(text)
    text = prettifier.remove_special_chars(text)
    text = prettifier.remove_stopwords(text)
    text = prettifier.remove_numbers(text)
    return text

In [38]:
def load_and_transformation(filepath:str):
    try:
        logging.info("loaded the file")
        df = pd.read_csv(filepath)
        logging.info("cleaning the reviews in data")
        df['review'] = df['review'].apply(text_cleaner)
        logging.info("mapping sentiment to 1 and 0")
        df['sentiment'] = df['sentiment'].map({'negative':0,'positive':1})
        return df
    except Exception as e:
        logging.info(f'Error while transformaing the df {e}')

In [39]:
df = load_and_transformation(config['data_path'])

2025-07-09 07:43:02,879 - INFO -  loaded the file
2025-07-09 07:43:02,898 - INFO -  cleaning the reviews in data
2025-07-09 07:43:04,691 - INFO -  mapping sentiment to 1 and 0


In [40]:
mlflow.set_tracking_uri('https://dagshub.com/AyushAI14/Mlops-Capstone-Project.mlflow')
dagshub.init(repo_owner='AyushAI14', repo_name='Mlops-Capstone-Project', mlflow=True)

mlflow.set_experiment('bow vs tfidf')


2025-07-09 07:43:05,524 - INFO -  HTTP Request: GET https://dagshub.com/api/v1/repos/AyushAI14/Mlops-Capstone-Project "HTTP/1.1 200 OK"


2025-07-09 07:43:05,538 - INFO -  Initialized MLflow to track repo "AyushAI14/Mlops-Capstone-Project"


2025-07-09 07:43:05,544 - INFO -  Repository AyushAI14/Mlops-Capstone-Project initialized!


<Experiment: artifact_location='mlflow-artifacts:/69745e1a3e134795a8b205d80021c5cc', creation_time=1752026929556, experiment_id='1', last_update_time=1752026929556, lifecycle_stage='active', name='bow vs tfidf', tags={}>

In [41]:
VECTORIZERS = {
    'BoW': CountVectorizer(),
    'TF-IDF': TfidfVectorizer()
}

ALGORITHMS = {
    'LogisticRegression': LogisticRegression(),
    'MultinomialNB': MultinomialNB(),
    'XGBoost': XGBClassifier(),
    'RandomForest': RandomForestClassifier(),
    'GradientBoosting': GradientBoostingClassifier()
}

In [42]:
def log_model_params(algo_name, model):
    """Logs hyperparameters of the trained model to MLflow."""
    params_to_log = {}
    if algo_name == 'LogisticRegression':
        params_to_log["C"] = model.C
    elif algo_name == 'MultinomialNB':
        params_to_log["alpha"] = model.alpha
    elif algo_name == 'XGBoost':
        params_to_log["n_estimators"] = model.n_estimators
        params_to_log["learning_rate"] = model.learning_rate
    elif algo_name == 'RandomForest':
        params_to_log["n_estimators"] = model.n_estimators
        params_to_log["max_depth"] = model.max_depth
    elif algo_name == 'GradientBoosting':
        params_to_log["n_estimators"] = model.n_estimators
        params_to_log["learning_rate"] = model.learning_rate
        params_to_log["max_depth"] = model.max_depth

    mlflow.log_params(params_to_log)

In [None]:
def train_test(df):
    with mlflow.start_run(run_name='ALL EXPERIMENT') as parent:
        for vector_name,vector_value in VECTORIZERS.items():
            for algo_name,algo_value in ALGORITHMS.items():
                with mlflow.start_run(run_name=f"{algo_name} with {vector_name}", nested=True) as child:
                    try:
                        logging.info('dataset splitting in test and train')
                        x = vector_value.fit_transform(df['review'])
                        y = df['sentiment']
                        x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=config["test_size"],random_state=44)
                        mlflow.log_params({
                                "vectorizer": vector_name,
                                "algorithm": algo_name,
                                "test_size": config["test_size"]
                            })
                        
                        logging.info('model fitting starts')
                        algo_value.fit(x_train,y_train)
                        log_model_params(algo_name, algo_value)

                        logging.info('Evaluating the model')
                        y_pred = algo_value.predict(x_test)
                        metrics = {
                                "accuracy": accuracy_score(y_test, y_pred),
                                "precision": precision_score(y_test, y_pred),
                                "recall": recall_score(y_test, y_pred),
                                "f1_score": f1_score(y_test, y_pred)
                            }
                        logging.info("Saving and logging the model and metrics...")
                        model_path = f"models/{algo_name}_{vector_name}_model.pkl"
                        joblib.dump(algo_value, model_path)
                        mlflow.log_artifact(model_path)

                        mlflow.log_metrics(metrics)
                        print(f"\nAlgorithm: {algo_name}, Vectorizer: {vector_name}")
                        print(f"Metrics: {metrics}")
                    except Exception as e:
                        logging.info(f'error while training {e}')
                        


In [44]:
train_test(df)

2025-07-09 07:43:07,533 - INFO -  dataset splitting in test and train
2025-07-09 07:43:08,126 - INFO -  model fitting starts
2025-07-09 07:43:10,952 - INFO -  Evaluating the model
2025-07-09 07:43:11,007 - INFO -  Saving and logging the model and metrics...



Algorithm: LogisticRegression, Vectorizer: BoW
Metrics: {'accuracy': 0.775, 'precision': 0.7252747252747253, 'recall': 0.7674418604651163, 'f1_score': 0.7457627118644068}
üèÉ View run LogisticRegression with BoW at: https://dagshub.com/AyushAI14/Mlops-Capstone-Project.mlflow/#/experiments/1/runs/3b2235fd9f1c4274a4687cfee7927868
üß™ View experiment at: https://dagshub.com/AyushAI14/Mlops-Capstone-Project.mlflow/#/experiments/1


2025-07-09 07:43:16,889 - INFO -  dataset splitting in test and train
2025-07-09 07:43:18,316 - INFO -  model fitting starts
2025-07-09 07:43:18,828 - INFO -  Evaluating the model
2025-07-09 07:43:18,848 - INFO -  Saving and logging the model and metrics...



Algorithm: MultinomialNB, Vectorizer: BoW
Metrics: {'accuracy': 0.785, 'precision': 0.7792207792207793, 'recall': 0.6976744186046512, 'f1_score': 0.7361963190184049}
üèÉ View run MultinomialNB with BoW at: https://dagshub.com/AyushAI14/Mlops-Capstone-Project.mlflow/#/experiments/1/runs/b1af99c543a045cea68fc313c169bdf1
üß™ View experiment at: https://dagshub.com/AyushAI14/Mlops-Capstone-Project.mlflow/#/experiments/1


2025-07-09 07:43:25,780 - INFO -  dataset splitting in test and train
2025-07-09 07:43:27,231 - INFO -  model fitting starts
2025-07-09 07:43:29,069 - INFO -  Evaluating the model
2025-07-09 07:43:29,092 - INFO -  Saving and logging the model and metrics...



Algorithm: XGBoost, Vectorizer: BoW
Metrics: {'accuracy': 0.73, 'precision': 0.6739130434782609, 'recall': 0.7209302325581395, 'f1_score': 0.6966292134831461}
üèÉ View run XGBoost with BoW at: https://dagshub.com/AyushAI14/Mlops-Capstone-Project.mlflow/#/experiments/1/runs/dc96154a7a6b4fc6ad5008121ddc08c3
üß™ View experiment at: https://dagshub.com/AyushAI14/Mlops-Capstone-Project.mlflow/#/experiments/1


2025-07-09 07:43:36,550 - INFO -  dataset splitting in test and train
2025-07-09 07:43:37,470 - INFO -  model fitting starts
2025-07-09 07:43:39,417 - INFO -  Evaluating the model
2025-07-09 07:43:39,460 - INFO -  Saving and logging the model and metrics...



Algorithm: RandomForest, Vectorizer: BoW
Metrics: {'accuracy': 0.755, 'precision': 0.7032967032967034, 'recall': 0.7441860465116279, 'f1_score': 0.7231638418079096}
üèÉ View run RandomForest with BoW at: https://dagshub.com/AyushAI14/Mlops-Capstone-Project.mlflow/#/experiments/1/runs/e9bce8c18e7941d49fb589d7b30937de
üß™ View experiment at: https://dagshub.com/AyushAI14/Mlops-Capstone-Project.mlflow/#/experiments/1


2025-07-09 07:43:49,031 - INFO -  dataset splitting in test and train
2025-07-09 07:43:50,271 - INFO -  model fitting starts
2025-07-09 07:43:55,082 - INFO -  Evaluating the model
2025-07-09 07:43:55,109 - INFO -  Saving and logging the model and metrics...



Algorithm: GradientBoosting, Vectorizer: BoW
Metrics: {'accuracy': 0.805, 'precision': 0.7373737373737373, 'recall': 0.8488372093023255, 'f1_score': 0.7891891891891892}
üèÉ View run GradientBoosting with BoW at: https://dagshub.com/AyushAI14/Mlops-Capstone-Project.mlflow/#/experiments/1/runs/093d41522ddb450c8698c3fd46c7198a
üß™ View experiment at: https://dagshub.com/AyushAI14/Mlops-Capstone-Project.mlflow/#/experiments/1


2025-07-09 07:44:00,865 - INFO -  dataset splitting in test and train
2025-07-09 07:44:01,534 - INFO -  model fitting starts
2025-07-09 07:44:03,884 - INFO -  Evaluating the model
2025-07-09 07:44:03,917 - INFO -  Saving and logging the model and metrics...



Algorithm: LogisticRegression, Vectorizer: TF-IDF
Metrics: {'accuracy': 0.765, 'precision': 0.7096774193548387, 'recall': 0.7674418604651163, 'f1_score': 0.7374301675977654}
üèÉ View run LogisticRegression with TF-IDF at: https://dagshub.com/AyushAI14/Mlops-Capstone-Project.mlflow/#/experiments/1/runs/fff670ca7d864c0f8c6d6c26677d4809
üß™ View experiment at: https://dagshub.com/AyushAI14/Mlops-Capstone-Project.mlflow/#/experiments/1


2025-07-09 07:44:10,126 - INFO -  dataset splitting in test and train
2025-07-09 07:44:10,676 - INFO -  model fitting starts
2025-07-09 07:44:11,570 - INFO -  Evaluating the model
2025-07-09 07:44:11,597 - INFO -  Saving and logging the model and metrics...



Algorithm: MultinomialNB, Vectorizer: TF-IDF
Metrics: {'accuracy': 0.765, 'precision': 0.7910447761194029, 'recall': 0.6162790697674418, 'f1_score': 0.6928104575163399}
üèÉ View run MultinomialNB with TF-IDF at: https://dagshub.com/AyushAI14/Mlops-Capstone-Project.mlflow/#/experiments/1/runs/cc750f6fbd754d94aa789c8522f806fe
üß™ View experiment at: https://dagshub.com/AyushAI14/Mlops-Capstone-Project.mlflow/#/experiments/1


2025-07-09 07:44:18,021 - INFO -  dataset splitting in test and train
2025-07-09 07:44:19,245 - INFO -  model fitting starts
2025-07-09 07:44:23,652 - INFO -  Evaluating the model
2025-07-09 07:44:23,674 - INFO -  Saving and logging the model and metrics...



Algorithm: XGBoost, Vectorizer: TF-IDF
Metrics: {'accuracy': 0.745, 'precision': 0.6923076923076923, 'recall': 0.7325581395348837, 'f1_score': 0.711864406779661}
üèÉ View run XGBoost with TF-IDF at: https://dagshub.com/AyushAI14/Mlops-Capstone-Project.mlflow/#/experiments/1/runs/292865b850a246ed823c8c71d36642e0
üß™ View experiment at: https://dagshub.com/AyushAI14/Mlops-Capstone-Project.mlflow/#/experiments/1


2025-07-09 07:44:29,600 - INFO -  dataset splitting in test and train
2025-07-09 07:44:30,197 - INFO -  model fitting starts
2025-07-09 07:44:32,050 - INFO -  Evaluating the model
2025-07-09 07:44:32,083 - INFO -  Saving and logging the model and metrics...



Algorithm: RandomForest, Vectorizer: TF-IDF
Metrics: {'accuracy': 0.745, 'precision': 0.6923076923076923, 'recall': 0.7325581395348837, 'f1_score': 0.711864406779661}
üèÉ View run RandomForest with TF-IDF at: https://dagshub.com/AyushAI14/Mlops-Capstone-Project.mlflow/#/experiments/1/runs/0c4e1c61b1004f3296da70a3b2f49448
üß™ View experiment at: https://dagshub.com/AyushAI14/Mlops-Capstone-Project.mlflow/#/experiments/1


2025-07-09 07:44:42,599 - INFO -  dataset splitting in test and train
2025-07-09 07:44:43,624 - INFO -  model fitting starts
2025-07-09 07:44:50,792 - INFO -  Evaluating the model
2025-07-09 07:44:50,815 - INFO -  Saving and logging the model and metrics...



Algorithm: GradientBoosting, Vectorizer: TF-IDF
Metrics: {'accuracy': 0.765, 'precision': 0.7010309278350515, 'recall': 0.7906976744186046, 'f1_score': 0.7431693989071039}
üèÉ View run GradientBoosting with TF-IDF at: https://dagshub.com/AyushAI14/Mlops-Capstone-Project.mlflow/#/experiments/1/runs/e275cae060fd4303b35961c24395f70b
üß™ View experiment at: https://dagshub.com/AyushAI14/Mlops-Capstone-Project.mlflow/#/experiments/1
üèÉ View run ALL EXPERIMENT at: https://dagshub.com/AyushAI14/Mlops-Capstone-Project.mlflow/#/experiments/1/runs/821db3e2cf4e4a96bb4f6eff3c34a1db
üß™ View experiment at: https://dagshub.com/AyushAI14/Mlops-Capstone-Project.mlflow/#/experiments/1
