In [1]:
cd ..

/home/ayush/Documents/AI/Projects/Mlops-Capstone-Project


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
import pandas as pd
pd.set_option('future.no_silent_downcasting', True)
import logging

import numpy as np
import mlflow
import dagshub
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from text_prettifier import TextPrettifier
import joblib

import warnings
warnings.simplefilter("ignore", UserWarning)
warnings.filterwarnings("ignore")

In [3]:
logging.basicConfig(level=logging.INFO,format="%(asctime)s - %(levelname)s -  %(message)s")
config={
    'data_path':'notebooks/IMDB.csv',
    'test_size':0.2
    
}

In [4]:
prettifier = TextPrettifier()

def text_cleaner(text):
    text = text.lower()
    text = prettifier.remove_contractions(text)
    text = prettifier.remove_emojis(text)
    text = prettifier.remove_html_tags(text)
    text = prettifier.remove_urls(text)
    text = prettifier.remove_special_chars(text)
    text = prettifier.remove_stopwords(text)
    text = prettifier.remove_numbers(text)
    return text

In [5]:
def load_and_transformation(filepath:str):
    try:
        logging.info("loaded the file")
        df = pd.read_csv(filepath)
        logging.info("cleaning the reviews in data")
        df['review'] = df['review'].apply(text_cleaner)
        logging.info("mapping sentiment to 1 and 0")
        df['sentiment'] = df['sentiment'].map({'negative':0,'positive':1})
        return df
    except Exception as e:
        logging.info(f'Error while transformaing the df {e}')

In [6]:
df = load_and_transformation(config['data_path'])

2025-07-09 08:31:35,738 - INFO -  loaded the file
2025-07-09 08:31:35,790 - INFO -  cleaning the reviews in data
2025-07-09 08:31:37,508 - INFO -  mapping sentiment to 1 and 0


In [7]:
mlflow.set_tracking_uri('https://dagshub.com/AyushAI14/Mlops-Capstone-Project.mlflow')
dagshub.init(repo_owner='AyushAI14', repo_name='Mlops-Capstone-Project', mlflow=True)

mlflow.set_experiment('lr and bow')


2025-07-09 08:31:38,285 - INFO -  HTTP Request: GET https://dagshub.com/api/v1/user "HTTP/1.1 200 OK"


2025-07-09 08:31:38,304 - INFO -  Accessing as AyushAI14
2025-07-09 08:31:38,794 - INFO -  HTTP Request: GET https://dagshub.com/api/v1/repos/AyushAI14/Mlops-Capstone-Project "HTTP/1.1 200 OK"
2025-07-09 08:31:39,204 - INFO -  HTTP Request: GET https://dagshub.com/api/v1/user "HTTP/1.1 200 OK"


2025-07-09 08:31:39,212 - INFO -  Initialized MLflow to track repo "AyushAI14/Mlops-Capstone-Project"


2025-07-09 08:31:39,218 - INFO -  Repository AyushAI14/Mlops-Capstone-Project initialized!


<Experiment: artifact_location='mlflow-artifacts:/7f7ad6d22db247f4b3a77af166ca5d13', creation_time=1752028144180, experiment_id='2', last_update_time=1752028144180, lifecycle_stage='active', name='lr and bow', tags={}>

In [8]:
VECTORIZERS = {
    'BoW': CountVectorizer()
}

ALGORITHMS = {
    'LogisticRegression': LogisticRegression()
}

In [9]:
x =VECTORIZERS['BoW'].fit_transform(df['review'])

y = df['sentiment']
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=config["test_size"],random_state=44)

In [10]:
def train_and_log_model(X_train, X_test, y_train, y_test,):
    """Trains a Logistic Regression model with GridSearch and logs results to MLflow."""
    
    param_grid = {
        "C": [0.1, 1, 10],
        "penalty": ["l1", "l2"],
        "solver": ["liblinear"]
    }
    
    with mlflow.start_run():
        grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring="f1", n_jobs=-1)
        grid_search.fit(X_train, y_train)

        # Log all hyperparameter tuning runs
        for params, mean_score, std_score in zip(grid_search.cv_results_["params"], 
                                                 grid_search.cv_results_["mean_test_score"], 
                                                 grid_search.cv_results_["std_test_score"]):
            with mlflow.start_run(run_name=f"LR with params: {params}", nested=True):
                model = LogisticRegression(**params)
                model.fit(X_train, y_train)
                
                y_pred = model.predict(X_test)
                
                metrics = {
                    "accuracy": accuracy_score(y_test, y_pred),
                    "precision": precision_score(y_test, y_pred),
                    "recall": recall_score(y_test, y_pred),
                    "f1_score": f1_score(y_test, y_pred),
                    "mean_cv_score": mean_score,
                    "std_cv_score": std_score
                }
                
                # Log parameters & metrics
                mlflow.log_params(params)
                mlflow.log_metrics(metrics)
                
                print(f"Params: {params} | Accuracy: {metrics['accuracy']:.4f} | F1: {metrics['f1_score']:.4f}")

        # Log the best model
        best_params = grid_search.best_params_
        best_model = grid_search.best_estimator_
        best_f1 = grid_search.best_score_

        mlflow.log_params(best_params)
        mlflow.log_metric("best_f1_score", best_f1)
        logging.info("Saving and logging the model...")
        joblib.dump(best_model,"models/model3.pkl")
        mlflow.log_artifact("models/model3.pkl")
        
        print(f"\nBest Params: {best_params} | Best F1 Score: {best_f1:.4f}")


In [11]:
train_and_log_model(x_train,x_test,y_train,y_test)

Params: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'} | Accuracy: 0.7250 | F1: 0.7027
🏃 View run LR with params: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'} at: https://dagshub.com/AyushAI14/Mlops-Capstone-Project.mlflow/#/experiments/2/runs/13287ed31e9f4d829ed75c257b627381
🧪 View experiment at: https://dagshub.com/AyushAI14/Mlops-Capstone-Project.mlflow/#/experiments/2
Params: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'} | Accuracy: 0.7750 | F1: 0.7486
🏃 View run LR with params: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'} at: https://dagshub.com/AyushAI14/Mlops-Capstone-Project.mlflow/#/experiments/2/runs/dcc3e89ab8c042a6a099c0e030b844e5
🧪 View experiment at: https://dagshub.com/AyushAI14/Mlops-Capstone-Project.mlflow/#/experiments/2
Params: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'} | Accuracy: 0.7450 | F1: 0.7273
🏃 View run LR with params: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'} at: https://dagshub.com/AyushAI14/Mlops-Capstone-Project.mlflow/#

2025-07-09 08:32:24,877 - INFO -  Saving and logging the model...



Best Params: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'} | Best F1 Score: 0.7969
🏃 View run respected-turtle-194 at: https://dagshub.com/AyushAI14/Mlops-Capstone-Project.mlflow/#/experiments/2/runs/e9605785a8f746068d4c1cab3963dfad
🧪 View experiment at: https://dagshub.com/AyushAI14/Mlops-Capstone-Project.mlflow/#/experiments/2
