In [None]:
cd ..

In [53]:
import mlflow
import pandas as pd
import mlflow.sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import numpy as np
from text_prettifier import TextPrettifier

In [None]:
df = pd.read_csv('notebooks/IMDB.csv')
df.shape

(1000, 2)

In [55]:
# text filteration

prettifier = TextPrettifier()

def text_cleaner(text):
    text = text.lower()
    text = prettifier.remove_contractions(text)
    text = prettifier.remove_emojis(text)
    text = prettifier.remove_html_tags(text)
    text = prettifier.remove_urls(text)
    text = prettifier.remove_special_chars(text)
    text = prettifier.remove_stopwords(text)
    text = prettifier.remove_numbers(text)
    return text

In [56]:
df['review'] = df['review'].apply(text_cleaner)

In [57]:
df['sentiment'].value_counts()

sentiment
negative    517
positive    483
Name: count, dtype: int64

In [58]:
df['sentiment'] = df['sentiment'].map({'negative':0,'positive':1})

In [59]:
df

Unnamed: 0,review,sentiment
0,film version sandra bernhards onewoman offbroa...,0
1,switched cable whim treated quite surprisealth...,1
2,plot film contains holes could drive massive t...,0
3,amusing humor falls flat decent acting quite a...,0
4,say movie terrible good two days earlier watch...,0
...,...,...
995,exactly new story line romantic comedy makes c...,0
996,first saw movie younger child sister told thou...,1
997,people stated th season south park started tre...,1
998,nothing directors juvenile fantasy come life m...,0


In [60]:
vectorizer=CountVectorizer(max_features=60)
x = vectorizer.fit_transform(df['review'])
y = df['sentiment']


In [61]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=44)

In [62]:
x_train.shape

(750, 60)

In [63]:
x_test.shape

(250, 60)

In [64]:
import dagshub
mlflow.set_tracking_uri('https://dagshub.com/AyushAI14/Mlops-Capstone-Project.mlflow')
dagshub.init(repo_owner='AyushAI14', repo_name='Mlops-Capstone-Project', mlflow=True)

mlflow.set_experiment('LR Baseline')


2025-07-09 06:56:24,271 - INFO -  HTTP Request: GET https://dagshub.com/api/v1/repos/AyushAI14/Mlops-Capstone-Project "HTTP/1.1 200 OK"


2025-07-09 06:56:24,279 - INFO -  Initialized MLflow to track repo "AyushAI14/Mlops-Capstone-Project"


2025-07-09 06:56:24,282 - INFO -  Repository AyushAI14/Mlops-Capstone-Project initialized!


<Experiment: artifact_location='mlflow-artifacts:/45eadf9e1d694dc096a7e2102c9a47da', creation_time=1751986946865, experiment_id='0', last_update_time=1751986946865, lifecycle_stage='active', name='LR Baseline', tags={}>

In [None]:
import time 
import logging
import joblib
logging.basicConfig(level=logging.INFO,format="%(asctime)s - %(levelname)s -  %(message)s")

with mlflow.start_run():
    try:
        start_time = time.time()
        logging.info("Processing parameter")
        mlflow.log_param("Vectorizer","Bag of words")
        mlflow.log_param("Max_Feature",60)
        mlflow.log_param("test_size",0.25)


        logging.info("Model training starts")
        model = LogisticRegression(max_iter=1000)
        logging.info("Model starts fitting")
        model.fit(x_train,y_train)
        logging.info("Model training complete.")

        logging.info("Logging model parameters...")
        mlflow.log_param("model", "Logistic Regression")

        logging.info("Making predictions...")
        y_pred = model.predict(x_test)

        logging.info("Calculating evaluation metrics...")
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        
        logging.info("Logging evaluation metrics...")

        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)

        logging.info("Saving and logging the model...")
        joblib.dump(model, "models/model_1.pkl")
        mlflow.log_artifact("models/model_1.pkl")


        end_time = time.time()
        logging.info(f"Model training and logging completed in {end_time - start_time:.2f} seconds.")
        logging.info(f"Accuracy: {accuracy}")
        logging.info(f"Precision: {precision}")
        logging.info(f"Recall: {recall}")
        logging.info(f"F1 Score: {f1}")



    except Exception as e:
        logging.info(f"Error in Model Training {e}")

2025-07-09 06:56:25,191 - INFO -  Processing parameter
2025-07-09 06:56:26,422 - INFO -  Model training starts
2025-07-09 06:56:26,424 - INFO -  Model starts fitting
2025-07-09 06:56:26,460 - INFO -  Model training complete.
2025-07-09 06:56:26,463 - INFO -  Logging model parameters...
2025-07-09 06:56:26,830 - INFO -  Making predictions...
2025-07-09 06:56:26,834 - INFO -  Calculating evaluation metrics...
2025-07-09 06:56:26,880 - INFO -  Logging evaluation metrics...
2025-07-09 06:56:32,665 - INFO -  Saving and logging the model...
2025-07-09 06:56:33,997 - INFO -  Model training and logging completed in 8.81 seconds.
2025-07-09 06:56:33,998 - INFO -  Accuracy: 0.664
2025-07-09 06:56:34,005 - INFO -  Precision: 0.6333333333333333
2025-07-09 06:56:34,009 - INFO -  Recall: 0.6551724137931034
2025-07-09 06:56:34,012 - INFO -  F1 Score: 0.6440677966101694


🏃 View run casual-loon-832 at: https://dagshub.com/AyushAI14/Mlops-Capstone-Project.mlflow/#/experiments/0/runs/edde4845cbbf4ca0a272ab3d83b4dba2
🧪 View experiment at: https://dagshub.com/AyushAI14/Mlops-Capstone-Project.mlflow/#/experiments/0
