In [1]:
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score,  recall_score, precision_score
import matplotlib.pyplot as plt
import seaborn as sns
import dagshub
import pandas as pd
import os
import shutil
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import string
import re

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/campusx-official/jupyter-masterclass/main/tweet_emotions.csv').drop(columns=['tweet_id'], axis=1)
df.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [3]:
df = df[df['sentiment'].isin(['happiness', 'sadness'])]
df.replace({'sentiment': {'happiness': 1, 'sadness': 0}}, inplace=True)
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42, stratify=df['sentiment'])

  df.replace({'sentiment': {'happiness': 1, 'sadness': 0}}, inplace=True)


In [4]:
# Data Preprocessing
def lowercase_text(text: str) -> str:
    """
    Convert text to lowercase.
    """
    return str(text).lower()

def remove_urls(text: str) -> str:
    """
    Remove URLs from the text.
    """
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

def remove_punctuation(text: str) -> str:
    """
    Remove punctuation from the text.
    """
    text = re.sub(r'[%s]' % re.escape("""!"#$%&'()*+,ÿå-./:;<=>ÿü?@[\]^_`{|}~"""), ' ', str(text))
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def remove_numbers(text: str) -> str:
    """
    Remove numbers from the text.
    """
    return re.sub(r'\d+', '', str(text))

def remove_stopwords(text: str) -> str:
    """
    Remove stopwords from the text.
    """
    stop_words = set(stopwords.words('english'))
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

def lemmatize_text(text: str) -> str:
    """
    Lemmatize the text.
    """
    lemmatizer = WordNetLemmatizer()
    words = text.split()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

def fillna_with_empty(text: str) -> str:
    """
    Fill NaN values with empty strings.
    """
    if pd.isna(text):
        return ''
    return text

def preprocess_data(train_data: pd.DataFrame, test_data: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Preprocess the text data in the training and testing datasets.
    """
    for dataset in [train_data, test_data]:
        dataset['content'] = dataset['content'].apply(fillna_with_empty)
        dataset['content'] = dataset['content'].apply(lowercase_text)
        dataset['content'] = dataset['content'].apply(remove_urls)
        dataset['content'] = dataset['content'].apply(remove_punctuation)
        dataset['content'] = dataset['content'].apply(remove_numbers)
        dataset['content'] = dataset['content'].apply(remove_stopwords)
        dataset['content'] = dataset['content'].apply(lemmatize_text)

    return train_data, test_data

train_data, test_data = preprocess_data(train_data, test_data)

  text = re.sub(r'[%s]' % re.escape("""!"#$%&'()*+,ÿå-./:;<=>ÿü?@[\]^_`{|}~"""), ' ', str(text))


In [5]:

train_data['content'] = train_data['content'].fillna('').astype(str)
test_data['content'] = test_data['content'].fillna('').astype(str)

def vectorize_data_bow():
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(train_data['content'])
    y_train = train_data['sentiment']

    X_test = vectorizer.transform(test_data['content'])
    y_test = test_data['sentiment']

    return X_train, y_train, X_test, y_test

def vectorize_data_tfidf():
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(train_data['content'])
    y_train = train_data['sentiment']

    X_test = vectorizer.transform(test_data['content'])
    y_test = test_data['sentiment']

    return X_train, y_train, X_test, y_test

X_train_bow, y_train_bow, X_test_bow, y_test_bow = vectorize_data_bow()
X_train_tfidf, y_train_tfidf, X_test_tfidf, y_test_tfidf = vectorize_data_tfidf()

In [6]:
# Setup Dagshub and MLflow

mlflow.set_tracking_uri("https://dagshub.com/ChRaviTeja1901/mlops-mini-project.mlflow")
dagshub.init(repo_owner='ChRaviTeja1901', repo_name='mlops-mini-project', mlflow=True)

mlflow.set_experiment("Baseline BoW vs TF-IDF with different models")

2025/12/25 22:19:32 INFO mlflow.tracking.fluent: Experiment with name 'Baseline BoW vs TF-IDF with different models' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/5c75573e57a14242b1f28e5e481c97a8', creation_time=1766681372846, experiment_id='2', last_update_time=1766681372846, lifecycle_stage='active', name='Baseline BoW vs TF-IDF with different models', tags={}>

In [8]:
with mlflow.start_run(run_name="Text_Classification_Experiment"):

    mlflow.log_param("test_size", 0.2)
    mlflow.log_param(
        "preprocessing_steps",
        "lowercase,remove_urls,remove_punctuation,remove_numbers,remove_stopwords,lemmatization"
    )

    vectorizers = [
        ("BoW", X_train_bow, y_train_bow, X_test_bow, y_test_bow),
        ("TFIDF", X_train_tfidf, y_train_tfidf, X_test_tfidf, y_test_tfidf)
    ]

    models = {
        "LogisticRegression": LogisticRegression(),
        "RandomForest": RandomForestClassifier(),
        "GradientBoosting": GradientBoostingClassifier(),
        "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
        "MultinomialNB": MultinomialNB()
    }

    for vec_name, X_train, y_train, X_test, y_test in vectorizers:
        for model_name, model in models.items():

            with mlflow.start_run(
                nested=True,
                run_name=f"{model_name}_{vec_name}"
            ):

                mlflow.log_param("vectorizer", vec_name)
                mlflow.log_param("model_type", model_name)

                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)

                # Metrics
                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred)
                recall = recall_score(y_test, y_pred)
                f1 = f1_score(y_test, y_pred)

                mlflow.log_metrics({
                    "accuracy": accuracy,
                    "precision": precision,
                    "recall": recall,
                    "f1_score": f1
                })

                # Confusion matrix
                cm = confusion_matrix(y_test, y_pred)
                plt.figure(figsize=(5, 4))
                sns.heatmap(
                    cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=['sadness', 'happiness'],
                    yticklabels=['sadness', 'happiness']
                )
                plt.xlabel("Predicted")
                plt.ylabel("Actual")

                cm_path = f"confusion_matrix_{model_name}_{vec_name}.png"
                plt.savefig(cm_path)
                plt.close()

                mlflow.log_artifact(cm_path)

                mlflow.sklearn.log_model(model, artifact_path="model")




üèÉ View run LogisticRegression_BoW at: https://dagshub.com/ChRaviTeja1901/mlops-mini-project.mlflow/#/experiments/2/runs/975e23e9658b4f12837406446b2f4629
üß™ View experiment at: https://dagshub.com/ChRaviTeja1901/mlops-mini-project.mlflow/#/experiments/2




üèÉ View run RandomForest_BoW at: https://dagshub.com/ChRaviTeja1901/mlops-mini-project.mlflow/#/experiments/2/runs/845a7d9f11a24834aca2e69fe96a6224
üß™ View experiment at: https://dagshub.com/ChRaviTeja1901/mlops-mini-project.mlflow/#/experiments/2




üèÉ View run GradientBoosting_BoW at: https://dagshub.com/ChRaviTeja1901/mlops-mini-project.mlflow/#/experiments/2/runs/d90eed88500b44b9a8c03f5178249666
üß™ View experiment at: https://dagshub.com/ChRaviTeja1901/mlops-mini-project.mlflow/#/experiments/2


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


üèÉ View run XGBoost_BoW at: https://dagshub.com/ChRaviTeja1901/mlops-mini-project.mlflow/#/experiments/2/runs/d8813a6019564c898b160024b0a326c8
üß™ View experiment at: https://dagshub.com/ChRaviTeja1901/mlops-mini-project.mlflow/#/experiments/2




üèÉ View run MultinomialNB_BoW at: https://dagshub.com/ChRaviTeja1901/mlops-mini-project.mlflow/#/experiments/2/runs/0802c3b2326b4533b227fe6dedca8969
üß™ View experiment at: https://dagshub.com/ChRaviTeja1901/mlops-mini-project.mlflow/#/experiments/2




üèÉ View run LogisticRegression_TFIDF at: https://dagshub.com/ChRaviTeja1901/mlops-mini-project.mlflow/#/experiments/2/runs/510b9b306a9242a09dcdcac89ded13a7
üß™ View experiment at: https://dagshub.com/ChRaviTeja1901/mlops-mini-project.mlflow/#/experiments/2




üèÉ View run RandomForest_TFIDF at: https://dagshub.com/ChRaviTeja1901/mlops-mini-project.mlflow/#/experiments/2/runs/4d788f790f5145bba820168a2e58f300
üß™ View experiment at: https://dagshub.com/ChRaviTeja1901/mlops-mini-project.mlflow/#/experiments/2




üèÉ View run GradientBoosting_TFIDF at: https://dagshub.com/ChRaviTeja1901/mlops-mini-project.mlflow/#/experiments/2/runs/fd0e136b8e3c4367b1316fc2dcb9e6b8
üß™ View experiment at: https://dagshub.com/ChRaviTeja1901/mlops-mini-project.mlflow/#/experiments/2


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


üèÉ View run XGBoost_TFIDF at: https://dagshub.com/ChRaviTeja1901/mlops-mini-project.mlflow/#/experiments/2/runs/5935af300cfa4b73b02218f6b3d83aee
üß™ View experiment at: https://dagshub.com/ChRaviTeja1901/mlops-mini-project.mlflow/#/experiments/2




üèÉ View run MultinomialNB_TFIDF at: https://dagshub.com/ChRaviTeja1901/mlops-mini-project.mlflow/#/experiments/2/runs/303d54821fb644a18c680dc62e9cc0c0
üß™ View experiment at: https://dagshub.com/ChRaviTeja1901/mlops-mini-project.mlflow/#/experiments/2
üèÉ View run Text_Classification_Experiment at: https://dagshub.com/ChRaviTeja1901/mlops-mini-project.mlflow/#/experiments/2/runs/30b656d1cfdd44e3b5f10241afdc3852
üß™ View experiment at: https://dagshub.com/ChRaviTeja1901/mlops-mini-project.mlflow/#/experiments/2
