In [1]:
import os

# Step one directory up from the notebook's folder (presumably to the project
# root, so that relative paths such as the config file resolve — confirm).
# NOTE: this is not idempotent; re-running the cell moves up one more level.
os.chdir(os.pardir)


In [2]:
%pwd

'c:\\Users\\ajay\\Desktop\\myPortfolio\\CommentAnalysis'

In [3]:

from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class EvaluationConfig:
    """
    Immutable bundle of paths and settings for the model-evaluation stage.

    The field order below is part of the generated constructor signature,
    so it must not be rearranged.
    """

    # Trained model file that gets loaded for evaluation.
    path_of_model: Path
    # Training feature matrix (read with scipy.sparse.load_npz by the evaluator).
    x_train_file_path: Path
    # Training labels file.
    y_train_file_path: Path
    # Test data file; the evaluator reads its 'clean_comment' and 'category' columns.
    testing_data: Path
    # MLflow tracking URI.
    mlflow_uri: str
    # Output directory of the evaluation stage (confusion-matrix images go here).
    root_dir: Path
    # Fitted vectorizer file used to transform the test comments.
    transformer: Path
    # JSON file that receives the run id, model path and metrics.
    file_path: Path
    # Params YAML whose content is logged to MLflow.
    params_file_path: Path


In [4]:
from src.CommentAnalysis.constants import *
from src.CommentAnalysis.utils.common import read_yaml, create_directories, save_json
class ConfigurationManager:
    """
    Load the project configuration and hand out per-stage config objects.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """
        Read the main config file and make sure the artifacts root exists.

        Args:
            config_filepath: Location of the YAML config handed to read_yaml.
            params_filepath: Location of the params file. It is stored as-is
                (not parsed here) and later forwarded as
                EvaluationConfig.params_file_path.
        """
        loaded_config = read_yaml(config_filepath)
        self.config = loaded_config
        self.params = params_filepath
        create_directories([loaded_config.artifacts_root])

    def get_evaluation_config(self) -> EvaluationConfig:
        """
        Assemble the EvaluationConfig from the Model_Evaluation, prepare_model
        and data_Transformation sections of the loaded config.

        The evaluation root directory is created before the config object
        is built.
        """
        evaluation_section = self.config.Model_Evaluation
        model_section = self.config.prepare_model
        transformation_section = self.config.data_Transformation

        create_directories([evaluation_section.root_dir])

        # Collected in a dict first so the call site stays a one-liner.
        settings = {
            "path_of_model": model_section.trained_model_path,
            "x_train_file_path": transformation_section.x_train_file_path,
            "y_train_file_path": transformation_section.y_train_file_path,
            "testing_data": transformation_section.transform_test_file,
            "mlflow_uri": "https://dagshub.com/AIwithAj/CommentAnalysis.mlflow",
            "params_file_path": self.params,
            "root_dir": evaluation_section.root_dir,
            "transformer": transformation_section.transformer,
            "file_path": evaluation_section.file_path,
        }
        return EvaluationConfig(**settings)

In [None]:
import os
import json
import yaml
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse
from sklearn.metrics import classification_report, confusion_matrix
from mlflow.models import infer_signature
import mlflow
import mlflow.sklearn
import dagshub

from src.CommentAnalysis.utils.common import load_vectorizer, load_bin, read_data
from src.CommentAnalysis import logger


class Evaluation:
    """
    Evaluate a trained model on train and test data and record the results
    (model, parameters, metrics, confusion matrices, tags) in MLflow.
    """

    def __init__(self, config):
        """
        Initialize the Evaluation class with config.

        Args:
            config: Object exposing the attributes read by this class:
                path_of_model, transformer, x_train_file_path,
                y_train_file_path, testing_data, params_file_path,
                root_dir and file_path.
        """
        self.config = config

    def save_model_info(self, run_id: str, model_path: str, file_path: str, metrics: dict = None) -> None:
        """
        Save the model run ID, path and optionally metrics to a JSON file.

        Args:
            run_id: MLflow run identifier to store.
            model_path: Artifact path of the logged model.
            file_path: Destination JSON file (overwritten if it exists).
            metrics: Optional metrics mapping; omitted from the file when
                it is None or empty.
        """
        model_info = {'run_id': run_id, 'model_path': model_path}
        if metrics:
            model_info['metrics'] = metrics

        with open(file_path, 'w') as file:
            json.dump(model_info, file, indent=4)

        logger.info(f'Model info and metrics saved to {file_path}')

    def evaluate_model(self, model, X, y):
        """
        Evaluate the model and return classification report & confusion matrix.

        Args:
            model: Fitted estimator exposing ``predict``.
            X: Feature matrix passed to ``model.predict``.
            y: True labels compared against the predictions.

        Returns:
            Tuple ``(report, cm)``: the classification report as a dict and
            the confusion matrix.
        """
        y_pred = model.predict(X)
        report = classification_report(y, y_pred, output_dict=True)
        cm = confusion_matrix(y, y_pred)
        logger.info('Model evaluation completed')
        return report, cm

    def log_confusion_matrix(self, cm, dataset_name: str):
        """
        Render the confusion matrix as a heatmap, save it as a PNG under
        ``config.root_dir`` and log that file as an MLflow artifact.

        Args:
            cm: Confusion matrix to plot.
            dataset_name: Label used in the plot title and the file name.
        """
        # os.fspath accepts both str and Path, so root_dir may be either
        # (plain string concatenation fails for a Path).
        cm_file_path = os.path.join(
            os.fspath(self.config.root_dir),
            f'confusion_matrix_{dataset_name}.png',
        )

        fig, ax = plt.subplots(figsize=(8, 6))
        try:
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
            ax.set_title(f'Confusion Matrix: {dataset_name}')
            ax.set_xlabel('Predicted')
            ax.set_ylabel('Actual')
            fig.savefig(cm_file_path)
            mlflow.log_artifact(cm_file_path)
        finally:
            # Always release the figure, even when saving or uploading fails.
            plt.close(fig)
        logger.info(f'Confusion matrix logged for {dataset_name}')

    def log_metrics(self, report, prefix) -> dict:
        """
        Log metrics from classification report with a prefix.

        Precision, recall and f1-score are logged for every dict-valued
        entry of the report; accuracy is logged as a single value.

        Returns:
            A dict of the main metrics for the JSON file: accuracy plus the
            'weighted avg' precision, recall and f1-score, keyed with the
            prefix.
        """
        metrics_summary = {}
        for label, metrics in report.items():
            if label == 'accuracy':
                # accuracy is a float
                mlflow.log_metric(f"{prefix}_accuracy", metrics)
                metrics_summary[f"{prefix}_accuracy"] = metrics
            elif isinstance(metrics, dict):
                mlflow.log_metrics({
                    f"{prefix}_{label}_precision": metrics['precision'],
                    f"{prefix}_{label}_recall": metrics['recall'],
                    f"{prefix}_{label}_f1-score": metrics['f1-score']
                })
                if label == 'weighted avg':
                    metrics_summary[f"{prefix}_precision"] = metrics['precision']
                    metrics_summary[f"{prefix}_recall"] = metrics['recall']
                    metrics_summary[f"{prefix}_f1-score"] = metrics['f1-score']
        logger.info(f'Metrics logged for {prefix} -->{metrics_summary}')
        return metrics_summary


    def log_params_from_yaml(self, yaml_path):
        """
        Load params from params.yaml and log to MLflow.

        Top-level scalar values are logged under their key; values nested
        one level deep are logged as ``<key>_<subkey>``. An empty file logs
        nothing.
        """
        with open(yaml_path, 'r') as f:
            # safe_load returns None for an empty document.
            params = yaml.safe_load(f) or {}
        for key, value in params.items():
            if isinstance(value, dict):
                for subkey, subval in value.items():
                    mlflow.log_param(f"{key}_{subkey}", subval)
            else:
                mlflow.log_param(key, value)
        logger.info(f'Parameters from {yaml_path} logged to MLflow')

    def initiate_model_evaluation(self):
        """
        Main function to run model evaluation and log to MLflow.

        Inside a single MLflow run it loads the model, vectorizer and data,
        logs the model with a signature, the vectorizer file and the params,
        evaluates on train and test data, writes the run info JSON and sets
        descriptive tags.

        Any exception raised inside the run is logged and printed but NOT
        re-raised, so this method returns None on failure as well as on
        success.
        """
        dagshub.init(
            repo_owner="AIwithAj",
            repo_name="CommentAnalysis",
            mlflow=True,
        )
        mlflow.set_experiment('dvc-pipeline-runs')

        # Artifact path of the model inside the run; also written to the
        # run-info JSON, so it is defined once.
        model_path = "lgbm_model"

        try:
            with mlflow.start_run() as run:
                logger.info('MLflow run started.')

                # Load artifacts
                model = load_bin(self.config.path_of_model)
                vectorizer = load_vectorizer(self.config.transformer)
                logger.info('Model and vectorizer loaded.')

                X_train_tfidf = sparse.load_npz(self.config.x_train_file_path)
                y_train = read_data(self.config.y_train_file_path)

                # Rows without text or label cannot be scored, so drop them.
                test_data = read_data(self.config.testing_data).dropna(subset=['clean_comment', 'category'])
                X_test_tfidf = vectorizer.transform(test_data['clean_comment'].values)
                y_test = test_data['category'].values

                # Signature & example
                # Slice first, then densify: only the 5 sample rows are
                # converted instead of the whole test matrix.
                sample_rows = X_test_tfidf[:5]
                input_example = pd.DataFrame(
                    sample_rows.toarray(),
                    columns=vectorizer.get_feature_names_out()
                )
                signature = infer_signature(input_example, model.predict(sample_rows))
                logger.info('Input example & signature created.')

                # Log model
                mlflow.sklearn.log_model(
                    model,
                    model_path,
                    signature=signature,
                    input_example=input_example
                )
                logger.info('Model logged to MLflow.')

                # Log vectorizer & params
                mlflow.log_artifact(self.config.transformer)
                self.log_params_from_yaml(self.config.params_file_path)

                # Evaluate & log on Training Data
                train_report, train_cm = self.evaluate_model(model, X_train_tfidf, y_train)
                train_metrics = self.log_metrics(train_report, "train")
                self.log_confusion_matrix(train_cm, "Train Data")

                # Evaluate & log on Test Data
                test_report, test_cm = self.evaluate_model(model, X_test_tfidf, y_test)
                test_metrics = self.log_metrics(test_report, "test")
                self.log_confusion_matrix(test_cm, "Test Data")

                # Save model info with metrics
                metrics_summary = {
                    "train": train_metrics,
                    "test": test_metrics
                }
                self.save_model_info(run.info.run_id, model_path, self.config.file_path, metrics=metrics_summary)
                logger.info(f"loggig and saving evaluation metrics successfully--{ self.config.file_path}")
                # MLflow tags
                mlflow.set_tag("model_type", "LightGBM")
                mlflow.set_tag("task", "Sentiment Analysis")
                mlflow.set_tag("dataset", "YouTube Comments")

                logger.info("Model evaluation and logging completed successfully.")

        except Exception as e:
            # NOTE(review): the error is reported but swallowed, so callers
            # cannot detect a failed evaluation — confirm this is intended.
            logger.exception(f"Failed to complete model evaluation: {e}")
            print(f"Error: {e}")


In [12]:
# Build the evaluation config and run the evaluation stage end to end.
# No try/except wrapper here: catching an exception only to re-raise it
# adds nothing, since an uncaught exception propagates the same way.
config = ConfigurationManager()
eval_config = config.get_evaluation_config()
evaluation = Evaluation(eval_config)
evaluation.initiate_model_evaluation()

[2025-07-10 21:43:31,180: INFO: common: YAML file loaded successfully: config\config.yaml]
[2025-07-10 21:43:31,180: INFO: common: Created directory: artifacts]
[2025-07-10 21:43:31,188: INFO: common: Created directory: artifacts/Model_Evaluation]


[2025-07-10 21:43:32,870: INFO: _client: HTTP Request: GET https://dagshub.com/api/v1/repos/AIwithAj/CommentAnalysis "HTTP/1.1 200 OK"]


[2025-07-10 21:43:32,895: INFO: helpers: Initialized MLflow to track repo "AIwithAj/CommentAnalysis"]


[2025-07-10 21:43:32,900: INFO: helpers: Repository AIwithAj/CommentAnalysis initialized!]
[2025-07-10 21:43:34,224: INFO: 1966844431: MLflow run started.]
[2025-07-10 21:43:34,470: INFO: common: Binary file loaded: artifacts/prepare_base_model/trained_model.pkl]
[2025-07-10 21:43:34,636: INFO: 1966844431: Model and vectorizer loaded.]




[2025-07-10 21:43:37,487: INFO: 1966844431: Input example & signature created.]
[2025-07-10 21:45:06,193: INFO: 1966844431: Model logged to MLflow.]
[2025-07-10 21:45:10,978: INFO: 1966844431: Parameters from params.yaml logged to MLflow]




[2025-07-10 21:45:15,500: INFO: 1966844431: Model evaluation completed]
[2025-07-10 21:45:18,153: INFO: 1966844431: Metrics logged for train--->{'train_precision': 0.9356483185453919, 'train_recall': 0.9323282969344147, 'train_f1-score': 0.9324334295052213}]
[2025-07-10 21:45:19,114: INFO: 1966844431: Confusion matrix logged for Train Data]




[2025-07-10 21:45:20,407: INFO: 1966844431: Model evaluation completed]
[2025-07-10 21:45:23,116: INFO: 1966844431: Metrics logged for test--->{'test_precision': 0.8634689945082465, 'test_recall': 0.8624338624338624, 'test_f1-score': 0.8614695631272568}]
[2025-07-10 21:45:24,407: INFO: 1966844431: Confusion matrix logged for Test Data]
[2025-07-10 21:45:24,416: INFO: 1966844431: Model info and metrics saved to artifacts/Model_Evaluation/experiment_info.json]
[2025-07-10 21:45:24,425: INFO: 1966844431: loggig and saving evaluation metrics successfully--artifacts/Model_Evaluation/experiment_info.json]
[2025-07-10 21:45:25,976: INFO: 1966844431: Model evaluation and logging completed successfully.]


In [None]:
# pd.read_csv(r"C:\Users\ajay\Desktop\myPortfolio\CommentAnalysis\artifacts\data_Transformation\test\transform_test_file.csv").isnull().sum()