In [42]:
import os
from getpass import getpass

# Non-secret MLflow settings: keep any value already exported in the
# environment, otherwise fall back to this project's DagsHub tracking server.
os.environ.setdefault(
    "MLFLOW_TRACKING_URI",
    "https://dagshub.com/akatoshleiwu/datascienceproject_fullflow.mlflow",
)
os.environ.setdefault("MLFLOW_TRACKING_USERNAME", "akatoshleiwu")

# The access token is a credential and must never be stored in the notebook:
# notebooks are committed and shared together with their source and outputs.
# Read it from the environment, or prompt for it without echoing.
# NOTE(review): any token that was previously committed in this cell should be
# treated as leaked and revoked in the DagsHub account settings.
if not os.environ.get("MLFLOW_TRACKING_PASSWORD"):
    os.environ["MLFLOW_TRACKING_PASSWORD"] = getpass("DagsHub access token: ")

In [43]:
import os

# Display the kernel's current working directory as the cell result.
os.getcwd()

'c:\\Users\\akato\\Desktop\\MLOps\\datascienceproject_fullflow'

In [5]:
os.chdir("../")
%pwd

'c:\\Users\\akato\\Desktop\\MLOps\\datascienceproject_fullflow'

In [44]:
# set up entity

from dataclasses import dataclass
from pathlib import Path

@dataclass
class ModelEvaluationConfig:
    """Settings consumed by the model-evaluation stage.

    Instances are built by ``ConfigurationManager.get_model_evaluation_config``,
    which passes the values through from the loaded YAML files without any
    conversion, so the ``Path`` annotations describe intent rather than a
    guaranteed runtime type.
    """
    # Output directory of the evaluation stage (created when the config is built).
    root_dir: Path
    # Location of the test CSV that is parsed for evaluation.
    test_data_path: Path
    # Location of the serialized model that is loaded with joblib.
    model_path: Path
    # Model hyper-parameters; logged to MLflow as the run's parameters.
    all_params: dict
    # Destination of the locally saved metrics JSON.
    metrics_file_name: Path
    # Name of the label column: dropped from the features, used as ground truth.
    TARGET_COLUMN: str
    # URI of the MLflow tracking server.
    mlflow_uri: str

In [45]:
# set up configuration manager
from src.datascienceproject.constant import *  # Import all constants
from src.datascienceproject.utils.common import read_yaml, create_directories,save_json  # Import specific utility functions

In [46]:
class ConfigurationManager:
    """Load the project's YAML files and build per-stage configuration objects."""

    # Fallback tracking server, used only when MLFLOW_TRACKING_URI is not set.
    DEFAULT_MLFLOW_URI = "https://dagshub.com/akatoshleiwu/datascienceproject_fullflow.mlflow"

    def __init__(self,
                 config_filepath=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH,
                 schema_filepath=SCHEMA_FILE_PATH):
        """Read the config, params and schema YAML files and create the artifacts root.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the model-parameters YAML.
            schema_filepath: Location of the data-schema YAML.
        """
        # read_yaml is called with plain strings, hence the str() conversions.
        self.config = read_yaml(str(config_filepath))
        self.params = read_yaml(str(params_filepath))
        self.schema = read_yaml(str(schema_filepath))

        create_directories([self.config.artifacts_root])

    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        """Assemble the settings for the model-evaluation stage.

        Creates the stage's root directory as a side effect.

        Returns:
            ModelEvaluationConfig: Paths from the ``model_evaluation`` config
            section, the ``ElasticNet`` parameters, the target column name from
            the schema, and the MLflow tracking URI.
        """
        config = self.config.model_evaluation
        params = self.params.ElasticNet
        target_column = self.schema.TARGET_COLUMN

        create_directories([config.root_dir])

        # Prefer the tracking URI configured in the environment so the server is
        # defined in one place; fall back to the project default otherwise.
        mlflow_uri = os.environ.get("MLFLOW_TRACKING_URI") or self.DEFAULT_MLFLOW_URI

        return ModelEvaluationConfig(
            root_dir=config.root_dir,
            test_data_path=config.test_data_path,
            model_path=config.model_path,
            all_params=params,
            metrics_file_name=config.metrics_file_name,
            TARGET_COLUMN=target_column.name,
            mlflow_uri=mlflow_uri,
        )

In [47]:
# set up evaluation metrics
import os
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from urllib.parse import urlparse
import mlflow
import mlflow.sklearn
import numpy as np
import joblib

In [48]:
# Sanity-check the transformed test split: list its columns and preview the rows.
test_data = pd.read_csv(Path("artifacts/data_transformation/test.csv"), sep=';')
print("Columns in test data:", test_data.columns.tolist(), sep="\n")
print("\nFirst few rows:", test_data.head(), sep="\n")

Columns in test data:
['fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"']

First few rows:
  fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"
0  60.290.4110.80.048551490.99373.090.5910.966666...                                                                                                                     
1        5.40.530.162.70.036341280.988563.20.5313.28                                                                                                                     
2        7.10.250.392.10.036301240.99083.280.4312.28                                                                                                                     
3       7.30.280.351.60.054311480.991783.180.4710.75                                                         

In [60]:
class ModelEvaluation:
    """Score a persisted model on the test split and record the results in MLflow."""

    def __init__(self, config: "ModelEvaluationConfig"):
        """Store the evaluation settings.

        Args:
            config: Paths, parameters, target column and tracking URI to use.
        """
        self.config = config

    def _preprocess_data(self, file_path):
        """Parse a semicolon-separated file whose header may be wrapped in quotes.

        Args:
            file_path: Location of the delimited text file.

        Returns:
            pandas.DataFrame: One column per header field, all converted to
            numeric; the target column is additionally downcast to the smallest
            integer type when its values allow it.

        Raises:
            ValueError: If the file has no header row, or a data row does not
                have the same number of fields as the header.
        """
        # Explicit encoding so parsing does not depend on the platform default.
        with open(file_path, 'r', encoding='utf-8') as f:
            numbered_lines = [(line_no, raw.strip()) for line_no, raw in enumerate(f, start=1)]

        # Blank lines (typically a trailing newline) carry no data; keeping them
        # would produce all-NaN rows that break prediction later on.
        numbered_lines = [(line_no, text) for line_no, text in numbered_lines if text]
        if not numbered_lines:
            raise ValueError(f"No header row found in {file_path}")

        # Get column names from the first line, removing the extra quotes
        header_line = numbered_lines[0][1]
        headers = [h.strip('"') for h in header_line.strip('"').split(';')]

        # Process data rows, rejecting malformed ones instead of padding them
        data_rows = []
        for line_no, text in numbered_lines[1:]:
            values = [v.strip().strip('"') for v in text.split(';')]
            if len(values) != len(headers):
                raise ValueError(
                    f"{file_path}, line {line_no}: expected {len(headers)} fields, "
                    f"found {len(values)}"
                )
            data_rows.append(values)

        df = pd.DataFrame(data_rows, columns=headers)

        # Features stay at the default numeric dtype; only the configured target
        # column is downcast to an integer type.
        target_column = self.config.TARGET_COLUMN
        for col in df.columns:
            downcast = 'integer' if col == target_column else None
            df[col] = pd.to_numeric(df[col], errors='coerce', downcast=downcast)

        # errors='coerce' hides unparsable values as NaN; report them explicitly.
        nan_counts = df.isna().sum()
        nan_counts = nan_counts[nan_counts > 0]
        if not nan_counts.empty:
            print(f"Warning: non-numeric values coerced to NaN: {nan_counts.to_dict()}")

        print(f"Processed data columns: {df.columns.tolist()}")
        print(f"First few rows of processed data:\n{df.head()}")

        return df

    def eval_metrics(self, actual, pred):
        """Compute regression metrics for a set of predictions.

        Args:
            actual: Ground-truth target values.
            pred: Predicted target values.

        Returns:
            tuple: ``(rmse, mae, r2)``.
        """
        rmse = np.sqrt(mean_squared_error(actual, pred))
        mae = mean_absolute_error(actual, pred)
        r2 = r2_score(actual, pred)
        return rmse, mae, r2

    def log_into_mlflow(self):
        """Evaluate the stored model and log parameters, metrics and the model.

        The metrics are also written to ``config.metrics_file_name``. A failure
        while uploading the model artifact is reported but does not abort the
        run, so the parameters and metrics logged before it are kept.
        """
        test_data = self._preprocess_data(self.config.test_data_path)
        model = joblib.load(self.config.model_path)

        test_x = test_data.drop([self.config.TARGET_COLUMN], axis=1)
        test_y = test_data[[self.config.TARGET_COLUMN]]

        # Point MLflow at the configured tracking server. The registry URI is
        # deliberately left untouched for DagsHub.
        if self.config.mlflow_uri:
            mlflow.set_tracking_uri(self.config.mlflow_uri)

        with mlflow.start_run():
            predicted_qualities = model.predict(test_x)

            (rmse, mae, r2) = self.eval_metrics(test_y, predicted_qualities)

            # Save metrics locally.
            # NOTE(review): save_json is defined outside this notebook; the
            # anyio Path wrapper is kept as-is because its accepted path type
            # cannot be confirmed from here - verify and switch to pathlib.Path.
            from anyio import Path as AnyioPath
            scores = {"rmse": rmse, "mae": mae, "r2": r2}
            save_json(path=AnyioPath(str(self.config.metrics_file_name)), data=scores)

            # Log parameters and metrics to MLflow
            mlflow.log_params(self.config.all_params)
            mlflow.log_metric("rmse", rmse)
            mlflow.log_metric("r2", r2)
            mlflow.log_metric("mae", mae)

            # Log the model without registration (best effort)
            try:
                mlflow.sklearn.log_model(
                    sk_model=model,
                    artifact_path="model",
                )
                print(f"Model logged successfully in run {mlflow.active_run().info.run_id}")
            except Exception as e:
                print(f"Warning: Error while logging model: {str(e)}")
                print("Continuing with metric logging...")
    


In [61]:
# Build the evaluation settings and run the evaluation stage. Exceptions are
# left to propagate: the former `except Exception as e: raise e` wrapper only
# re-raised them and added nothing but an extra traceback frame.
config = ConfigurationManager()
model_evaluation_config = config.get_model_evaluation_config()
model_evaluation = ModelEvaluation(config=model_evaluation_config)
model_evaluation.log_into_mlflow()

[2025-08-28 07:57:14,758] INFO in common: YAML file c:\Users\akato\Desktop\MLOps\datascienceproject_fullflow\configs\config.yaml loaded successfully.
[2025-08-28 07:57:14,762] INFO in common: YAML file c:\Users\akato\Desktop\MLOps\datascienceproject_fullflow\params.yaml loaded successfully.
[2025-08-28 07:57:14,768] INFO in common: YAML file c:\Users\akato\Desktop\MLOps\datascienceproject_fullflow\schema.yaml loaded successfully.
[2025-08-28 07:57:14,771] INFO in common: created directory at: artifacts
[2025-08-28 07:57:14,773] INFO in common: created directory at: artifacts/model_evaluation
[2025-08-28 07:57:14,762] INFO in common: YAML file c:\Users\akato\Desktop\MLOps\datascienceproject_fullflow\params.yaml loaded successfully.
[2025-08-28 07:57:14,768] INFO in common: YAML file c:\Users\akato\Desktop\MLOps\datascienceproject_fullflow\schema.yaml loaded successfully.
[2025-08-28 07:57:14,771] INFO in common: created directory at: artifacts
[2025-08-28 07:57:14,773] INFO in common: c



Continuing with metric logging...
🏃 View run adventurous-robin-17 at: https://dagshub.com/akatoshleiwu/datascienceproject_fullflow.mlflow/#/experiments/0/runs/d9380992b7ac488fb728e2b6ad180fb1
🧪 View experiment at: https://dagshub.com/akatoshleiwu/datascienceproject_fullflow.mlflow/#/experiments/0
🏃 View run adventurous-robin-17 at: https://dagshub.com/akatoshleiwu/datascienceproject_fullflow.mlflow/#/experiments/0/runs/d9380992b7ac488fb728e2b6ad180fb1
🧪 View experiment at: https://dagshub.com/akatoshleiwu/datascienceproject_fullflow.mlflow/#/experiments/0


In [22]:
# Debug cell - Check test data
# Inspects how pandas parses the transformed test CSV, shows the raw header and
# first data line, and prints the target column name taken from the config.
import pandas as pd
from pathlib import Path

# First check if the file exists
test_data_path = Path("artifacts/data_transformation/test.csv")
print(f"Checking file at: {test_data_path.absolute()}")
print(f"File exists: {test_data_path.exists()}\n")

try:
    # Try reading with explicit encoding
    test_data = pd.read_csv(test_data_path, sep=';', encoding='utf-8')

    print("Test data info:")
    # DataFrame.info() writes its report itself and returns None, so wrapping it
    # in print() would append a stray "None" line to the output.
    test_data.info()

    print("\nTest data columns:")
    print(test_data.columns.tolist())

    print("\nFirst few rows:")
    print(test_data.head())

    # Print raw content of first few lines to check for formatting issues
    print("\nRaw content of first few lines:")
    with open(test_data_path, 'r', encoding='utf-8') as f:
        print(f.readline().strip())  # header line
        print(f.readline().strip())  # first data line

    # Print target column name for verification
    config = ConfigurationManager()
    model_evaluation_config = config.get_model_evaluation_config()
    print("\nTarget column from config:")
    print(f"Original: '{model_evaluation_config.TARGET_COLUMN}'")
    stripped_target = model_evaluation_config.TARGET_COLUMN.strip().strip('"')
    print(f"Stripped: '{stripped_target}'")

except Exception as e:
    print(f"Error reading file: {str(e)}")

    # If there was an error, try to read the raw file content
    try:
        print("\nAttempting to read raw file content:")
        with open(test_data_path, 'r', encoding='utf-8') as f:
            print(f.read(500))  # Read first 500 characters
    except Exception as e2:
        print(f"Error reading raw file: {str(e2)}")

Checking file at: c:\Users\akato\Desktop\MLOps\datascienceproject_fullflow\artifacts\data_transformation\test.csv
File exists: True

Test data info:
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 980 entries, (np.float64(6.0), np.float64(0.29), np.float64(0.41), np.float64(10.8), np.float64(0.048), np.float64(55.0), np.float64(149.0), np.float64(0.9937), np.float64(3.09), np.float64(0.59), np.float64(10.9666666666667)) to (np.float64(6.7), np.float64(0.22), np.float64(0.39), np.float64(1.2), np.float64(0.049), np.float64(26.0), np.float64(152.0), np.float64(0.99346), np.float64(3.5), np.float64(0.47), np.float64(10.0))
Data columns (total 1 columns):
 #   Column                                                                                                                                                                   Non-Null Count  Dtype
---  ------                                                                                                                                   