In [1]:
import os
%pwd

'c:\\Users\\akato\\Desktop\\MLOps\\datascienceproject_fullflow\\research'

In [2]:
os.chdir("../")
%pwd

'c:\\Users\\akato\\Desktop\\MLOps\\datascienceproject_fullflow'

In [22]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class ModelTrainerConfig:
    root_dir: Path
    train_data_path: Path
    test_data_path: Path
    model_name: str
    alpha: float
    l1_ratio: float
    target_column: str

In [4]:
# Import constants and utility functions from the project
# Code transfered to src.datascienceproject.config.configuration.py
from src.datascienceproject.constant import *  # Import all constants
from src.datascienceproject.utils.common import read_yaml, create_directories  # Import specific utility functions

In [23]:
class ConfigurationManager:
    def __init__(self,
                 config_filepath=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH,
                 schema_filepath=SCHEMA_FILE_PATH):
        # Convert paths to strings for read_yaml
        self.config = read_yaml(str(config_filepath))
        self.params = read_yaml(str(params_filepath))
        self.schema = read_yaml(str(schema_filepath))

        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        config = self.config.model_training
        params = self.params.ElasticNet
        schema = self.schema.TARGET_COLUMN

        create_directories([config.root_dir])

        model_trainer_config = ModelTrainerConfig(
            root_dir=Path(config.root_dir),
            train_data_path=Path(config.train_data_path),
            test_data_path=Path(config.test_data_path),
            model_name=config.model_name,
            alpha=params.alpha,
            l1_ratio=params.l1_ratio,
            target_column=schema.name
        )

        return model_trainer_config

In [24]:
import pandas as pd
import os
from src.datascienceproject import logger
from sklearn.linear_model import ElasticNet
import joblib

In [30]:
# Check the training data columns
train_data = pd.read_csv(Path("artifacts/data_transformation/train.csv"), sep=";")
print("Training data columns:")
print(train_data.columns.tolist())

# Verify that the quality column exists and its data type
if 'quality' in train_data.columns:
    print("\nQuality column data type:", train_data['quality'].dtype)
    print("First few values of quality column:")
    print(train_data['quality'].head())

Training data columns:
['fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"']


In [33]:
# Debug: Read and inspect the raw data
with open(Path("artifacts/data_transformation/train.csv"), 'r') as f:
    print("First few lines of raw data:")
    for i, line in enumerate(f):
        if i < 5:  # Print first 5 lines
            print(f"Line {i+1}: {line.strip()}")
            
print("\nTrying to read with different separators:")
# Try reading with comma
df_comma = pd.read_csv(Path("artifacts/data_transformation/train.csv"))
print("\nColumns with comma separator:", df_comma.columns.tolist())

# Try reading the first few rows as plain text
print("\nShape of data:", df_comma.shape)
print("\nFirst row of data:")
print(df_comma.iloc[0])

First few lines of raw data:
Line 1: "fixed acidity;""volatile acidity"";""citric acid"";""residual sugar"";""chlorides"";""free sulfur dioxide"";""total sulfur dioxide"";""density"";""pH"";""sulphates"";""alcohol"";""quality"""
Line 2: 7.3;0.17;0.36;8.2;0.028;44;111;0.99272;3.14;0.41;12.4;6
Line 3: 6.3;0.25;0.44;11.6;0.041;48;195;0.9968;3.18;0.52;9.5;5
Line 4: 5.6;0.32;0.33;7.4;0.037;25;95;0.99268;3.25;0.49;11.1;6
Line 5: 6.9;0.19;0.35;1.7;0.036;33;101;0.99315;3.21;0.54;10.8;7

Trying to read with different separators:

Columns with comma separator: ['fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"']

Shape of data: (3918, 1)

First row of data:
fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"    7.3;0.17;0.36;8.2;0.028;44;111;0.99272;3.14;0....
Name

In [34]:
class ModelTrainer:
    def __init__(self, config: ModelTrainerConfig):
        self.config = config

    def _preprocess_data(self, file_path):
        # Read the raw data
        with open(file_path, 'r') as f:
            lines = f.readlines()
        
        # Get column names from first line, removing extra quotes
        headers = lines[0].strip().strip('"').split(';')
        headers = [h.strip('"') for h in headers]
        
        # Process data rows
        data_rows = []
        for line in lines[1:]:
            values = line.strip().split(';')
            data_rows.append(values)
            
        # Create DataFrame
        df = pd.DataFrame(data_rows, columns=headers)
        
        # Convert numeric columns to appropriate types
        for col in df.columns:
            if col != 'quality':  # All columns except quality should be float
                df[col] = pd.to_numeric(df[col], errors='coerce')
            else:
                df[col] = pd.to_numeric(df[col], errors='coerce', downcast='integer')
                
        return df

    def train(self):
        # Load and preprocess data
        train_data = self._preprocess_data(self.config.train_data_path)
        test_data = self._preprocess_data(self.config.test_data_path)
        
        print("Available columns in training data:")
        print(train_data.columns.tolist())
        print("\nTarget column we're looking for:", self.config.target_column)
        
        # Add separator for readability
        print("\nFirst few rows of training data:")
        print(train_data.head())
        
        train_X = train_data.drop(columns=[self.config.target_column], axis=1)
        test_X = test_data.drop(columns=[self.config.target_column], axis=1)
        train_y = train_data[self.config.target_column]
        test_y = test_data[self.config.target_column]

        lr = ElasticNet(alpha=self.config.alpha, l1_ratio=self.config.l1_ratio, random_state=42)
        lr.fit(train_X, train_y)

        joblib.dump(lr, os.path.join(self.config.root_dir, self.config.model_name))
        
        logger.info("Model training completed.")

In [35]:
try:
    config = ConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    model_trainer = ModelTrainer(config=model_trainer_config)
    model_trainer.train()
except Exception as e:
    raise e

[2025-08-27 01:17:59,815] INFO in common: YAML file c:\Users\akato\Desktop\MLOps\datascienceproject_fullflow\configs\config.yaml loaded successfully.
[2025-08-27 01:17:59,824] INFO in common: YAML file c:\Users\akato\Desktop\MLOps\datascienceproject_fullflow\params.yaml loaded successfully.
[2025-08-27 01:17:59,824] INFO in common: YAML file c:\Users\akato\Desktop\MLOps\datascienceproject_fullflow\params.yaml loaded successfully.
[2025-08-27 01:17:59,835] INFO in common: YAML file c:\Users\akato\Desktop\MLOps\datascienceproject_fullflow\schema.yaml loaded successfully.
[2025-08-27 01:17:59,835] INFO in common: YAML file c:\Users\akato\Desktop\MLOps\datascienceproject_fullflow\schema.yaml loaded successfully.


[2025-08-27 01:17:59,838] INFO in common: created directory at: artifacts
[2025-08-27 01:17:59,842] INFO in common: created directory at: artifacts/model_training
[2025-08-27 01:17:59,842] INFO in common: created directory at: artifacts/model_training
Available columns in training data:
['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality']

Target column we're looking for: quality

First few rows of training data:
   fixed acidity  volatile acidity  citric acid  ...  sulphates  alcohol  quality
0            7.3              0.17         0.36  ...       0.41     12.4        6
1            6.3              0.25         0.44  ...       0.52      9.5        5
2            5.6              0.32         0.33  ...       0.49     11.1        6
3            6.9              0.19         0.35  ...       0.54     10.8        7
4            7.7              0.30         