In [36]:
import os
import pickle
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.linear_model import ElasticNet, LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [26]:
# Make "Wine_Prediction" the active MLflow experiment; per the captured log
# output below, MLflow creates it when it does not exist yet.
# NOTE(review): the recorded artifact_location is a local `mlruns` folder, which
# presumably resolves against the current working directory — confirm this cell
# is meant to run after the chdir cell on a fresh kernel.
mlflow.set_experiment("Wine_Prediction")

2025/03/12 21:44:15 INFO mlflow.tracking.fluent: Experiment with name 'Wine_Prediction' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///c:/Users/jagad/OneDrive/Desktop/MLP/mlruns/951629547521299073', creation_time=1741796055755, experiment_id='951629547521299073', last_update_time=1741796055755, lifecycle_stage='active', name='Wine_Prediction', tags={}>

In [2]:
# Step up to the parent directory so the relative paths used afterwards
# ('artifacts/train.csv', 'artifacts/test.csv') resolve.
# The guard makes the cell idempotent: once 'artifacts' is visible from the
# working directory, re-running the cell no longer climbs another level.
# NOTE(review): presumably the notebook lives one level below the project root — confirm.
if not os.path.isdir("artifacts"):
    os.chdir("../")

In [None]:
from src.logs import log_message

In [5]:
# Read the train and test CSV files from the artifacts folder in one pass.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('artifacts/train.csv', 'artifacts/test.csv')
)

In [6]:
# Display the (rows, columns) of the training frame as a quick sanity check.
train_data.shape

(1019, 12)

In [7]:
# Display the (rows, columns) of the test frame as a quick sanity check.
test_data.shape

(340, 12)

In [11]:
# Feature matrices: every column except the 'quality' target.
X_train, X_test = (
    train_data.drop(columns='quality'),
    test_data.drop(columns='quality'),
)

In [12]:
# Preview the first rows of the training features (target column removed).
X_train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,0.793825,0.181492,0.193089,0.783591,0.415423,-0.072545,-0.003004,0.664157,-1.326576,1.966013,-0.400592
1,0.43188,0.068623,1.114715,0.948245,0.686,2.567313,2.070433,0.831112,-1.526674,-0.290954,-1.148078
2,0.552528,-1.229372,0.449096,-0.204333,-0.125732,-0.478677,-0.488965,-0.309744,0.14081,1.529181,1.468125
3,0.250907,-0.608592,0.909909,0.124975,-0.50454,1.653516,0.418163,-0.298614,0.14081,0.873932,1.468125
4,1.035122,-1.172938,1.626729,-0.204333,-0.233963,-1.290941,-1.007325,1.498929,-0.726282,0.946738,-0.774335


In [13]:
# Preview the first rows of the test features (target column removed).
X_test.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,-0.653957,-0.83433,0.193089,-0.533642,-1.370388,-0.072545,-0.55376,-1.105561,-0.526184,0.801127,0.814074
1,0.250907,0.068623,-0.062919,0.289629,0.848347,-0.884809,-0.488965,0.497203,-0.259386,-0.363759,-1.054643
2,-0.593632,-0.21355,-0.882142,0.454283,-0.721002,-1.189408,-1.039722,-0.604697,-0.059288,-1.310229,0.066588
3,-0.41266,2.38244,-1.240552,-0.533642,0.469538,-0.884809,-0.845337,-0.448873,-0.726282,-0.654981,-0.774335
4,-0.533308,0.012188,-0.779739,-0.698296,2.228291,-1.189408,-1.104517,0.174424,-0.326086,1.529181,-1.148078


In [14]:
# Target vectors: the 'quality' column of each frame.
y_train, y_test = train_data['quality'], test_data['quality']

In [37]:
# Candidate regressors, keyed by the label that is later logged to MLflow.
# Insertion order is the order in which the candidates are trained.
# NOTE(review): the "SVC" label refers to an SVR (regressor) instance; the key
# is kept unchanged so logged labels stay the same.
models = dict(
    RandomForest=RandomForestRegressor(n_estimators=100, random_state=42),
    GradientBoosting=GradientBoostingRegressor(n_estimators=100, random_state=42),
    LinearRegression=LinearRegression(),
    SVC=SVR(),
    ElasticNet=ElasticNet(alpha=0.5, l1_ratio=0.7, random_state=101),
)

In [None]:
# Running best candidate; starts at -inf so the first finite R² always wins.
best_model, best_score = None, float('-inf')

log_message("Training models...")

# One MLflow run per candidate: fit, evaluate on the test split, log, compare.
for name, model in models.items():
    with mlflow.start_run():
        y_pred = model.fit(X_train, y_train).predict(X_test)

        # Same three metrics, computed in the same order (MSE, MAE, R²).
        mse, mae, r2 = (
            score_fn(y_test, y_pred)
            for score_fn in (mean_squared_error, mean_absolute_error, r2_score)
        )

        log_message(f"{name} -> MSE: {mse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")
        mlflow.log_param("model", name)
        for metric_name, metric_value in (("MSE", mse), ("MAE", mae), ("R2", r2)):
            mlflow.log_metric(metric_name, metric_value)
        mlflow.sklearn.log_model(model, "model")

        # Keep the candidate with the strictly highest R²; ties keep the earlier one.
        if r2 > best_score:
            best_model, best_score = model, r2



In [39]:
# Display the estimator that reached the highest R² in the training loop.
best_model

In [40]:
# Display the R² of the selected estimator, measured on the test split.
best_score

0.3892477078883031

In [41]:
def training(test,train):
    """Train the candidate regressors, track them in MLflow and pickle the best one.

    Each candidate is fitted on ``train`` and scored on ``test`` inside its own
    MLflow run, where the model label, MSE, MAE, R² and the fitted estimator are
    logged. The estimator with the highest R² on ``test`` is written to
    ``models/trainedModel.pkl``.

    Parameters
    ----------
    test : pandas.DataFrame
        Evaluation data. Must contain a ``'quality'`` column (the target).
    train : pandas.DataFrame
        Fitting data. Must contain a ``'quality'`` column (the target).

    Returns
    -------
    None
        The result is the pickle file on disk plus the logged MLflow runs.

    Notes
    -----
    The winner is chosen by its R² on ``test``, so that score is an optimistic
    estimate of how the saved model will perform on unseen data.
    """
    log_message("Loading the train and test data.....")

    # Bind each frame to the role its parameter name states. Binding them
    # crosswise would fit the models on the evaluation frame.
    train_data = train
    test_data = test

    log_message("Splitting the data.....")

    X_train = train_data.drop('quality',axis=1)
    X_test = test_data.drop('quality',axis=1)
    y_train = train_data['quality']
    y_test = test_data['quality']

    log_message("Model Trainning....")

    # Candidate regressors keyed by the label logged to MLflow.
    # NOTE(review): the "SVC" label refers to an SVR (regressor) instance.
    models = {
        "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42),
        "GradientBoosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
        "LinearRegression": LinearRegression(),
        "SVC": SVR(),
        "ElasticNet":ElasticNet(alpha=0.5,l1_ratio=0.7,random_state=101)
        }

    best_model = None
    best_score = float('-inf')  # higher R² is better

    log_message("Training models...")

    for name, model in models.items():
        # One MLflow run per candidate so metrics and artifacts stay separate.
        with mlflow.start_run():
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            mse = mean_squared_error(y_test, y_pred)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)

            log_message(f"{name} -> MSE: {mse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")
            mlflow.log_param("model", name)
            mlflow.log_metric("MSE", mse)
            mlflow.log_metric("MAE", mae)
            mlflow.log_metric("R2", r2)
            mlflow.sklearn.log_model(model, "model")

            # Strict comparison: on a tie the earlier candidate is kept.
            if r2 > best_score:
                best_model = model
                best_score = r2

    log_message("Saving the Model......")

    # Create the output folder first; open(..., "wb") does not create parents.
    os.makedirs("models", exist_ok=True)
    with open("models/trainedModel.pkl", "wb") as f:
        pickle.dump(best_model,f)
    log_message("Model training is done..")