In [None]:
import os

# Remember the current working directory and echo it so the reader can see
# where relative paths in the rest of the notebook will resolve.
PROJECT_ROOT = os.getcwd()
print(f"Running from: {PROJECT_ROOT}")

In [10]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [2]:
# Load the CSV; the path is relative, so it resolves against the working directory.
df = pd.read_csv("dataset/winequality-red.csv")

# Quick sanity summary: dimensions, column labels, then null count per column.
null_counts = df.isnull().sum()
for summary in (df.shape, df.columns, null_counts):
    print(summary)


(1599, 12)
Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64


In [3]:
# Name the modelling target, then list how many nulls each column holds.
# A single print with a newline separator emits the same text as three prints.
print(
    "\nTarget variable: quality",
    "\nMissing values per column:",
    df.isnull().sum(),
    sep="\n",
)


Target variable: quality

Missing values per column:
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64


In [4]:
# Separate the "quality" target column from the remaining predictor columns.
y = df["quality"]
X = df.drop(columns=["quality"])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_80_20 = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_80_20

print("Train/Test split completed.")
print("Training samples:", len(X_train))
print("Testing samples:", len(X_test))


Train/Test split completed.
Training samples: 1279
Testing samples: 320


In [5]:
# Split the same X / y again, this time holding out 30% of the rows.
X_train_70, X_test_30, y_train_70, y_test_30 = train_test_split(
    X, y, random_state=42, test_size=0.3
)

# Hyper-parameters for this experiment, gathered in one place.
rf_70_params = {"n_estimators": 100, "max_depth": 15, "random_state": 42}

# fit() hands back the estimator itself, so construction and training can be chained.
rf_70 = RandomForestRegressor(**rf_70_params).fit(X_train_70, y_train_70)

# Evaluate on the held-out 30%.
pred_70 = rf_70.predict(X_test_30)
r2_70 = r2_score(y_test_30, pred_70)
mse_70 = mean_squared_error(y_test_30, pred_70)

print("EXP-05: Random Forest (100 trees, depth=15) with 70/30 split")
print("MSE:", mse_70)
print("R² Score:", r2_70)


EXP-05: Random Forest (100 trees, depth=15) with 70/30 split
MSE: 0.34852263619253543
R² Score: 0.45030007476255884


In [None]:
import os, json, joblib

# Everything below is written beneath the current working directory.
BASE_DIR = os.getcwd()
print("Saving outputs to:", BASE_DIR)

# Create required Jenkins folder (a no-op when it already exists).
artifacts_dir = os.path.join(BASE_DIR, "app/artifacts")
os.makedirs(artifacts_dir, exist_ok=True)

# The R² of the 70/30 forest is what gets reported under the "accuracy" key;
# float() turns it into a plain Python float before it is serialised.
accuracy = float(r2_70)
metrics = {"accuracy": accuracy}

# Save metrics in required location
metrics_path = os.path.join(artifacts_dir, "metrics.json")
with open(metrics_path, "w") as f:
    json.dump(metrics, f, indent=4)

# Persist the fitted forest next to the metrics file.
model_path = os.path.join(artifacts_dir, "model.pkl")
joblib.dump(rf_70, model_path)

print("Model and metrics saved successfully!")


Model and metrics saved.
