In [1]:
import sys
from pathlib import Path
import pandas as pd

# Anchor directory two levels above the notebook's working directory
# (presumably the project root -- it is expected to contain src/ and data/).
ROOT = Path.cwd().parents[1]

# Put <ROOT>/src on the import path so later cells can import local modules.
# The membership check keeps repeated runs of this cell from appending the
# same entry to sys.path over and over.
SRC_DIR = str(ROOT / "src")
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

# Read the dataset through ROOT instead of a second hand-written "../../"
# string, so the source and data locations share a single anchor.
df = pd.read_csv(ROOT / "data" / "wineQT.csv")

In [11]:
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
from gdLinear import GDLinearReg

# Single-feature experiment: predict "quality" from "alcohol".
X = df[["alcohol"]].values
y = df["quality"].values

# make 5 splits with 20% test data
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

results = []  # one metrics record per split
preds = []    # one record per held-out sample, over all splits

for fold, (train_idx, test_idx) in enumerate(cv.split(X), start=1):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # standardize X (important for gradient descent)
    # The mean/std are taken from the training rows only and then reused for
    # the test rows, so the held-out data never influences the preprocessing.
    mu = X_train.mean(axis=0)
    sigma = X_train.std(axis=0)
    X_train = (X_train - mu) / sigma
    X_test = (X_test - mu) / sigma

    # train model
    model = GDLinearReg(lr=0.01, n_iter=5000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # evaluate
    mse = mean_squared_error(y_test, y_pred)  # Mean Squared Error
    rmse = np.sqrt(mse)  # Root Mean Squared Error
    # R^2 score: share of the variance explained by the model
    # (1 is perfect, 0 is no better than the mean, negative is worse than the mean)
    r2 = r2_score(y_test, y_pred)

    results.append({"fold": fold, "MSE": mse, "RMSE": rmse, "R2": r2})
    preds.extend(
        {"fold": fold, "y_test": float(t), "y_pred": float(p)}
        for t, p in zip(y_test, y_pred)
    )


# add to dataframe for overview
results_df = pd.DataFrame(results)
preds_df = pd.DataFrame(preds, columns=["fold", "y_test", "y_pred"])
print(results_df)
# NOTE: "fold" is numeric, so it also appears in the two summaries below;
# that row only aggregates the fold numbers and carries no meaning.
print("Mean:\n", results_df.mean(numeric_only=True))  # mean of each column
print("Variance:\n", results_df.var(numeric_only=True))

# Create the output directory first so the CSV writes cannot fail merely
# because the folder is missing.
metrics_dir = Path("../outputs/metrics")
metrics_dir.mkdir(parents=True, exist_ok=True)
results_df.to_csv(metrics_dir / "alcoholGDResults.csv", index=False)
preds_df.to_csv(metrics_dir / "alcoholGDPreds.csv", index=False)

   fold       MSE      RMSE        R2
0     1  0.417547  0.646179  0.249654
1     2  0.550013  0.741628  0.165416
2     3  0.633242  0.795765  0.101810
3     4  0.506718  0.711841  0.142979
4     5  0.564604  0.751401  0.309924
Mean:
 fold    3.000000
MSE     0.534425
RMSE    0.729363
R2      0.193957
dtype: float64
Variance:
 fold    2.500000
MSE     0.006337
RMSE    0.003068
R2      0.007114
dtype: float64


In [7]:
# Single-feature experiment: predict "quality" from "chlorides".
X = df[["chlorides"]].values
y = df["quality"].values

# make 5 splits with 20% test data
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

results = []  # one metrics record per split

for fold, (train_idx, test_idx) in enumerate(cv.split(X), start=1):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # standardize X (important for gradient descent)
    # The mean/std are taken from the training rows only and then reused for
    # the test rows, so the held-out data never influences the preprocessing.
    mu = X_train.mean(axis=0)
    sigma = X_train.std(axis=0)
    X_train = (X_train - mu) / sigma
    X_test = (X_test - mu) / sigma

    # train model
    model = GDLinearReg(lr=0.01, n_iter=5000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # evaluate
    mse = mean_squared_error(y_test, y_pred)  # Mean Squared Error
    rmse = np.sqrt(mse)  # Root Mean Squared Error
    # R^2 score: share of the variance explained by the model
    # (1 is perfect, 0 is no better than the mean, negative is worse than the mean)
    r2 = r2_score(y_test, y_pred)

    results.append({"fold": fold, "MSE": mse, "RMSE": rmse, "R2": r2})

# add to dataframe for overview
results_df = pd.DataFrame(results)
print(results_df)
# NOTE: "fold" is numeric, so it also appears in the two summaries below;
# that row only aggregates the fold numbers and carries no meaning.
print("Mean:\n", results_df.mean(numeric_only=True))  # mean of each column
print("Variance:\n", results_df.var(numeric_only=True))

   fold       MSE      RMSE        R2
0     1  0.559137  0.747755 -0.004787
1     2  0.641522  0.800951  0.026561
2     3  0.694106  0.833130  0.015482
3     4  0.584760  0.764696  0.010985
4     5  0.795159  0.891717  0.028133
Mean:
 fold    3.000000
MSE     0.654937
RMSE    0.807650
R2      0.015275
dtype: float64
Variance:
 fold    2.500000
MSE     0.008870
RMSE    0.003298
R2      0.000178
dtype: float64


In [None]:
#Alcohol as predictor:
#The results show that using alcohol alone as a predictor gives a mean MSE of about 0.53, RMSE of 0.73, and an average R² of around 0.19 across the five folds. 
#This indicates that alcohol explains roughly 19% of the variation in wine quality, which makes it a moderately useful predictor. 
#The RMSE looks small next to the 0–10 scale of quality, but with R² ≈ 0.19 it is only about 10% below the error of always predicting the mean quality, so the model captures some signal while still leaving substantial error unexplained.

#Chlorides as predictor:
#When using chlorides, the mean MSE increases to about 0.65, RMSE to 0.81, and the average R² drops to roughly 0.02 (0.015).
#This means chlorides explain almost none of the variation in quality. 
#The model essentially predicts close to the average quality regardless of chloride level, making chlorides a very weak predictor.

In [None]:
#The models underfit. Both alcohol and chlorides as single predictors achieve very low mean R² values (0.19 for alcohol and 0.02 for chlorides). 
#This shows that the models capture only a small fraction of the variation in wine quality. 
#Since wine quality depends on many different physicochemical features, using only one variable is too simplistic and leads to underfitting: the model is too limited to represent the true complexity of the data.

In [None]:
#Mean and variance of the metrics across folds:
#Alcohol:
#Mean MSE: ~0.53, RMSE: ~0.73, R²: ~0.19
#The variances of MSE, RMSE, and R² are all low (about 0.006, 0.003, and 0.007), indicating consistent performance across folds.
#Chlorides:
#Mean MSE: ~0.65, RMSE: ~0.81, R²: ~0.02
#The variances of MSE, RMSE, and R² are also low here (about 0.009, 0.003, and 0.0002), indicating consistent performance across folds.

#Both features yield stable results across folds, but alcohol consistently performs much better than chlorides. 
#However, the relatively low R² values in both cases indicate that neither feature alone is sufficient for accurate prediction.