# Wine Quality Modeling
This notebook is adapted to the wine quality dataset. It will try to load `data/wine.csv` or `data/wine_augmented.csv`. If neither exists, it will fall back to scikit-learn's `load_wine` as a placeholder.

In [49]:
# Libraries
import pandas as pd
import numpy as np
from pathlib import Path
from flaml.default import XGBClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, f1_score,
    precision_score, recall_score, confusion_matrix,
mean_absolute_error, mean_squared_error, r2_score
)
from sklearn.linear_model import LogisticRegression
import joblib
import importlib
import warnings

warnings.filterwarnings('ignore')
print("pandas", pd.__version__)


pandas 2.2.2


In [50]:
# Load dataset: try common paths first
path = Path("../../data/raw_data/winequality-white.csv")
df = None
if path.exists():
    df = pd.read_csv(path)
    print("Loaded from", path)


if df is None:
    # fallback to sklearn's wine dataset (different features, for demo only)
    try:
        from sklearn.datasets import load_wine
        data = load_wine(as_frame=True)
        df = data.frame.copy()
        df['quality'] = df[data.target_names[0]] if False else np.round(df['alcohol']).astype(int)  # placeholder quality
        print("Fallback: used sklearn.load_wine (synthetic 'quality')")
    except Exception as e:
        raise FileNotFoundError("No local wine CSV found and sklearn fallback failed: {}".format(e))

print('Shape:', df.shape)
df.head()

Loaded from ..\..\data\raw_data\winequality-white.csv
Shape: (4898, 12)


Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [51]:
# Quick clean & column name harmonization
# Ensure feature names use underscores (e.g., 'fixed_acidity') — if your CSV uses spaces, we try to map common names.
rename_map = {c: c.strip().lower().replace(' ', '_') for c in df.columns}
df = df.rename(columns=rename_map)
print('Columns:', df.columns.tolist())

Columns: ['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar', 'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density', 'ph', 'sulphates', 'alcohol', 'quality']


In [52]:
# Check target column
target = "quality"

if target is None:
    raise ValueError('No target column found. Expected: ' + str(target))
print('Using target:', target)

Using target: quality


In [53]:
# Basic EDA: datatypes and nulls
print(df.dtypes)
print('\nNull counts:\n', df.isna().sum())

fixed_acidity           float64
volatile_acidity        float64
citric_acid             float64
residual_sugar          float64
chlorides               float64
free_sulfur_dioxide     float64
total_sulfur_dioxide    float64
density                 float64
ph                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

Null counts:
 fixed_acidity           0
volatile_acidity        0
citric_acid             0
residual_sugar          0
chlorides               0
free_sulfur_dioxide     0
total_sulfur_dioxide    0
density                 0
ph                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64


In [54]:
# Split X / y
X = df.drop(columns=[target])
y = df[target].astype(float)
print('X shape, y shape:', X.shape, y.shape)

# Simple numeric-only filter: drop non-numeric columns
X = X.select_dtypes(include=[np.number]).copy()
print('Numeric X shape (after select_dtypes):', X.shape)

X shape, y shape: (4898, 11) (4898,)
Numeric X shape (after select_dtypes): (4898, 11)


In [55]:
# Train / test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape)

(3918, 11) (980, 11)


In [56]:

# Ensure target is integer classes
y_train_cls = y_train.round().astype(int)
y_test_cls = y_test.round().astype(int)

In [57]:

# Optionally map classes to a consecutive 0..K-1 if needed
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train_cls)
y_test_enc = le.transform(y_test_cls)

In [58]:






models = {
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1),
    "LogisticRegression": LogisticRegression(max_iter=2000, solver="liblinear"),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=200, random_state=42),
    "XGB_LimitDepth": XGBClassifier(
            n_estimators=300,
            max_depth=6,              # "limitdepth" means shallower trees
            learning_rate=0.05,
            subsample=0.9,
            colsample_bytree=0.9,
            gamma=0.0,
            reg_lambda=1.0,
            min_child_weight=1,
            use_label_encoder=False,
            eval_metric="logloss",    # avoids warning
            random_state=42,
            n_jobs=-1
        ),
}

# Try to add CatBoost if available
try:
    catboost_spec = importlib.util.find_spec("catboost")
    if catboost_spec is not None:
        from catboost import CatBoostClassifier
        models["CatBoost"] = CatBoostClassifier(iterations=500, learning_rate=0.05, verbose=False, random_state=42)
    else:
        print("CatBoost not installed — skipping. Install it with `pip install catboost` to include it.")
except Exception as e:
    print("CatBoost import error (skipping):", e)

results = []
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, mdl in models.items():
    print(f"Training & evaluating: {name}")
    # Quick cross-val on training set (balanced accuracy)
    try:
        cv_bal_acc = cross_val_score(mdl, X_train, y_train_enc, cv=cv, scoring="balanced_accuracy", n_jobs=-1)
    except Exception as e:
        cv_bal_acc = np.array([np.nan])
    # Fit on full training set
    mdl.fit(X_train, y_train_enc)
    preds = mdl.predict(X_test)

    acc = accuracy_score(y_test_enc, preds)
    bal_acc = balanced_accuracy_score(y_test_enc, preds)
    f1 = f1_score(y_test_enc, preds, average="macro", zero_division=0)
    prec = precision_score(y_test_enc, preds, average="macro", zero_division=0)
    rec = recall_score(y_test_enc, preds, average="macro", zero_division=0)

    results.append({
        "model": name,
        "cv_bal_acc_mean": float(np.nanmean(cv_bal_acc)),
        "cv_bal_acc_std": float(np.nanstd(cv_bal_acc)),
        "accuracy": float(acc),
        "balanced_accuracy": float(bal_acc),
        "f1_macro": float(f1),
        "precision_macro": float(prec),
        "recall_macro": float(rec),
        "n_classes": len(np.unique(y_train_enc))
    })


Training & evaluating: RandomForest
Training & evaluating: LogisticRegression
Training & evaluating: GradientBoosting
Training & evaluating: XGB_LimitDepth
Training & evaluating: CatBoost


In [59]:
df_results = pd.DataFrame(results).sort_values("f1_macro", ascending=False).reset_index(drop=True)
print("\n=== Model comparison ===")
display(df_results)


=== Model comparison ===


Unnamed: 0,model,cv_bal_acc_mean,cv_bal_acc_std,accuracy,balanced_accuracy,f1_macro,precision_macro,recall_macro,n_classes
0,RandomForest,0.348874,0.018495,0.697959,0.46366,0.505898,0.599465,0.46366,7
1,XGB_LimitDepth,0.350886,0.015509,0.655102,0.453622,0.496654,0.586955,0.453622,7
2,GradientBoosting,0.329248,0.023445,0.604082,0.443106,0.489313,0.586301,0.443106,7
3,CatBoost,0.292765,0.019475,0.595918,0.358989,0.396101,0.528206,0.358989,7
4,LogisticRegression,0.198152,0.005776,0.516327,0.228109,0.210319,0.279256,0.228109,7


In [60]:
# Pick best model by f1_macro and show confusion matrix
best_row = df_results.iloc[0]
best_model_name = best_row["model"]
best_model = models[best_model_name]
print(f"\nBest model: {best_model_name} (F1-macro={best_row['f1_macro']:.4f})")

preds_best = best_model.predict(X_test)
labels_full = np.arange(len(le.classes_))   # e.g. [0,1,2,3,4,5,6]

cm = confusion_matrix(y_test_enc, preds_best, labels=labels_full)
cm_df = pd.DataFrame(cm, index=le.classes_, columns=le.classes_)
display(cm_df)
print("\nConfusion matrix (rows=true, cols=pred):")
display(cm_df)


Best model: RandomForest (F1-macro=0.5059)


Unnamed: 0,3,4,5,6,7,8,9
3,0,0,2,3,0,0,0
4,0,6,11,8,0,0,0
5,0,4,201,84,2,0,0
6,0,0,61,348,23,0,0
7,0,0,4,72,113,3,0
8,0,0,1,10,8,16,0
9,0,0,0,0,0,0,0



Confusion matrix (rows=true, cols=pred):


Unnamed: 0,3,4,5,6,7,8,9
3,0,0,2,3,0,0,0
4,0,6,11,8,0,0,0
5,0,4,201,84,2,0,0
6,0,0,61,348,23,0,0
7,0,0,4,72,113,3,0
8,0,0,1,10,8,16,0
9,0,0,0,0,0,0,0


In [61]:
# Save best model
out_path = Path("models") / "best_wine_model.joblib"
out_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump({"model": best_model, "label_encoder": le}, out_path)
print("Saved best model + label encoder to", out_path)

Saved best model + label encoder to models\best_wine_model.joblib


In [63]:
# Example: predict on one sample (first test row)
sample = X_test.iloc[0].to_dict()
print('Sample:', sample)
pred = model.predict(pd.DataFrame([sample]))[0]
print('Predicted quality:', pred)

Sample: {'fixed_acidity': 6.0, 'volatile_acidity': 0.29, 'citric_acid': 0.41, 'residual_sugar': 10.8, 'chlorides': 0.048, 'free_sulfur_dioxide': 55.0, 'total_sulfur_dioxide': 149.0, 'density': 0.9937, 'ph': 3.09, 'sulphates': 0.59, 'alcohol': 10.9666666666667}
Predicted quality: 6.85
