# In [None]:
import numpy as np
import pandas as pd
from scipy.stats import randint
from sklearn.tree import *
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# 1. Load dataset (semicolon-delimited CSV)
df = pd.read_csv('winequality-red.csv', sep=';')

# 2. Merge quality labels into binary classes to handle imbalance
# Class 0: Low quality (<=5), Class 1: High quality (>=6)
# np.where labels the whole column in one vectorized pass instead of calling
# a Python lambda once per row; the resulting 0/1 labels are the same.
df['quality_binary'] = np.where(df['quality'] <= 5, 0, 1)

# 3. Extract features and binary target as NumPy arrays
# Both the original score and the derived label are dropped from the
# features so the target cannot leak into X.
X = df.drop(columns=['quality', 'quality_binary']).values
y = df['quality_binary'].values

# 4. Train-test split with stratification
# 30% of the rows are held out; stratify=y keeps the class proportions the
# same in both splits, and random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# 5. Stratified 5-fold cross-validator (shuffled, with a fixed seed)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 6. Pipeline: standardize the features, then fit a random forest.
# The step names matter: 'clf' is the prefix used by the search-space keys.
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('clf', RandomForestClassifier(random_state=42)),
    ]
)

# 7. Hyperparameter search space for the 'clf' step
# Entries are either explicit lists of choices or scipy integer
# distributions; randint(low, high) draws from low..high-1 (high excluded).
param_grid = dict(
    clf__n_estimators=randint(100, 300),
    clf__max_depth=[None, 10, 20, 30],
    clf__min_samples_split=randint(2, 10),
    clf__min_samples_leaf=randint(1, 10),
    clf__class_weight=[None, 'balanced'],
)

# 8. Perform RandomizedSearchCV
# Draws n_iter=10 candidate settings from param_grid and scores each one by
# cross-validated accuracy using cv. The variable keeps the name
# `grid_search` so code that refers to it elsewhere continues to work.
grid_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid,
    n_iter=10,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,  # run candidate fits in parallel on all available cores
    verbose=1,
    random_state=42,  # fixes which candidates get sampled
)

# Only the training split is used here; the test split stays untouched
# until the final evaluation.
grid_search.fit(X_train, y_train)

# 9. Evaluate the best model on test set
# best_estimator_ is the pipeline with the best-scoring parameters found by
# the search.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# 10. Print evaluation report
print("Best Params:", grid_search.best_params_)
print(f"Best Cross-Validation Score: {grid_search.best_score_:.4f}")
print("\nTest Set Evaluation:")
print(classification_report(y_test, y_pred))
print("Test Accuracy:", accuracy_score(y_test, y_pred))

# 11. Plot confusion matrix
# Built from the predictions computed above so the plot is guaranteed to
# match the printed report and the model does not predict a second time.
ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
plt.title("Confusion Matrix - Test Set")
plt.show()