In [12]:
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score, roc_auc_score
from scipy.stats import uniform, randint
from sklearn.model_selection import RandomizedSearchCV

# Load the red-wine quality table from disk
wine_data = pd.read_csv(r'C:\Users\Lenovo\Downloads\winequality-red.csv')

# Derived columns are stored on wine_data itself. Only 'quality_binary' is used
# below (as the classification target); the three engineered columns are NOT
# part of X, which is restricted to the 11 raw measurements further down.
wine_data['quality_binary'] = wine_data['quality'].ge(7).astype(int)  # 1 when quality >= 7, else 0
wine_data['alcohol_sulphates_interaction'] = wine_data['alcohol'].mul(wine_data['sulphates'])
wine_data['total_acidity'] = wine_data['fixed acidity'].add(wine_data['volatile acidity'])
# Denominator is shifted by 1 (presumably to guard against division by zero).
wine_data['sulfur_ratio'] = wine_data['free sulfur dioxide'].div(
    wine_data['total sulfur dioxide'].add(1)
)

# Targets: the raw score for regression, the thresholded flag for classification
y_classification = wine_data['quality_binary']
y_regression = wine_data['quality']

# Model inputs: drop both targets, then keep the 11 raw feature columns in this order
X = wine_data.drop(columns=['quality', 'quality_binary'])[[
    'alcohol', 'sulphates', 'total sulfur dioxide', 'fixed acidity',
    'volatile acidity', 'citric acid', 'residual sugar', 'chlorides',
    'free sulfur dioxide', 'density', 'pH',
]]

# Train-Test Splits (80/20). Both calls use the same random_state on the same X.
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X,
    y_regression,
    test_size=0.2,
    random_state=42,
)
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X,
    y_classification,
    test_size=0.2,
    random_state=42,
)

# ========================
# Create Full Pipeline Function
# ========================
def create_full_pipeline(model, is_classification=True):
    """
    Build a preprocessing + model pipeline that takes the 11 raw features as input.

    Steps, in order:
      1. ``scaler``            - StandardScaler on the raw columns.
      2. ``poly``              - pairwise interaction terms (degree 2,
                                 interaction_only, no bias column).
      3. ``feature_selection`` - SelectKBest keeping the 10 highest-scoring columns.
      4. ``model``             - the estimator passed in.

    Parameters
    ----------
    model : estimator
        Regressor or classifier used as the final step. The instance is placed
        in the pipeline as given (it is not copied here).
    is_classification : bool, default True
        Chooses the univariate scoring function for feature selection:
        ``f_classif`` for a class-label target, ``f_regression`` for a
        continuous target.

    Returns
    -------
    Pipeline
        The unfitted pipeline.
    """
    if is_classification:
        score_func = f_classif
    else:
        # f_classif treats every distinct target value as a class label, which
        # is the wrong test for a regression target. Imported locally because
        # the file-level imports only bring in f_classif.
        from sklearn.feature_selection import f_regression
        score_func = f_regression

    return Pipeline([
        ("scaler", StandardScaler()),  # Step 1: Scale features
        ("poly", PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),  # Step 2: Add interaction features
        ("feature_selection", SelectKBest(score_func, k=10)),  # Step 3: Select top features
        ("model", model),  # Step 4: Add the regression or classification model
    ])

# ========================
# Define Models and Parameter Grids
# ========================
# Search spaces use scipy frozen distributions:
#   randint(low, high)  -> integers from low up to high - 1
#   uniform(loc, scale) -> floats in [loc, loc + scale]
# so learning_rate below is drawn from [0.05, 0.25], not [0.05, 0.2].

# Regression Models
regression_models = {
    "Gradient Boosting Regressor": dict(
        model=GradientBoostingRegressor(random_state=42),
        param_distributions=dict(
            n_estimators=randint(50, 200),     # 50..199 trees
            learning_rate=uniform(0.05, 0.2),  # 0.05..0.25
            max_depth=randint(3, 10),          # depth 3..9
        ),
    ),
}

# Classification Models
classification_models = {
    "Gradient Boosting Classifier": dict(
        model=GradientBoostingClassifier(random_state=42),
        param_distributions=dict(
            n_estimators=randint(50, 200),     # 50..199 trees
            learning_rate=uniform(0.05, 0.2),  # 0.05..0.25
            max_depth=randint(3, 10),          # depth 3..9
        ),
    ),
}

# ========================
# Randomized Search Tuning
# ========================
def random_search_tune(model_dict, X_train, y_train, is_classification):
    """Run a randomized hyper-parameter search for every entry in ``model_dict``.

    Parameters
    ----------
    model_dict : dict
        Maps a display name to a dict with the keys ``"model"`` (the estimator
        to tune) and ``"param_distributions"`` (its search space).
    X_train, y_train
        Training data handed to ``RandomizedSearchCV.fit``.
    is_classification : bool
        Scores candidates with ``"roc_auc"`` when true, ``"r2"`` otherwise.

    Returns
    -------
    dict
        Maps each display name to a dict with ``"best_params"``,
        ``"best_score"`` and ``"best_model"`` taken from the finished search.
        Empty when ``model_dict`` is empty.
    """
    # The metric depends only on the task type, so it is chosen once up front.
    metric = "roc_auc" if is_classification else "r2"

    tuned = {}
    for name, spec in model_dict.items():
        print(f"\nTuning {name}...")

        searcher = RandomizedSearchCV(
            estimator=spec["model"],
            param_distributions=spec["param_distributions"],
            n_iter=20,  # cap on sampled candidates to keep runtime down
            cv=3,
            scoring=metric,
            random_state=42,
            n_jobs=-1,
        )
        searcher.fit(X_train, y_train)

        tuned[name] = dict(
            best_params=searcher.best_params_,
            best_score=searcher.best_score_,
            best_model=searcher.best_estimator_,
        )
        print(f"Best Score for {name}: {searcher.best_score_:.4f}")
        print(f"Best Parameters for {name}: {searcher.best_params_}")

    return tuned

# ========================
# Perform Tuning
# ========================
# Tune the regressor on the regression split (candidates scored with R^2)
regression_results = random_search_tune(
    model_dict=regression_models,
    X_train=X_train_reg,
    y_train=y_train_reg,
    is_classification=False,
)

# Tune the classifier on the classification split (candidates scored with ROC AUC)
classification_results = random_search_tune(
    model_dict=classification_models,
    X_train=X_train_clf,
    y_train=y_train_clf,
    is_classification=True,
)

# ========================
# Evaluate Models
# ========================
def evaluate_model(best_model, X_test, y_test, is_classification):
    """Score a fitted estimator on held-out data, print the metrics and return them.

    Parameters
    ----------
    best_model : fitted estimator
        Must provide ``predict``; ``predict_proba`` is also required when
        ``is_classification`` is true.
    X_test, y_test
        Held-out features and targets.
    is_classification : bool
        Selects the metric set.

    Returns
    -------
    dict
        ``{"Accuracy", "ROC AUC"}`` for classification, or
        ``{"RMSE", "MAE", "R^2"}`` for regression.
    """
    predictions = best_model.predict(X_test)

    if not is_classification:
        # Regression: RMSE is the square root of the mean squared error.
        squared_error = mean_squared_error(y_test, predictions)
        rmse = np.sqrt(squared_error)
        mae = mean_absolute_error(y_test, predictions)
        r2 = r2_score(y_test, predictions)
        print(f"RMSE: {rmse:.4f}, MAE: {mae:.4f}, R^2: {r2:.4f}")
        return {"RMSE": rmse, "MAE": mae, "R^2": r2}

    # Classification: accuracy uses the hard labels, ROC AUC uses the second
    # column of predict_proba.
    accuracy = accuracy_score(y_test, predictions)
    class_one_scores = best_model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, class_one_scores)
    print(f"Accuracy: {accuracy:.4f}, ROC AUC: {auc:.4f}")
    return {"Accuracy": accuracy, "ROC AUC": auc}

# Report held-out metrics for the tuned regressor
print("\nEvaluating Best Regression Model...")
evaluate_model(
    best_model=regression_results["Gradient Boosting Regressor"]["best_model"],
    X_test=X_test_reg,
    y_test=y_test_reg,
    is_classification=False,
)

# Report held-out metrics for the tuned classifier
print("\nEvaluating Best Classification Model...")
evaluate_model(
    best_model=classification_results["Gradient Boosting Classifier"]["best_model"],
    X_test=X_test_clf,
    y_test=y_test_clf,
    is_classification=True,
)

# ========================
# Save the Best Pipelines
# ========================
# clone() is only needed in this section, so it is imported here.
from sklearn.base import clone

# NOTE(review): the hyper-parameters were searched, and the metrics above were
# computed, on the bare estimators fed the 11 raw columns. The pipelines saved
# below add scaling, interaction terms and feature selection in front of an
# estimator with those hyper-parameters, so they are different models whose
# held-out performance has not been measured here.

# Best Regression Model
best_reg_model = regression_results["Gradient Boosting Regressor"]["best_model"]
# Pipeline.fit fits its steps in place. Wrapping an unfitted copy keeps the
# tuned estimator in regression_results (fitted on the raw columns) from being
# overwritten by a fit on the 10 selected columns.
best_reg_pipeline = create_full_pipeline(clone(best_reg_model), is_classification=False)
best_reg_pipeline.fit(X_train_reg, y_train_reg)  # Fit the pipeline before saving
regression_model_path = "C:/Users/Lenovo/Downloads/best_wine_quality_regression_pipeline.pkl"
joblib.dump(best_reg_pipeline, regression_model_path)
print(f"Best Regression pipeline saved to {regression_model_path}")

# Best Classification Model
best_clf_model = classification_results["Gradient Boosting Classifier"]["best_model"]
# Same reasoning as above: fit a copy, leave the tuned classifier untouched.
best_clf_pipeline = create_full_pipeline(clone(best_clf_model), is_classification=True)
best_clf_pipeline.fit(X_train_clf, y_train_clf)  # Fit the pipeline before saving
classification_model_path = "C:/Users/Lenovo/Downloads/best_wine_quality_classification_pipeline.pkl"
joblib.dump(best_clf_pipeline, classification_model_path)
print(f"Best Classification pipeline saved to {classification_model_path}")



Tuning Gradient Boosting Regressor...
Best Score for Gradient Boosting Regressor: 0.3910
Best Parameters for Gradient Boosting Regressor: {'learning_rate': np.float64(0.07857336358438816), 'max_depth': 5, 'n_estimators': 199}

Tuning Gradient Boosting Classifier...
Best Score for Gradient Boosting Classifier: 0.8904
Best Parameters for Gradient Boosting Classifier: {'learning_rate': np.float64(0.061282315805420054), 'max_depth': 6, 'n_estimators': 87}

Evaluating Best Regression Model...
RMSE: 0.5867, MAE: 0.4474, R^2: 0.4733

Evaluating Best Classification Model...
Accuracy: 0.8969, ROC AUC: 0.9423
Best Regression pipeline saved to C:/Users/Lenovo/Downloads/best_wine_quality_regression_pipeline.pkl
Best Classification pipeline saved to C:/Users/Lenovo/Downloads/best_wine_quality_classification_pipeline.pkl
