In [43]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier,ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression,RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor,ExtraTreesRegressor
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.metrics import accuracy_score,classification_report,mean_absolute_error,mean_squared_error,r2_score

In [44]:
# Load the red-wine quality table from the UCI repository.
# The file uses ';' as its field separator, so it must be passed explicitly.
WINE_QUALITY_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
df = pd.read_csv(WINE_QUALITY_URL, sep=';')

In [45]:
# Separate the column being predicted from the predictor columns.
TARGET_COLUMN = 'quality'
y = df[TARGET_COLUMN]                    # target: the 'quality' column
X = df.drop(columns=[TARGET_COLUMN])     # features: every remaining column

In [75]:
# Convert the target into a binary label: quality >= 6 becomes 1 ("good"),
# anything lower becomes 0 ("bad"). The comparison is vectorized, so no Python
# lambda is called per row; the resulting 0/1 integers are the same.
y_class = (y >= 6).astype(int)

# Split the dataset for classification: 25% of the rows are held out for
# testing, and the fixed random_state keeps the split repeatable across runs.
X_train_c, X_test_c, Y_train_c, Y_test_c = train_test_split(
    X, y_class, test_size=0.25, random_state=10
)

In [76]:
# Candidate classifiers, keyed by the display name used when results are printed.
# The tree-based models and the SVM are seeded with random_state=42 so that
# repeated runs of the notebook build identical estimators.
classification_models = {
    'Logistic Regression': LogisticRegression(),
    'Ridge Classifier': RidgeClassifier(),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=42),
    'Extra Trees': ExtraTreesClassifier(n_estimators=100, random_state=42),
    # probability=True makes SVC fit an internal cross-validated calibration
    # whose data shuffling is random; seeding it keeps the probability
    # estimates reproducible, consistent with the other seeded models.
    'SVM': SVC(kernel='rbf', probability=True, random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
}

In [77]:
# Track the classifier with the highest test-set accuracy.
# The running best starts at -inf (the same sentinel the regression loop uses)
# so that the first evaluated model is always recorded, even if its accuracy
# is exactly 0.0; starting from 0 would leave the best model as None then.
best_classification_model = None
best_classification_accuracy = -float("inf")

# Iterate over classification models and evaluate each one
print("\nClassification Models Evaluation")
for name, model in classification_models.items():
    # Standardize the features before handing them to the estimator; the
    # scaler is fitted on the training rows only, inside the pipeline.
    pipeline = Pipeline([('scaler', StandardScaler()),
                         ('classifier', model)])
    # Train the model
    pipeline.fit(X_train_c, Y_train_c)
    # Predict labels for the held-out rows
    y_pred_c = pipeline.predict(X_test_c)
    # Evaluation metrics
    accuracy = accuracy_score(Y_test_c, y_pred_c)
    report = classification_report(Y_test_c, y_pred_c)
    print(f"\n{name} Classification Performance:")
    print(f"Accuracy:{accuracy:.4f}")
    print("Classification Report:")
    print(report)
    # Strict '>' means that on a tie the earlier model in the dict is kept.
    if accuracy > best_classification_accuracy:
        best_classification_accuracy = accuracy
        best_classification_model = name


Classification Models Evaluation

Logistic Regression Classification Performance:
Accuracy:0.7325
Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.68      0.71       189
           1       0.73      0.78      0.76       211

    accuracy                           0.73       400
   macro avg       0.73      0.73      0.73       400
weighted avg       0.73      0.73      0.73       400


Ridge Classifier Classification Performance:
Accuracy:0.7325
Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.68      0.71       189
           1       0.73      0.78      0.76       211

    accuracy                           0.73       400
   macro avg       0.73      0.73      0.73       400
weighted avg       0.73      0.73      0.73       400


Random Forest Classification Performance:
Accuracy:0.8150
Classification Report:
              precision    recall  f1-score   support

  

In [78]:
# Regression split: same features, but the target is the original quality
# score rather than the binary label. 25% of the rows are held out.
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Settings shared by all four ensemble regressors.
ensemble_kwargs = dict(n_estimators=100, random_state=42)

# Candidate regressors, keyed by the display name used when reporting results.
regression_models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Random Forest Regressor': RandomForestRegressor(**ensemble_kwargs),
    'Gradient Boosting Regressor': GradientBoostingRegressor(**ensemble_kwargs),
    'AdaBoost Regressor': AdaBoostRegressor(**ensemble_kwargs),
    'Extra Trees Regressor': ExtraTreesRegressor(**ensemble_kwargs),
}



In [79]:
# Running record of the regressor with the highest test-set R^2.
best_regression_model = None
best_r2_score = -float("inf")

# Fit every candidate regressor on scaled features and report its test metrics.
print("\nRegression Models evaluation")
for name, model in regression_models.items():
    steps = [
        ('scaler', StandardScaler()),
        ('regressor', model),
    ]
    pipeline = Pipeline(steps)

    # Fit on the training rows, then predict the held-out rows.
    y_pred_r = pipeline.fit(X_train_r, y_train_r).predict(X_test_r)

    # Error and goodness-of-fit metrics on the test split.
    mae = mean_absolute_error(y_test_r, y_pred_r)
    mse = mean_squared_error(y_test_r, y_pred_r)
    r2 = r2_score(y_test_r, y_pred_r)

    print(f"\n{name} Regression Performance")
    print(f"Mean absolute error:{mae:.4f}")
    print(f"Mean Squared Error:{mse:.4f}")
    print(f"R^2 score:{r2:.4f}")

    # Keep the first model that strictly beats the current best R^2.
    if r2 > best_r2_score:
        best_r2_score, best_regression_model = r2, name


Regression Models evaluation

Linear Regression Regression Performance
Mean absolute error:0.4999
Mean Squared Error:0.3883
R^2 score:0.3723

Ridge Regression Regression Performance
Mean absolute error:0.4999
Mean Squared Error:0.3883
R^2 score:0.3723

Random Forest Regressor Regression Performance
Mean absolute error:0.4236
Mean Squared Error:0.3100
R^2 score:0.4988

Gradient Boosting Regressor Regression Performance
Mean absolute error:0.4771
Mean Squared Error:0.3586
R^2 score:0.4203

AdaBoost Regressor Regression Performance
Mean absolute error:0.5075
Mean Squared Error:0.3850
R^2 score:0.3776

Extra Trees Regressor Regression Performance
Mean absolute error:0.3859
Mean Squared Error:0.2945
R^2 score:0.5240


In [80]:
# Conclusion: report the winner of each evaluation loop with its score.
classification_summary = f"The best classification model is {best_classification_model} with an accuracy of {best_classification_accuracy:.4f}."
regression_summary = f"The best regression model is {best_regression_model} with an R^2 score of {best_r2_score:.4f}."

print("\nConclusion:")
print(classification_summary)
print(regression_summary)


Conclusion:
The best classification model is Random Forest with an accuracy of 0.8150.
The best regression model is Extra Trees Regressor with an R^2 score of 0.5240.
