# Intro to Machine Learning Code Assignment 3
# Duncan Conly ID # 010958878

# Classification With Random Forest and Boosting Methods on wine.csv

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import mean_squared_error, r2_score, f1_score, accuracy_score, precision_score, recall_score
from tabulate import tabulate

# Load the red-wine quality workbook (path is relative to the working directory)
# and discard the "Unnamed: 0" column when it exists; errors="ignore" makes the
# drop a no-op if the column is absent.
file_path = "Data/winequality-red.xlsx"
df = pd.read_excel(file_path).drop(columns=["Unnamed: 0"], errors="ignore")


def quality_to_class(q):
    """Bucket a raw quality score into a class label.

    Returns 0 when q <= 5, 1 when q == 6, and 2 for every other value.
    """
    if q <= 5:
        return 0
    if q == 6:
        return 1
    return 2


# Derive the three-class target (0, 1, 2) from the raw `quality` score.
df["quality_class"] = df["quality"].apply(quality_to_class)

# Preview the first rows, including the new label column.
df.head()




Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,quality_class
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,1
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0


In [3]:
# Target: the derived three-class label.
y = df["quality_class"]

# Features: every remaining column. The raw `quality` score is removed along with
# the label because `quality_class` is computed directly from it.
X = df.drop(columns=["quality", "quality_class"])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



# Define the Models and Train

In [13]:
# Candidate estimators keyed by display name; both are seeded for repeatability.
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
}

# Hyperparameter search space for each estimator (keys mirror `models`).
param_grids = {
    "Random Forest": {
        "n_estimators": [50, 100, 200],
        "max_depth": [5, 10, None],
        "max_features": ["sqrt", "log2"],
    },
    "Gradient Boosting": {
        "n_estimators": [50, 100, 200],
        "learning_rate": [0.01, 0.1, 0.2],
        "max_depth": [3, 5, 10, 20],
    },
}

# Per-model outputs of the search: refit best estimator, best cross-validated
# mean squared error, and the winning hyperparameter combination.
best_models, cv_scores, best_params = {}, {}, {}

# Exhaustive 5-fold grid search for each estimator, using all available cores.
# NOTE(review): candidates are ranked by negated MSE computed on the class labels,
# while the test cell reports accuracy/F1 -- confirm this is the intended
# selection metric for a classification task.
for name, model in models.items():
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[name],
        scoring='neg_mean_squared_error',
        cv=5,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    best_models[name] = grid_search.best_estimator_
    # best_score_ is the negated MSE, so flip the sign to store a positive error.
    cv_scores[name] = -grid_search.best_score_
    best_params[name] = grid_search.best_params_


# Create the Tables and make data readable

In [None]:


# Evaluate each tuned model on the held-out test set.
# Precision, recall and F1 use average='weighted', i.e. per-class scores are
# averaged with weights proportional to each class's support.
metric_names = ["Accuracy", "F1 Score", "Precision", "Recall"]

test_results = {}
for name, model in best_models.items():
    y_pred = model.predict(X_test)
    test_results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred, average='weighted'),
        "Precision": precision_score(y_test, y_pred, average='weighted'),
        "Recall": recall_score(y_test, y_pred, average='weighted'),
    }

# Cross-validation results: one row per model with its best CV mean squared error.
cv_results_table = tabulate(
    [[name, mse] for name, mse in cv_scores.items()],
    headers=["Model", "Mean Squared Error"],
    tablefmt="grid",
)
# The header and table are concatenated into a single string: passing them as
# separate print() arguments inserts the default " " separator after the newline,
# which shifts the table's top border one column to the right.
print("Mean Squared Error Results:\n" + cv_results_table)

# Test-set performance: one row per model, columns driven by `metric_names` so the
# headers and the values always stay in the same order.
test_results_table = tabulate(
    [
        [name] + [results[metric] for metric in metric_names]
        for name, results in test_results.items()
    ],
    headers=["Model"] + metric_names,
    tablefmt="grid",
)
print("\nTest Set Performance:\n" + test_results_table)

# Best hyperparameters: one row per model with the winning parameter dict.
best_params_table = tabulate(
    [[name, params] for name, params in best_params.items()],
    headers=["Model", "Best Hyperparameters"],
    tablefmt="grid",
)
print("\nBest Hyperparameters for Each Model:\n" + best_params_table)





Mean Squared Error Results:
 +-------------------+----------------------+
| Model             |   Mean Squared Error |
| Random Forest     |             0.333811 |
+-------------------+----------------------+
| Gradient Boosting |             0.354151 |
+-------------------+----------------------+

Test Set Performance:
 +-------------------+------------+------------+-------------+----------+
| Model             |   Accuracy |   F1 Score |   Precision |   Recall |
| Random Forest     |     0.7    |   0.698608 |    0.704025 |   0.7    |
+-------------------+------------+------------+-------------+----------+
| Gradient Boosting |     0.6875 |   0.686912 |    0.687315 |   0.6875 |
+-------------------+------------+------------+-------------+----------+

Best Hyperparameters for Each Model:
 +-------------------+----------------------------------------------------------------+
| Model             | Best Hyperparameters                                           |
| Random Forest     | {'ma