# Importing Needed things for this worksheet - **8**

In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split , GridSearchCV , RandomizedSearchCV

from sklearn.tree import DecisionTreeClassifier , DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier , RandomForestRegressor

from sklearn.metrics import f1_score , mean_squared_error , r2_score



# **1. Implementing  Classification Models:**
* To  Train a Decision Tree Classifier and a Random Forest Classifier using scikit-learn.
*  Comparing the models based on their F1 scores.

In [10]:
# Loading Wine dataset
data = load_wine()
X = data.data
y = data.target


# Train-test split (80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("\nTrain shape and Test shape:\n")
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

# Train Decision Tree Classifier
dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf.fit(X_train, y_train)

y_pred_dt = dt_clf.predict(X_test)
f1_dt = f1_score(y_test, y_pred_dt, average="macro")

print("\nDecision Tree F1 Score (Macro):\n", round(f1_dt, 4))

#Train Random Forest Classifier
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train)

y_pred_rf = rf_clf.predict(X_test)
f1_rf = f1_score(y_test, y_pred_rf, average="macro")
print("\nRandom Forest F1 Score (Macro):\n", round(f1_rf, 4))

#Comapring results
print("\n---F1 Score Comparison (Macro)---")
print("\nDecision Tree: ", round(f1_dt, 4))
print("\nRandom Forest: ", round(f1_rf, 4))


Train shape and Test shape:

Train shape: (142, 13)
Test shape: (36, 13)

Decision Tree F1 Score (Macro):
 0.9457

Random Forest F1 Score (Macro):
 1.0

---F1 Score Comparison (Macro)---

Decision Tree:  0.9457

Random Forest:  1.0


# **2. Hyperparameter Tuning:**
* Identifing three hyperparameters of the Random Forest Classifier.
* Performing hyperparameter tuning using GridSearchCV to optimize these parameters.
* Taking hints from the scikit-learn documentation to guide the implementation.

In [14]:
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5, 10]
}

grid_search = GridSearchCV(
    estimator = RandomForestClassifier(random_state=42),
    param_grid = param_grid,
    scoring="f1_macro",
    cv=5,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

print("\nBest Parameters:", grid_search.best_params_)
print("\nBest CV F1 (Macro):", round(grid_search.best_score_, 4))

best_rf_clf = grid_search.best_estimator_
y_pred_best = best_rf_clf.predict(X_test)

f1_best = f1_score(y_test, y_pred_best, average="macro")
print("\nTuned Random Forest Test F1 (Macro):", round(f1_best, 4))


Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}

Best CV F1 (Macro): 0.9863

Tuned Random Forest Test F1 (Macro): 1.0


# **3. Implement Regression Model:**
* Training a Decision Tree Regressor and a Random Forest Regressor using scikit-learn.
* Identifing three parameters for Random Forest Regressio and Perform hyperparameter tuning using
RandomSearchCV to optimize these parameters.

In [22]:
# Creating Regression Dataset ( Predict Alcohol)
wine = load_wine()
X_full = pd.DataFrame(wine.data, columns=wine.feature_names)

# Target for regression (continuous)
y_reg = X_full["alcohol"].values

# Remove target column from features
X_reg = X_full.drop(columns=["alcohol"]).values

X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

print("\nRegression Train:", X_train_r.shape, "Test:", X_test_r.shape)

# Train Decision Tree Regressor
dt_reg = DecisionTreeRegressor(random_state=42)
dt_reg.fit(X_train_r, y_train_r)

pred_dt_r = dt_reg.predict(X_test_r)

rmse_dt = np.sqrt(mean_squared_error(y_test_r, pred_dt_r))
r2_dt = r2_score(y_test, pred_dt_r)

print("\n---Decision Tree Regressor---")
print("\nDecision Tree RMSE:", round(rmse_dt, 4))
print("\nDecision Tree R2:", round(r2_dt, 4))

# Train Random Forest Regressor
rf_reg = RandomForestRegressor(random_state=42)
rf_reg.fit(X_train_r, y_train_r)

pred_rf_r = rf_reg.predict(X_test_r)

rmse_rf = np.sqrt(mean_squared_error(y_test_r, pred_rf_r))
r2_rf = r2_score(y_test, pred_rf_r)

print("\n---Random Forest Regressor---")
print("\nRandom Forest RMSE:", round(rmse_rf, 4))
print("\nRandom Forest R2:", round(r2_rf, 4))





Regression Train: (142, 12) Test: (36, 12)

---Decision Tree Regressor---

Decision Tree RMSE: 0.5585

Decision Tree R2: -241.459

---Random Forest Regressor---

Random Forest RMSE: 0.3928

Random Forest R2: -242.2254


In [23]:
param_dist = {
    "n_estimators": [50, 100, 200, 300, 500],
    "max_depth": [None, 5, 10, 20, 30],
    "max_features": ["sqrt", "log2", None],
    "min_samples_split": [2, 5, 10],
}

random_search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_distributions=param_dist,
    n_iter=20,
    scoring="neg_mean_squared_error",
    cv=5,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train_r, y_train_r)

print("Best Parameters:", random_search.best_params_)
print("Best CV Score (neg MSE):", round(random_search.best_score_, 4))

best_rf_reg = random_search.best_estimator_
pred_best_r = best_rf_reg.predict(X_test_r)

rmse_best = np.sqrt(mean_squared_error(y_test_r, pred_best_r))
r2_best = r2_score(y_test_r, pred_best_r)

print("Tuned RF RMSE:", round(rmse_best, 4))
print("Tuned RF R2  :", round(r2_best, 4))


Best Parameters: {'n_estimators': 50, 'min_samples_split': 10, 'max_features': 'log2', 'max_depth': None}
Best CV Score (neg MSE): -0.3014
Tuned RF RMSE: 0.4166
Tuned RF R2  : 0.7093
