In [206]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, classification_report, f1_score, mean_squared_error

In [207]:
# Load the wine dataset with pandas containers; the combined frame holds the
# feature columns together with the `target` column.
data_wine = load_wine(as_frame=True)
df_wine = data_wine["frame"]

# Preview the first five rows.
df_wine.head(n=5)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [208]:
# Count exact-zero entries per column; the positional slice leaves out the
# last column of the frame.
zero_counts = df_wine.iloc[:, :-1].eq(0).sum()
print(zero_counts)

alcohol                         0
malic_acid                      0
ash                             0
alcalinity_of_ash               0
magnesium                       0
total_phenols                   0
flavanoids                      0
nonflavanoid_phenols            0
proanthocyanins                 0
color_intensity                 0
hue                             0
od280/od315_of_diluted_wines    0
proline                         0
dtype: int64


In [209]:
# Number of missing values in every column of the frame.
df_wine.isna().sum()

alcohol                         0
malic_acid                      0
ash                             0
alcalinity_of_ash               0
magnesium                       0
total_phenols                   0
flavanoids                      0
nonflavanoid_phenols            0
proanthocyanins                 0
color_intensity                 0
hue                             0
od280/od315_of_diluted_wines    0
proline                         0
target                          0
dtype: int64

In [210]:
# Classification setup: `target` is the label, every other column is a feature.
y = df_wine['target']
X = df_wine.drop(columns='target')

In [211]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [212]:
# Learn the scaling statistics from the training rows only, then apply that
# same fitted transform to both the training and the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [213]:
# Fit a decision tree classifier (fixed seed) on the scaled training split,
# then predict class labels for the scaled test split.
dt = DecisionTreeClassifier(random_state=42).fit(X_train_scaled, y_train)
y_pred = dt.predict(X_test_scaled)

In [214]:
# Report test-set accuracy and the per-class metrics for the decision tree.
accuracy = accuracy_score(y_test, y_pred)
print("Decision Tree Classifier Results:")
print(f"Accuracy: {accuracy}")
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

Decision Tree Classifier Results:
Accuracy: 0.9629629629629629
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.95      0.95        19
           1       0.95      1.00      0.98        21
           2       1.00      0.93      0.96        14

    accuracy                           0.96        54
   macro avg       0.97      0.96      0.96        54
weighted avg       0.96      0.96      0.96        54



In [215]:
# Fit a random forest classifier (fixed seed) on the scaled training split,
# then predict class labels for the scaled test split.
rf = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)
y_pred_rf = rf.predict(X_test_scaled)

In [216]:
# Report test-set accuracy and the per-class metrics for the random forest.
accuracy = accuracy_score(y_test, y_pred_rf)
print("Random Forest Tree Classifier Results:")
print(f"Accuracy: {accuracy}")
print("Classification Report:", classification_report(y_test, y_pred_rf), sep="\n")

Random Forest Tree Classifier Results:
Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        21
           2       1.00      1.00      1.00        14

    accuracy                           1.00        54
   macro avg       1.00      1.00      1.00        54
weighted avg       1.00      1.00      1.00        54



In [217]:
# Weighted-average F1 on the test split for each of the two classifiers.
f1_decision_t = f1_score(y_true=y_test, y_pred=y_pred, average="weighted")
f1_random_f = f1_score(y_true=y_test, y_pred=y_pred_rf, average="weighted")

print("Decision Tree F1 score: ", f1_decision_t)
print("Random Forest Tree F1 score: ", f1_random_f)


Decision Tree F1 score:  0.9628353590455226
Random Forest Tree F1 score:  1.0


    **"n_estimators", "max_depth", and "min_samples_split" are hyperparameters for Random Forest classification.**

In [218]:
# Candidate values tried for each random-forest hyperparameter.
param_grid = {
    'n_estimators': [500, 1000, 2000],
    'max_depth': [50, 100, 200, 300],
    'min_samples_split': [2, 5, 10],
}

# Exhaustive search over every combination in the grid, scored by accuracy
# with 5-fold cross-validation on the scaled training split.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
).fit(X_train_scaled, y_train)

# Best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best Hyperparameters: {'max_depth': 50, 'min_samples_split': 2, 'n_estimators': 500}


In [219]:
# Regression setup: predict `alcohol` from all remaining columns. Only
# `alcohol` is dropped here, so the class label `target` stays in as a feature.
y_dr = df_wine['alcohol']
X_dr = df_wine.drop(columns='alcohol')

In [220]:
# Same 70/30 split settings and seed as the classification task, applied to
# the regression features and target.
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_dr,
    y_dr,
    test_size=0.3,
    random_state=42,
)

In [221]:
# Fit a fresh scaler on the regression training rows only and apply it to both
# regression splits.
# NOTE: this rebinds `scaler`; after this cell the name no longer refers to the
# scaler fitted on the classification features.
scaler = StandardScaler().fit(X_train_r)
X_train_scaled_r = scaler.transform(X_train_r)
X_test_scaled_r = scaler.transform(X_test_r)

In [222]:
# Fit a decision tree regressor (fixed seed) on the scaled regression training
# split, then predict `alcohol` for the scaled test split.
reg_dt = DecisionTreeRegressor(random_state=42).fit(X_train_scaled_r, y_train_r)
y_pred_dt = reg_dt.predict(X_test_scaled_r)

In [223]:
# Test-set mean squared error of the decision tree regressor.
mse_r = mean_squared_error(y_test_r, y_pred_dt)
print("\nDecision Tree Regressor Results:")
print(f"Mean Squared Error: {mse_r}")


Decision Tree Regressor Results:
Mean Squared Error: 0.38300370370370396


In [224]:
# Fit a random forest regressor (fixed seed) on the scaled regression training
# split, then predict `alcohol` for the scaled test split.
# NOTE: `y_pred_rf` is rebound here; it previously held the random forest
# *classifier* predictions, so re-running the classification report or F1
# cells after this one would score the regression output instead.
random_f_reg = RandomForestRegressor(random_state=42).fit(X_train_scaled_r, y_train_r)
y_pred_rf = random_f_reg.predict(X_test_scaled_r)

In [225]:
# Test-set mean squared error of the random forest regressor.
mse_rf = mean_squared_error(y_test_r, y_pred_rf)
print("\nRandom Forest Regressor Results:")
print(f"Mean Squared Error: {mse_rf}")


Random Forest Regressor Results:
Mean Squared Error: 0.1728992790740739


In [228]:
# Candidate values the randomized search samples from for each random-forest
# hyperparameter.
param_grid = {
    'n_estimators': [500, 1000, 2000],
    'max_depth': [50, 100, 200, 300],
    'min_samples_split': [2, 5, 10],
}

# Randomized 5-fold cross-validated search scored by negative MSE. No n_iter is
# given, so the library default number of candidates is drawn (the recorded run
# reports 10 of the 36 combinations).
# random_state is pinned so that the sampled candidates -- and therefore the
# reported best parameters -- are identical on every Restart & Run All; without
# it each run may evaluate a different subset of the grid.
random_search = RandomizedSearchCV(
    estimator=random_f_reg,
    param_distributions=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train_scaled_r, y_train_r)

# Best hyperparameters
print("Best Hyperparameters:", random_search.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Hyperparameters: {'n_estimators': 2000, 'min_samples_split': 10, 'max_depth': 100}
