##Importing Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from scipy import stats
import warnings
from sklearn.model_selection import GridSearchCV
from sklearn.exceptions import FitFailedWarning


## Importing Data

In [2]:
data = pd.read_csv('wineq.csv')
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Algorithm Selection

In [4]:

models = [
    ('Random Forest', RandomForestClassifier()),
    ('SVM', SVC()),
    ('Logistic Regression', LogisticRegression()),
    ('Decision Tree', DecisionTreeClassifier())
]


## Cross-Validation and Model Evaluation

In [5]:
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)
results = {}
for name, model in models:
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    results[name] = scores

## Final Selection

In [6]:
final_scores = {}
for name, model in models:
    model.fit(X_train, y_train)
    accuracy = model.score(X_test, y_test)
    final_scores[name] = accuracy

best_model = max(final_scores, key=final_scores.get)
for name, accuracy in final_scores.items():
    print(f"{name} Accuracy: {accuracy}")
print("Best Model:", best_model, final_scores[best_model])



Random Forest Accuracy: 0.746875
SVM Accuracy: 0.503125
Logistic Regression Accuracy: 0.63125
Decision Tree Accuracy: 0.690625
Best Model: Random Forest 0.746875


## Hyperparameter Optimization

In [7]:
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Hyperparameter Optimization for Logistic Regression
logreg_param_grid = {
    'C': [0.1, 0.5, 1, 5, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

logreg_optimized = GridSearchCV(LogisticRegression(max_iter=100), param_grid=logreg_param_grid, cv=5, scoring='accuracy')
logreg_optimized.fit(X_train, y_train)
best_logreg_model = logreg_optimized.best_estimator_
logreg_accuracy = best_logreg_model.score(X_test, y_test)

print("Best Logistic Regression Model:")
print("Hyperparameters:", best_logreg_model.get_params())
print("Accuracy:", logreg_accuracy)
print("\n")

# Hyperparameter Optimization for Decision Tree
dt_param_grid = {
    'max_depth': [None, 10, 20, 30, 50, 100],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 7]
}

dt_optimized = GridSearchCV(DecisionTreeClassifier(), param_grid=dt_param_grid, cv=5, scoring='accuracy')
dt_optimized.fit(X_train, y_train)
best_dt_model = dt_optimized.best_estimator_
dt_accuracy = best_dt_model.score(X_test, y_test)

print("Best Decision Tree Model:")
print("Hyperparameters:", best_dt_model.get_params())
print("Accuracy:", dt_accuracy)
print("\n")

# Hyperparameter Optimization for Random Forest
rf_param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_optimized = GridSearchCV(RandomForestClassifier(), param_grid=rf_param_grid, cv=5, scoring='accuracy')
rf_optimized.fit(X_train, y_train)
best_rf_model = rf_optimized.best_estimator_
rf_accuracy = best_rf_model.score(X_test, y_test)

print("Best Random Forest Model:")
print("Hyperparameters:", best_rf_model.get_params())
print("Accuracy:", rf_accuracy)
print("\n")

# Hyperparameter Optimization for SVM
svm_param_grid = {
    'C': [0.1, 0.8, 2, 10],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid']
}

svm_optimized = GridSearchCV(SVC(), param_grid=svm_param_grid, cv=5, scoring='accuracy')
svm_optimized.fit(X_train, y_train)
best_svm_model = svm_optimized.best_estimator_
svm_accuracy = best_svm_model.score(X_test, y_test)

print("Best SVM Model:")
print("Hyperparameters:", best_svm_model.get_params())
print("Accuracy:", svm_accuracy)
print("\n")

75 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

---------------------------------

Best Logistic Regression Model:
Hyperparameters: {'C': 0.5, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'newton-cg', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
Accuracy: 0.625


Best Decision Tree Model:
Hyperparameters: {'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 30, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': None, 'splitter': 'best'}
Accuracy: 0.665625


Best Random Forest Model:
Hyperparameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 5, 'min_weight_fraction_l

## Final Selection after GridSeach



In [8]:

final_scores = {
    'Random Forest': rf_accuracy,
    'SVM': svm_accuracy,
    'Logistic Regression': logreg_accuracy,
    'Decision Tree': dt_accuracy
}

best_model = max(final_scores, key=final_scores.get)

for name, accuracy in final_scores.items():
    print(f"{name} Accuracy: {accuracy}")

print("Best Model:", best_model, final_scores[best_model])


Random Forest Accuracy: 0.703125
SVM Accuracy: 0.6375
Logistic Regression Accuracy: 0.625
Decision Tree Accuracy: 0.665625
Best Model: Random Forest 0.703125


In [9]:
!pip install scikit-optimize
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

data = pd.read_csv('wineq.csv')
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# Outer loop for nested cross-validation
outer_cv = 8
inner_cv = 8
outer_scores = {}

# Algorithm Selection
models = [
    ('SVM', SVC()),
    ('Random Forest', RandomForestClassifier()),
    ('Logistic Regression', LogisticRegression()),
    ('Decision Tree', DecisionTreeClassifier())
]

# Hyperparameter Optimization using Bayesian Optimization
param_grids = {
    'SVM': {
         'C': Real(1e-5, 1e+5, prior='log-uniform'),
        'kernel': Categorical(['linear', 'rbf', 'poly', 'sigmoid'])
    },
    'Random Forest': {
        'n_estimators': Integer(10, 200),
        'max_depth': Integer(10, 30),
        'min_samples_split': Integer(1, 10),
        'min_samples_leaf': Integer(1, 10)
    },
    'Logistic Regression': {
        'C': Real(1e-6, 1e+6, prior='log-uniform'),
        'penalty': Categorical(['l1', 'l2']),
        'solver': Categorical(['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'])
    },
    'Decision Tree': {
        'max_depth': Integer(1, 100),
        'min_samples_split': Integer(2, 15),
        'min_samples_leaf': Integer(1, 7)
    }
}

for name, model in models:
    try:
        outer_scores[name] = []
        for outer_train_index, outer_test_index in StratifiedKFold(n_splits=outer_cv, shuffle=True, random_state=0).split(X, y):
            X_outer_train, X_outer_test = X[outer_train_index], X[outer_test_index]
            y_outer_train, y_outer_test = y[outer_train_index], y[outer_test_index]

            # Inner loop for hyperparameter tuning
            opt = BayesSearchCV(
                model,
                param_grids[name],
                n_iter=10,
                cv=StratifiedKFold(n_splits=inner_cv, shuffle=True, random_state=0),
                scoring='accuracy',
                n_jobs=-1,
                random_state=0
            )
            opt.fit(X_outer_train, y_outer_train)
            best_model = opt.best_estimator_

            # Evaluate on the outer test set
            accuracy = best_model.score(X_outer_test, y_outer_test)
            outer_scores[name].append(accuracy)

            print(f"{name} - Tuned Hyperparameters: {opt.best_params_}, Accuracy: {accuracy}")

    except Exception as e:
        print(f"Bayesian Optimization for {name} raised an exception: {e}")

# Display results
for name, scores in outer_scores.items():
    mean_accuracy = np.nanmean(scores)  # Handling NaN values
    print(f"{name} Outer CV Mean Accuracy: {mean_accuracy}")

best_outer_model_name = max(outer_scores, key=outer_scores.get)
print("Best Outer Model:", best_outer_model_name, np.mean(outer_scores[best_outer_model_name]))


Collecting scikit-optimize
  Downloading scikit_optimize-0.9.0-py2.py3-none-any.whl (100 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.3/100.3 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-23.9.7-py3-none-any.whl (23 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-23.9.7 scikit-optimize-0.9.0
SVM - Tuned Hyperparameters: OrderedDict([('C', 46915.05705722399), ('kernel', 'rbf')]), Accuracy: 0.635
SVM - Tuned Hyperparameters: OrderedDict([('C', 1.9360537965638225), ('kernel', 'linear')]), Accuracy: 0.59
SVM - Tuned Hyperparameters: OrderedDict([('C', 46915.05705722399), ('kernel', 'rbf')]), Accuracy: 0.66
SVM - Tuned Hyperparameters: OrderedDict([('C', 46915.05705722399), ('kernel', 'rbf')]), Accuracy: 0.61
SVM - Tuned Hyperparameters: OrderedDict([('C', 46915.05705722399), ('kernel', 'rbf')]), Accuracy: 0.6
SVM - Tuned Hyperparameters: OrderedDict([('C', 469

  mean_accuracy = np.nanmean(scores)  # Handling NaN values


In [27]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import numpy as np

model_results = [outer_scores[model_name] for model_name in outer_scores]

flat_results = [score for scores in model_results for score in scores]


labels = []
for model_name, scores in outer_scores.items():
    labels.extend([model_name] * len(scores))


tukey_results = pairwise_tukeyhsd(np.array(flat_results), labels, alpha=0.05)
from tabulate import tabulate
tukey_df = pd.DataFrame(data=tukey_results._results_table.data[1:], columns=tukey_results._results_table.data[0])
print(tabulate(tukey_df, headers='keys', tablefmt='pretty', showindex=False))

+---------------+---------------+----------+--------+---------+---------+--------+
|    group1     |    group2     | meandiff | p-adj  |  lower  |  upper  | reject |
+---------------+---------------+----------+--------+---------+---------+--------+
| Decision Tree | Random Forest |  0.0888  | 0.0007 | 0.0382  | 0.1394  |  True  |
| Decision Tree |      SVM      | -0.0069  | 0.9373 | -0.0575 | 0.0437  | False  |
| Random Forest |      SVM      | -0.0957  | 0.0003 | -0.1463 | -0.0451 |  True  |
+---------------+---------------+----------+--------+---------+---------+--------+
