In [6]:

# I have tried to do the AutoML part with auto-sklearn but for some reasons I was facing an installation error in Google Collab platform and I have used tpot instead

# In the second part, I have used optuna for hyperparameter tunning.

!pip install tpot
import pandas as pd
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier
from sklearn.metrics import accuracy_score

data = pd.read_csv('wineq.csv')
X = data.drop('quality', axis=1)
y = data['quality']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

tpot = TPOTClassifier(
    generations=2,
    population_size=5,
    random_state=0,
    verbosity=3,
    config_dict={
        'sklearn.tree.DecisionTreeClassifier': {},
        'sklearn.ensemble.RandomForestClassifier': {},
        'sklearn.ensemble.GradientBoostingClassifier': {},
    },
)


tpot.fit(X_train, y_train)

y_pred = tpot.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

tpot.export('best_model_pipeline.py')

Collecting tpot
  Downloading TPOT-0.12.1-py3-none-any.whl (87 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/87.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m81.9/87.4 kB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.4/87.4 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting deap>=1.2 (from tpot)
  Downloading deap-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.4/135.4 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting update-checker>=0.16 (from tpot)
  Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Collecting stopit>=1.1.1 (from tpot)
  Downloading stopit-1.1.2.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected 

Optimization Progress:   0%|          | 0/15 [00:00<?, ?pipeline/s]


Generation 1 - Current Pareto front scores:

-1	0.6559957107843137	RandomForestClassifier(input_matrix)
Pipeline encountered that has previously been evaluated during the optimization process. Using the score from the previous evaluation.

Generation 2 - Current Pareto front scores:

-1	0.6559957107843137	RandomForestClassifier(input_matrix)
Accuracy: 0.7125


In [5]:
!pip install optuna
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import optuna
import optuna.visualization as optuna_viz
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv('wineq.csv')
X = data.drop('quality', axis=1)
y = data['quality']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

def objective(trial):
    classifier_name = trial.suggest_categorical('classifier', ['RandomForest', 'SVM'])
    if classifier_name == 'RandomForest':
        n_estimators = trial.suggest_int('n_estimators', 10, 200)
        max_depth = trial.suggest_int('max_depth', 2, 40, log=True)
        clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    else:
        C = trial.suggest_loguniform('C', 1e-5, 1e5)
        gamma = trial.suggest_loguniform('gamma', 1e-5, 1e5)
        clf = SVC(C=C, gamma=gamma, random_state=42)

    numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X.select_dtypes(include=['object']).columns

    # Hyperparameters for numeric transformer
    num_imputer_strategy = trial.suggest_categorical('num_imputer_strategy', ['mean', 'median', 'most_frequent'])
    num_scaler = trial.suggest_categorical('num_scaler', ['StandardScaler', 'MinMaxScaler'])

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy=num_imputer_strategy)),
        ('scaler', StandardScaler() if num_scaler == 'StandardScaler' else MinMaxScaler())
    ])

    # Hyperparameters for categorical transformer
    cat_imputer_strategy = trial.suggest_categorical('cat_imputer_strategy', ['most_frequent', 'constant'])
    cat_encoder_handle_unknown = trial.suggest_categorical('cat_encoder_handle_unknown', ['error', 'ignore'])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy=cat_imputer_strategy)),
        ('onehot', OneHotEncoder(handle_unknown=cat_encoder_handle_unknown))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', clf)])

    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    return accuracy

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)

trial = study.best_trial
print('Accuracy: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

optuna_viz.plot_optimization_history(study)

optuna_viz.plot_slice(study)




[I 2023-12-03 08:23:31,388] A new study created in memory with name: no-name-85dc7d48-052e-40be-b7e9-a98700c0fd1c
  C = trial.suggest_loguniform('C', 1e-5, 1e5)
  gamma = trial.suggest_loguniform('gamma', 1e-5, 1e5)
[I 2023-12-03 08:23:31,984] Trial 0 finished with value: 0.65 and parameters: {'classifier': 'SVM', 'C': 703.271630279217, 'gamma': 4.331411482135114, 'num_imputer_strategy': 'median', 'num_scaler': 'MinMaxScaler', 'cat_imputer_strategy': 'constant', 'cat_encoder_handle_unknown': 'error'}. Best is trial 0 with value: 0.65.
[I 2023-12-03 08:23:32,301] Trial 1 finished with value: 0.73125 and parameters: {'classifier': 'RandomForest', 'n_estimators': 79, 'max_depth': 11, 'num_imputer_strategy': 'most_frequent', 'num_scaler': 'MinMaxScaler', 'cat_imputer_strategy': 'most_frequent', 'cat_encoder_handle_unknown': 'ignore'}. Best is trial 1 with value: 0.73125.
[I 2023-12-03 08:23:32,382] Trial 2 finished with value: 0.61875 and parameters: {'classifier': 'RandomForest', 'n_estim

Accuracy: 0.7375
Best hyperparameters: {'classifier': 'RandomForest', 'n_estimators': 101, 'max_depth': 11, 'num_imputer_strategy': 'median', 'num_scaler': 'MinMaxScaler', 'cat_imputer_strategy': 'constant', 'cat_encoder_handle_unknown': 'ignore'}
