In [1]:
import pandas as pd
df_train=pd.read_csv("train.csv")

In [2]:
# Separate features and target
X_train = df_train.drop(columns=["label"])
y_train = df_train["label"]

In [3]:
%pip install hyperopt

Note: you may need to restart the kernel to use updated packages.


In [4]:
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
import numpy as np

In [5]:
# Objective function
def objective_dt(params):
    clf = DecisionTreeClassifier(
        max_depth=int(params['max_depth']),
        criterion=params['criterion'],
        random_state=42
    )
    score = cross_val_score(clf, X_train, y_train, scoring='accuracy', cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42)).mean()
    return {'loss': -score, 'status': STATUS_OK}

In [6]:

# Hyperparameter space 
space_dt = {
    'max_depth': hp.quniform('max_depth', 5, 30, 1),
    'criterion': hp.choice('criterion', ['gini', 'entropy'])
}

rng=np.random.default_rng(42)
trials = Trials()
best_dt = fmin(fn=objective_dt,
            space=space_dt,
            algo=tpe.suggest,
            max_evals=100,  
            trials=trials,
            rstate=rng
            )

print("Best hyperparameters:", best_dt)


  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 100/100 [9:12:56<00:00, 331.77s/trial, best loss: -0.9435206575395764] 
Best hyperparameters: {'criterion': 1, 'max_depth': 27.0}


In [7]:
import joblib

In [8]:
# Convert numeric parameters to int
best_dt = {k: int(v) if isinstance(v, (np.integer, np.floating)) else v for k, v in best_dt.items()}

In [9]:
best_dt['criterion']='entropy'

In [10]:
joblib.dump(best_dt, "best_dt.pkl")

['best_dt.pkl']