In [13]:
import ast
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor


In [14]:
# Load the training data: every column except "target" is used as a feature.
df = pd.read_csv("./../ansatz_result/data.csv")

X = df.drop(columns=["target"]).to_numpy()

# Each "target" cell is a serialized Python literal (presumably a list of numbers —
# confirm against data.csv); parse every cell and stack the rows into a 2-D array.
y = pd.DataFrame([ast.literal_eval(cell) for cell in df["target"]]).to_numpy()

In [15]:
# 3-fold cross-validation splitter; shuffling with a fixed seed keeps the folds reproducible.
kf = KFold(n_splits=3, shuffle=True, random_state=42)

# Hyperparameter values to try (the grid search evaluates every combination).
param_grid = dict(
    max_depth=[3, 5, 7, None],
    min_samples_split=[2, 5, 10, 15, 20, 25, 30],
    min_samples_leaf=[1, 2, 4, 5, 10, 15, 20, 30, 35, 40],
    criterion=['squared_error', 'friedman_mse'],
)

dt_regressor = DecisionTreeRegressor(random_state=45)

# Exhaustive search scored by negative MSE (closer to zero is better), using 12 parallel jobs.
grid_search = GridSearchCV(
    estimator=dt_regressor,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=kf,
    n_jobs=12,
)
grid_search.fit(X, y)

# Best hyperparameter combination found
print(f"Melhores parâmetros: {grid_search.best_params_}")

Melhores parâmetros: {'criterion': 'friedman_mse', 'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 25}


In [16]:
# Cross-validate the estimator selected by the grid search, reusing the same KFold splitter.
best_dt_regressor = grid_search.best_estimator_

score = cross_val_score(
    estimator=best_dt_regressor,
    X=X,
    y=y,
    scoring='neg_mean_squared_error',
    cv=kf,
)

# Per-fold negative MSE, then its mean and standard deviation across folds.
print("Scores:", score)
print("Média:", np.mean(score))
print("Desvio padrão:", np.std(score))


Scores: [-0.00089987 -0.00084969 -0.00095836]
Média: -0.000902640345476667
Desvio padrão: 4.4407732441965074e-05


In [17]:
# Persist the tuned decision-tree regressor to disk.
#
# No extra `.fit(X, y)` is needed before saving: `best_dt_regressor` is
# GridSearchCV's `best_estimator_`, which (with the default refit=True) has already
# been refit on the full dataset, and cross_val_score only fits clones of it.
MODEL_PATH = './models_salvos/dt_regressor.joblib'

# joblib.dump does not create missing directories, so make sure the target folder
# exists; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

# Last expression of the cell: joblib.dump returns the list of files it wrote.
joblib.dump(best_dt_regressor, MODEL_PATH)

['./models_salvos/dt_regressor.joblib']