In [1]:
import sys
import os
# Go up one level from the models/ folder so sibling packages can be imported
parent_dir = os.path.abspath(os.pardir)
sys.path.append(parent_dir)

In [2]:
import ast
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
import pandas as pd
from sklearn.model_selection import KFold
import joblib
from sklearn.model_selection import GridSearchCV

from features_selection.Wrapper import ForwardFeatureSelector, BackwardFeatureSelector


In [3]:
# Load the training data
df = pd.read_csv("./../ansatz_result/data.csv")

# The "target" column stores Python literals as text; parse each cell back
# into an object before expanding the parsed values into a numeric array.
parsed_targets = df['target'].apply(ast.literal_eval)

# Every column except "target" is used as an input feature
X = df.drop(columns=["target"]).to_numpy()
y = pd.DataFrame(parsed_targets.tolist()).to_numpy()

In [4]:
# Build the K-fold splitter: 3 shuffled folds with a fixed seed
kf = KFold(n_splits=3, shuffle=True, random_state=42)

# Hyperparameter values to try
param_grid = dict(
    max_depth=[3, 5, 7, None],
    min_samples_split=[2, 5, 10, 15, 20, 25, 30],
    min_samples_leaf=[1, 2, 4, 5, 10, 15, 20, 30, 35, 40],
    criterion=['squared_error', 'friedman_mse'],
)

dt_regressor = DecisionTreeRegressor(random_state=45)

# Exhaustive search over the grid, scored by negated MSE on the folds above
grid_search = GridSearchCV(
    estimator=dt_regressor,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=kf,
    n_jobs=12,
)
grid_search.fit(X, y)

# Best hyperparameters found
print("Melhores parâmetros:", grid_search.best_params_)

Melhores parâmetros: {'criterion': 'friedman_mse', 'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 25}


In [5]:
# Cross-validate the estimator that carries the best hyperparameters found.
# NOTE(review): this reuses the folds (kf) and data the grid search tuned on,
# so the reported error is likely optimistic — confirm whether a held-out
# evaluation is intended.
best_dt_regressor = grid_search.best_estimator_

score = cross_val_score(
    estimator=best_dt_regressor,
    X=X,
    y=y,
    scoring='neg_mean_squared_error',
    cv=kf,
)

# Per-fold negated MSE, followed by its mean and standard deviation
mean_score = score.mean()
std_score = score.std()
print("Scores:", score)
print("Média:", mean_score)
print("Desvio padrão:", std_score)


Scores: [-0.00089987 -0.00084969 -0.00095836]
Média: -0.000902640345476667
Desvio padrão: 4.4407732441965074e-05


In [6]:
# Train the decision-tree regressor on the full data set and save the model.
# NOTE(review): GridSearchCV refits best_estimator_ on all data by default,
# so this fit is presumably redundant — confirm before removing it.
best_dt_regressor.fit(X, y)

# joblib.dump does not create missing parent folders, so make sure the output
# directory exists; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs('./models_salvos', exist_ok=True)

# Save the trained model (kept as the last expression so the written path is displayed)
joblib.dump(best_dt_regressor, './models_salvos/dt_regressor.joblib')

['./models_salvos/dt_regressor.joblib']

In [7]:
# Options for the forward feature selector, built around the tuned tree.
# NOTE(review): cv is the integer 3 here rather than the shuffled `kf` used
# during tuning; how it is interpreted is defined in features_selection.Wrapper
# — confirm the scores are meant to be comparable.
forward_selector_options = dict(
    model=best_dt_regressor,
    model_type='sklearn',
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
)
selector = ForwardFeatureSelector(**forward_selector_options)

In [8]:
# Fit the forward selector on the full data and keep what fit_transform returns.
# NOTE(review): presumably X_new holds only the selected feature columns —
# confirm against features_selection.Wrapper.
X_new = selector.fit_transform(X, y)

Testing feature set: [0], score: -0.0009699440743350345
Testing feature set: [1], score: -0.0009796132118767846
Testing feature set: [2], score: -0.001821383761679551
Testing feature set: [3], score: -0.001401001302248639
Testing feature set: [4], score: -0.001388858702869208
Testing feature set: [5], score: -0.0011159469447380788
Testing feature set: [6], score: -0.00111987964064775
Testing feature set: [7], score: -0.0012611252268093715
Testing feature set: [8], score: -0.005322733305281623
Testing feature set: [9], score: -0.0010425302330478673
Testing feature set: [10], score: -0.009445114723652413
Testing feature set: [11], score: -0.0013419189561402433
Testing feature set: [12], score: -0.0011478828453696664
Testing feature set: [13], score: -0.0009500432432162353
Testing feature set: [14], score: -0.001225864719675134
Testing feature set: [15], score: -0.010329371303058511
Testing feature set: [16], score: -0.006093665918597443
Testing feature set: [17], score: -0.01699101830129

In [9]:
# Report the forward selector's chosen features and its last recorded best score
forward_features = selector.selected_features
forward_last_score = selector.best_scores[-1]
print(f"features selecionadas: {forward_features}, best score: {forward_last_score}")

features selecionadas: [13, 12, 9, 16, 3, 17, 7], best score: -0.0008195523276130927


In [10]:
# Options for the backward feature selector, built around the same tuned tree.
# NOTE(review): cv is the integer 3 here rather than the shuffled `kf` used
# during tuning; how it is interpreted is defined in features_selection.Wrapper
# — confirm the scores are meant to be comparable.
backward_selector_options = dict(
    model=best_dt_regressor,
    model_type='sklearn',
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
)
selector2 = BackwardFeatureSelector(**backward_selector_options)

In [11]:
# Fit the backward selector on the full data and keep what fit_transform returns.
# NOTE(review): presumably X_new2 holds only the retained feature columns —
# confirm against features_selection.Wrapper.
X_new2 = selector2.fit_transform(X, y)

Testando subconjunto: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], score: -0.0008660372349773138
Testando subconjunto: [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], score: -0.0008554387214787606
Testando subconjunto: [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], score: -0.0008623413589716205
Testando subconjunto: [0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], score: -0.0008910220698570268
Testando subconjunto: [0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], score: -0.0008897155542311011
Testando subconjunto: [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], score: -0.0008504228793504531
Testando subconjunto: [0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], score: -0.0008572047717204061
Testando subconjunto: [0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]

In [12]:
# Report the backward selector's retained features and its last recorded best score
backward_features = selector2.selected_features
backward_last_score = selector2.best_scores[-1]
print(f"features selecionadas: {backward_features}, best score: {backward_last_score}")

features selecionadas: [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], best score: -0.0008504228793504531
