In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV


In [None]:
from labs.lab_3.util.constants import TRAIN_FILES, TEST_FILES
from labs.util.file_processing.configuration import TRAIN_DATASET_FOLDER, TEST_DATASET_FOLDER
from labs.lab_3.util import loader
from sklearn.model_selection import RepeatedStratifiedKFold
import os

# Root folder holding the lab_3 input data. The original notebook hard-coded a
# user-specific absolute path; it is kept as the default so existing setups
# behave exactly as before, while other machines can point the notebook at
# their own copy by setting the LAB3_DATA_ROOT environment variable.
DATA_ROOT: str = os.environ.get(
    "LAB3_DATA_ROOT",
    "/Users/astronely/PycharmProjects/DataAnalysisLabs/data-in/lab_3",
)
train_path: str = f"{DATA_ROOT}/{TRAIN_DATASET_FOLDER}"
test_path: str = f"{DATA_ROOT}/{TEST_DATASET_FOLDER}"

# Load the train and test data in one call; the loader's result exposes the
# feature matrices and targets as the four attributes unpacked below.
atomic_dataframe = loader.load_atomic_dataframe(train_path, TRAIN_FILES, test_path, TEST_FILES)
X_train = atomic_dataframe.train
y_train = atomic_dataframe.train_target
X_test = atomic_dataframe.test
y_test = atomic_dataframe.test_target

In [None]:
# feature selection
def select_features(_X_train, _y_train, _X_test, k=28, random_state=None):
    """Select the ``k`` best features using mutual information with the target.

    The selector is fitted on the training data only and then applied to both
    the training and the test inputs, so no information from the test set is
    used when choosing features.

    Parameters
    ----------
    _X_train : training input features.
    _y_train : training target values.
    _X_test : test input features; transformed with the already fitted selector.
    k : number of features to keep, forwarded to ``SelectKBest``. Defaults to
        28, the value previously hard-coded here.
    random_state : optional seed forwarded to ``mutual_info_regression``. The
        default ``None`` leaves the estimator unseeded, as before; pass an int
        to make the selected feature set reproducible between runs.

    Returns
    -------
    tuple
        ``(X_train_selected, X_test_selected, fitted_selector)``.
    """
    score_func = mutual_info_regression
    if random_state is not None:
        # Bind the seed with functools.partial (rather than a lambda) so the
        # fitted selector stays picklable.
        from functools import partial

        score_func = partial(mutual_info_regression, random_state=random_state)

    # configure to select a subset of features
    _fs = SelectKBest(score_func=score_func, k=k)
    # learn relationship from training data only
    _fs.fit(_X_train, _y_train)
    # apply the same fitted selection to train and test inputs
    _X_train_fs = _fs.transform(_X_train)
    _X_test_fs = _fs.transform(_X_test)
    return _X_train_fs, _X_test_fs, _fs

In [None]:
# # feature selection
# X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)
# # fit the model
# model = DecisionTreeRegressor()
# model.fit(X_train_fs, y_train)
# # evaluate the model
# yhat = model.predict(X_test_fs)
# # evaluate predictions
# mae = mean_absolute_error(y_test, yhat)
# print('MAE: %.3f' % mae)

# feature selection
from sklearn.model_selection import RepeatedKFold

X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)

# Hyper-parameter grid for the decision tree. Each range currently yields a
# single candidate (max_depth=10, max_leaf_nodes=50, min_samples_split=60);
# widen the range bounds to search more values.
params = {'criterion': ['friedman_mse'],
          'max_depth': list(range(10, 11, 5)),
          'max_leaf_nodes': list(range(50, 51, 10)),
          'min_samples_split': list(range(60, 61, 10))}

# The estimator is a regressor, so use a plain repeated K-fold splitter.
# Stratified splitters only accept binary/multiclass targets: on a continuous
# target they raise ValueError, and on an integer-valued target they would
# treat every distinct value as a class label.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=36851234)
# init GridSearchCV based on cross-validator and params
model = GridSearchCV(estimator=DecisionTreeRegressor(), param_grid=params, cv=cv, n_jobs=1)

model.fit(X=X_train_fs, y=y_train)

In [None]:
import numpy as np
from sklearn import metrics

# Predict on the feature-selected test inputs.
prediction = model.predict(X_test_fs)

# Evaluate each test-set metric once and reuse the values in the report.
test_mae = metrics.mean_absolute_error(y_test, prediction)
test_mse = metrics.mean_squared_error(y_test, prediction)
test_rmse = np.sqrt(test_mse)

# Note: best_score_ is the mean cross-validated score found by the grid search
# on the training data; the three error metrics below are measured on the test set.
print(
    f"R-Score: {model.best_score_}\n"
    f"mean_absolute_error: {test_mae}\n"
    f"mean_squared_error: {test_mse}\n"
    f"root_mean_squared_error: {test_rmse}"
)

In [None]:
# Importance of each input column of the best tree found by the grid search.
# The attribute is `feature_importances_` (singular "feature"); the previous
# spelling `features_importances_` does not exist and raised AttributeError.
# The values are indexed by the columns of the feature-selected matrix the
# model was fitted on, not by the columns of the original dataframe.
model.best_estimator_.feature_importances_