In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
import pickle

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV


from helpers import target_feature, random_seed, get_originals_without_nan_cols

# Seed NumPy's global random number generator with the seed imported from helpers.
np.random.seed(random_seed)

## Regression Tree

In [2]:
# Load the train and test CSVs; the first column of each file is used as the index.
train_df = pd.read_csv('../data/post_fs_train.csv', index_col=0)
test_df = pd.read_csv('../data/post_fs_test.csv', index_col=0)

# Preliminary feature frames: every column except the target.
prelim_x_train = train_df.drop(columns=target_feature)
prelim_x_test = test_df.drop(columns=target_feature)

## Decision tree regressor

Here we fit a regression tree. Unfortunately, according to the [scikit-learn documentation](https://scikit-learn.org/stable/modules/tree.html#tree), scikit-learn's decision trees do not currently support categorical variables. We therefore remove the categorical features created earlier and keep only the continuous (`float64`) columns.

In [3]:
# Targets for both splits.
y_train = train_df[target_feature]
y_test = test_df[target_feature]

# Model inputs: only the float64 columns of the training frame; the same
# column list is applied to the test frame so both splits stay aligned.
continuous_cols = list(prelim_x_train.select_dtypes(include=np.float64).columns)

x_train = prelim_x_train.loc[:, continuous_cols]
x_test = prelim_x_test.loc[:, continuous_cols]

### Cross-validation

In [4]:
# Toggle for the grid search below: the search only runs (and rewrites
# '../models/reg_tree_CV.pkl') when this is True.
compute_cv = True

In [5]:
# Candidate values for `max_features`. The fixed candidates are capped at the
# number of available columns and de-duplicated, so the grid stays valid (and
# free of repeated fits) if the feature count changes.
# NOTE(review): older scikit-learn releases are believed to raise when
# max_features exceeds the number of features - confirm for the pinned version.
n_features = len(continuous_cols)
max_features_candidates = sorted(
    {m for m in (1, 2, 5, 10, n_features) if 1 <= m <= n_features}
)

# Hyper-parameter grid explored by the grid search.
params_grid = {
    'max_depth': [2, 3, 5, 10],
    'min_samples_split': [100, 1_000, 10_000, 100_000],
    'max_features': max_features_candidates,
}

In [6]:
if compute_cv:
    # Pin the tree's random_state so the search result does not depend on the
    # state of NumPy's global RNG at the moment this cell happens to run.
    # NOTE(review): assumes `random_seed` is an integer - confirm in helpers.
    regressor = tree.DecisionTreeRegressor(random_state=random_seed)
    grid_search_cv = GridSearchCV(regressor, params_grid, verbose=3)

    # Exhaustive search over params_grid on the training data.
    cv_fit = grid_search_cv.fit(x_train.values, y_train)

    # Persist the fitted search so later runs can skip this step.
    with open('../models/reg_tree_CV.pkl', 'wb') as f:
        pickle.dump(cv_fit, f)


Fitting 5 folds for each of 80 candidates, totalling 400 fits
[CV 1/5] END max_depth=2, max_features=1, min_samples_split=100;, score=0.094 total time=   0.1s
[CV 2/5] END max_depth=2, max_features=1, min_samples_split=100;, score=0.661 total time=   0.1s
[CV 3/5] END max_depth=2, max_features=1, min_samples_split=100;, score=0.639 total time=   0.1s
[CV 4/5] END max_depth=2, max_features=1, min_samples_split=100;, score=0.403 total time=   0.1s
[CV 5/5] END max_depth=2, max_features=1, min_samples_split=100;, score=0.497 total time=   0.1s
[CV 1/5] END max_depth=2, max_features=1, min_samples_split=1000;, score=0.729 total time=   0.2s
[CV 2/5] END max_depth=2, max_features=1, min_samples_split=1000;, score=0.254 total time=   0.1s
[CV 3/5] END max_depth=2, max_features=1, min_samples_split=1000;, score=0.503 total time=   0.1s
[CV 4/5] END max_depth=2, max_features=1, min_samples_split=1000;, score=0.640 total time=   0.1s
[CV 5/5] END max_depth=2, max_features=1, min_samples_split=1

In [7]:
# Reload the pickled grid-search result from disk.
with open('../models/reg_tree_CV.pkl', mode='rb') as cv_file:
    cv = pickle.load(cv_file)

### Training the model

Select the best parameters found by cross-validation, train the tree with them, and store the fitted model.

In [8]:
# Display the hyper-parameter combination selected by the grid search.
cv.best_params_

{'max_depth': 10, 'max_features': 17, 'min_samples_split': 100}

In [9]:
# Fit a single tree on the training data with the hyper-parameters selected by
# the grid search. random_state is pinned so the fitted tree is the same
# whether or not the grid-search cell was executed (and consumed random
# numbers) earlier in the session.
# NOTE(review): assumes `random_seed` is an integer - confirm in helpers.
tree_reg = tree.DecisionTreeRegressor(**cv.best_params_, random_state=random_seed)
tree_fit = tree_reg.fit(x_train.values, y_train)

In [10]:
# Persist the fitted tree to disk.
with open('../models/reg_tree.pkl', mode='wb') as model_file:
    pickle.dump(tree_fit, model_file)

### Check prediction

In [11]:
# Predict on the test features, then report RMSE (square root of the MSE)
# followed by R^2 on a single line.
y_predict = tree_fit.predict(x_test.values)

rmse = mean_squared_error(y_test, y_predict) ** 0.5
r2 = r2_score(y_test, y_predict)
print(rmse, r2)

0.40083395225524693 0.9906742061130459
