In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

## Trabalhando o Dataset

In [15]:
# Load the Boston housing data as a (features, target) pair of arrays.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed
# in 1.2, so this cell needs an older scikit-learn to run -- confirm the
# pinned version or plan a dataset replacement.
features_and_target = load_boston(return_X_y=True)
X, y = features_and_target

# Hold out 25% of the rows as the test set; the fixed seed keeps the split
# identical between runs.
split_parts = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Show the dataset shapes and the sizes of the train/test partitions.
(X.shape, y.shape, len(X_train), len(X_test))

((506, 13), (506,), 379, 127)

## Gerando perturbações, criando e treinando os modelos

In [22]:
# Standardize the features with statistics learned from the training split
# only, then apply that same fitted transformation to both splits so no
# test-set information enters the scaling.
# Note: X_train / X_test are rebound to their scaled versions here; the
# following cells read the scaled arrays under the same names.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [50]:
def _fit_predict_score(model, X_fit, y_fit, X_eval, y_eval):
    """Fit `model`, predict on the evaluation set and score the predictions.

    Parameters
    ----------
    model : estimator exposing `fit(X, y)` and `predict(X)`.
    X_fit, y_fit : features and targets the model is trained on.
    X_eval, y_eval : features and targets the model is evaluated on.

    Returns
    -------
    tuple
        `(y_pred, mae)`: the predictions for `X_eval` and their mean
        absolute error against `y_eval`.
    """
    model.fit(X_fit, y_fit)
    y_pred = model.predict(X_eval)
    return y_pred, mean_absolute_error(y_eval, y_pred)


# Number of perturbed training samples (and therefore of models per family).
N_RUNS = 100

# Predictions are collected per run and turned into DataFrames once, after
# the loop, instead of growing the frames one column / one row at a time.
preds_dt, preds_rt, preds_knn = {}, {}, {}
mae_rows = []

for i in range(N_RUNS):
    # Use 75% of the training set to draw a sample without replacement
    # and discard the remaining 25%; seeding with `i` gives every run a
    # different but reproducible subsample.
    X_sample, _, y_sample, _ = train_test_split(
        X_train, y_train,
        test_size=0.25,
        random_state=i
    )

    # DT: fully grown decision tree. Seeded so that the results are
    # reproducible (scikit-learn permutes features at each split, so an
    # unseeded tree can differ between runs), matching the RT model below.
    dt = DecisionTreeRegressor(random_state=i)
    y_pred_dt, mae_dt = _fit_predict_score(
        dt, X_sample, y_sample, X_test, y_test
    )
    preds_dt['dt%d' % (i)] = y_pred_dt

    # RT: randomized tree that only considers sqrt(n_features) candidate
    # features at each split.
    rt = DecisionTreeRegressor(max_features='sqrt', random_state=i)
    y_pred_rt, mae_rt = _fit_predict_score(
        rt, X_sample, y_sample, X_test, y_test
    )
    preds_rt['rt%d' % (i)] = y_pred_rt

    # KNN: k-nearest-neighbours regressor with default settings.
    knn = KNeighborsRegressor()
    y_pred_knn, mae_knn = _fit_predict_score(
        knn, X_sample, y_sample, X_test, y_test
    )
    preds_knn['knn%d' % (i)] = y_pred_knn

    mae_rows.append([mae_dt, mae_rt, mae_knn])

# One row per run, one column per model family.
df_results = pd.DataFrame(mae_rows, columns=['mae_dt', 'mae_rt', 'mae_knn'])

# One column per run (e.g. 'dt0', 'dt1', ...), one row per test example.
df_dt      = pd.DataFrame(preds_dt)
df_rt      = pd.DataFrame(preds_rt)
df_knn     = pd.DataFrame(preds_knn)

In [51]:
# Average MAE of each model family over all resampling runs, displayed next
# to the mean target value to give the errors a sense of scale.
mean_mae_per_model = df_results.mean()
mean_target = np.mean(y)
(mean_mae_per_model, mean_target)

(mae_dt     2.964953
 mae_rt     3.284087
 mae_knn    2.732803
 dtype: float64,
 22.532806324110677)

In [53]:
# Mean entry of the prediction-correlation matrix for each model family
# (DT, RT, KNN, in that order). Each matrix correlates the test-set
# predictions of every pair of runs.
# NOTE: the mean is taken over the full matrix, so it includes the diagonal
# (each run's correlation with itself, 1.0).
tuple(
    run_predictions.corr().values.mean()
    for run_predictions in (df_dt, df_rt, df_knn)
)

(0.8597971913935691, 0.7689792319369024, 0.9774417490699154)