## Librerias y carga de datos

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv("..\\data\\processed\\train.csv", index_col=0)

## Train y test

In [4]:
X = df.drop(columns="forks")
y = df["forks"]

X_train, X_val, y_train, y_val= train_test_split(X, y, test_size=0.2, random_state=42)

## Entrenamiento 

In [5]:
dtr = DecisionTreeRegressor(random_state=42)
dtr.fit(X_train, y_train)

## Predicción

In [6]:
pred = dtr.predict(X_val)
pred

array([ 1.03336299,  0.53892349, -0.71285587, ...,  0.02869217,
        1.2600089 ,  1.48509786])

## Métricas

In [7]:
r2_score(y_val, pred)

0.9984917963583747

In [8]:
mean_absolute_error(y_val, pred)

0.005362364009867896

In [9]:
mean_squared_error(y_val, pred)

0.00298120664208956

In [10]:
mean_absolute_percentage_error(y_val, pred)

4490719797.670495

## Validación

In [11]:
print('train',r2_score(y_train, dtr.predict(X_train)))
print('val',r2_score(y_val, pred))

train 1.0
val 0.9984917963583747


Parece que el modelo puede tener overfitting, el siguiente paso será mejorar el modelo.

## Gridsearch

In [12]:
param_grid = {
    'max_depth': [0, 1, 2, 3, 4, 5],
    'min_samples_leaf': np.arange(1,10),
    'max_features': np.arange(2,12),
    'splitter':["best", "random"], 
    'min_samples_leaf': np.arange(2,5)
    

}

decicsion_tree_r = DecisionTreeRegressor(random_state=42)

grid_search = GridSearchCV(decicsion_tree_r,
                           param_grid,
                           cv=5,
                           scoring='neg_root_mean_squared_error',
                           n_jobs=-1
                          )

grid_search.fit(X_train, y_train)

In [13]:
grid_search.best_params_

{'max_depth': 5, 'max_features': 10, 'min_samples_leaf': 2, 'splitter': 'best'}

In [14]:
grid_search.best_score_

-0.5147465631684425

In [15]:
grid_search.best_estimator_.score(X_val, y_val)

0.8716552892951364

## Entrenamos el modelo con los parámetros de GridSearch

In [16]:
dtr = DecisionTreeRegressor(max_depth=5, max_features=10, min_samples_leaf=2, splitter='best', random_state=42)
dtr.fit(X_train, y_train)

## Predicción y métricas

In [17]:
pred = dtr.predict(X_val)
print("r2:", r2_score(y_val, pred))
print("MAE:", mean_absolute_error(y_val, pred))
print("MSE:", mean_squared_error(y_val, pred))
print("MAPE:", mean_absolute_percentage_error(y_val, pred))

r2: 0.8716552892951364
MAE: 0.3581771405907997
MSE: 0.2536939266490989
MAPE: 232355156845.7216


## Validación

In [18]:
print('train',r2_score(y_train, dtr.predict(X_train)))
print('val',r2_score(y_val, pred))

train 0.8747114181501666
val 0.8716552892951364


## Test

Por último, vamos a cargar los datos para el test para comprobar realmente la precisión del modelo.

In [19]:
test = pd.read_csv("..\\data\\processed\\test.csv", index_col=0)

In [20]:
X_test = test.drop(columns="forks")
y_test = test["forks"]

In [21]:
pred_test = dtr.predict(X_test)
print("r2:", r2_score(y_test, pred_test))
print("MAE:", mean_absolute_error(y_test, pred_test))
print("MSE:", mean_squared_error(y_test, pred_test))
print("MAPE:", mean_absolute_percentage_error(y_test, pred_test))

r2: 0.8682381671132654
MAE: 0.3614837841306765
MSE: 0.26113749560079347
MAPE: 104227603047.58994


In [22]:
print('train',r2_score(y_train, dtr.predict(X_train)))
print('val',r2_score(y_val, pred))
print('test',r2_score(y_test, pred_test))

train 0.8747114181501666
val 0.8716552892951364
test 0.8682381671132654


## PCA

In [25]:
param_grid = {
    'n_components': np.arange(2,13)
}

pca = PCA(random_state=42)

grid_search = GridSearchCV(pca,
                           param_grid,
                           cv=5,
                           scoring='neg_root_mean_squared_error',
                           n_jobs=-1
                          )

grid_search.fit(X_train)

In [26]:
grid_search.best_params_

{'n_components': 2}

In [28]:
pca = PCA(n_components=2)
pca.fit(X)
X_train_pca = pca.transform(X_train)

In [30]:
param_grid = {
    'max_depth': [0, 1, 2, 3, 4, 5],
    'min_samples_leaf': np.arange(1,10),
    'max_features': np.arange(2,12),
    'splitter':["best", "random"], 
    'min_samples_leaf': np.arange(2,5)
    

}

decicsion_tree_r = DecisionTreeRegressor(random_state=42)

grid_search = GridSearchCV(decicsion_tree_r,
                           param_grid,
                           cv=5,
                           scoring='neg_root_mean_squared_error',
                           n_jobs=-1
                          )

grid_search.fit(X_train_pca, y_train)

In [31]:
grid_search.best_params_

{'max_depth': 5, 'max_features': 2, 'min_samples_leaf': 2, 'splitter': 'best'}

In [29]:
X_val_pca = pca.transform(X_val)

In [32]:
dtr = DecisionTreeRegressor(max_depth=5, max_features=2, min_samples_leaf=2, splitter='best', random_state=42)
dtr.fit(X_train_pca, y_train)

In [33]:
pred_pca = dtr.predict(X_val_pca)
print("r2:", r2_score(y_val, pred_pca))
print("MAE:", mean_absolute_error(y_val, pred_pca))
print("MSE:", mean_squared_error(y_val, pred_pca))
print("MAPE:", mean_absolute_percentage_error(y_val, pred_pca))

r2: 0.5117556343828533
MAE: 0.6397309740963262
MSE: 0.9650933770270161
MAPE: 37243134665.53601


In [34]:
X_test_pca = pca.transform(X_test)
pred_test_pca = dtr.predict(X_test_pca)
print("r2:", r2_score(y_test, pred_test_pca))
print("MAE:", mean_absolute_error(y_test, pred_test_pca))
print("MSE:", mean_squared_error(y_test, pred_test_pca))
print("MAPE:", mean_absolute_percentage_error(y_test, pred_test_pca))

r2: 0.4996299310457587
MAE: 0.6387351005158157
MSE: 0.9916785750288537
MAPE: 82461020394.31128


Vemos que tras aplicar PCA, el modelo no mejora. Por lo tanto nos quedamos con el primer modelo tras el GridSearch