Tras haber probado el PCA con el modelo de árbol de decisión no ha mejorado nuestro modelo, pero no quiere decir que no sea un preprocesado útil con otros modelos. Por lo tanto vamos a rotar los datos con PCA y realizar un BaseLine para elegir un modelo con el que utilizar estos datos rotado para ver si mejora la predicción.

## Libreria y carga de datos

In [5]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import ElasticNet, Ridge, Lasso, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor, VotingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.decomposition import PCA


import warnings
warnings.filterwarnings("ignore")

In [6]:
df = pd.read_csv("..\\data\\processed\\train.csv", index_col=0)

## Train y test

In [7]:
X = df.drop(columns="forks")
y = df["forks"]

X_train, X_val, y_train, y_val= train_test_split(X, y, test_size=0.2, random_state=42)

## PCA

In [8]:
pca = PCA(n_components=2)
pca.fit(X)
X_train_pca = pca.transform(X_train)

In [9]:
X_val_pca = pca.transform(X_val)

# Baselines

In [10]:
modelos = {
    "LinearRegression": LinearRegression(),
    "ElasticNet": ElasticNet(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "DecisionTree": DecisionTreeRegressor(),
    "RandomForest": RandomForestRegressor(),
    "AdaBoost": AdaBoostRegressor(),
    "GradientBoost": GradientBoostingRegressor(),
    "HistGradientBoosting": HistGradientBoostingRegressor()
}

# Define las métricas a usar
metricas = "r2"

resultados_dict = {}

for modelo in modelos:

    resultados_dict[modelo] = [cross_val_score(modelos[modelo], X_train_pca, y_train, scoring="r2", cv=3).mean()]


pd.DataFrame(resultados_dict).T

Unnamed: 0,0
LinearRegression,0.017665
ElasticNet,0.017072
Ridge,0.017665
Lasso,0.015422
DecisionTree,0.755519
RandomForest,0.852143
AdaBoost,0.330187
GradientBoost,0.620903
HistGradientBoosting,0.736994


In [11]:
VotingR = VotingRegressor(estimators=
                          [
                              ("dt", DecisionTreeRegressor(random_state=42)),
                              ("hgb", HistGradientBoostingRegressor(random_state=42)),
                              ("rf", RandomForestRegressor(random_state=42))
                          ])

In [12]:
VotingR.fit(X_train_pca, y_train)

In [13]:
for name, clf in VotingR.named_estimators_.items():
    print(name, "=", clf.score(X_val_pca, y_val))

dt = 0.7736443324738018
hgb = 0.7512681754703108
rf = 0.8612100075828916


Tras el BaseLine elegimos el ``RandomForest``. 

## Probando el modelo

In [14]:
rfr = RandomForestRegressor(random_state=42)
rfr.fit(X_train_pca, y_train)

In [15]:
pred = rfr.predict(X_val_pca)
print("r2:", r2_score(y_val, pred))
print("MAE:", mean_absolute_error(y_val, pred))
print("MSE:", mean_squared_error(y_val, pred))
print("MAPE:", mean_absolute_percentage_error(y_val, pred))

r2: 0.8612100075828916
MAE: 0.2312092558664801
MSE: 0.274340702959414
MAPE: 39443365179.50173


In [16]:
print('train',r2_score(y_train, rfr.predict(X_train_pca)))
print('val',r2_score(y_val, pred))

train 0.9810674417170588
val 0.8612100075828916


## Optimización

In [23]:
param_grid = {
    'max_depth': [0, 1, 2, 3, 4, 5],
    'min_samples_leaf': np.arange(1,10),
    'max_features': np.arange(2,12),
    'min_samples_leaf': np.arange(2,5),
}

random_forest_r = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(random_forest_r,
                           param_grid,
                           cv=5,
                           scoring='neg_root_mean_squared_error',
                           n_jobs=-1
                          )

grid_search.fit(X_train, y_train)

In [24]:
grid_search.best_params_

{'max_depth': 5, 'max_features': 8, 'min_samples_leaf': 4}

## Entrenando el modelo optimizado

In [26]:
rfr = RandomForestRegressor(random_state=42 , max_depth=5, max_features=8, min_samples_leaf=4)
rfr.fit(X_train_pca, y_train)

## Predicción y métricas

In [27]:
pred = rfr.predict(X_val_pca)
print("r2:", r2_score(y_val, pred))
print("MAE:", mean_absolute_error(y_val, pred))
print("MSE:", mean_squared_error(y_val, pred))
print("MAPE:", mean_absolute_percentage_error(y_val, pred))

r2: 0.5370067813376257
MAE: 0.6321246220217147
MSE: 0.9151804309603819
MAPE: 37055575755.10445


## Validación

In [28]:
print('train',r2_score(y_train, rfr.predict(X_train_pca)))
print('val',r2_score(y_val, pred))

train 0.5373190664676952
val 0.5370067813376257


## Prueba con test set

In [29]:
test = pd.read_csv("..\\data\\processed\\test.csv", index_col=0)

In [30]:
X_test = test.drop(columns="forks")
y_test = test["forks"]

In [31]:
X_test_pca = pca.transform(X_test)

In [32]:
pred_test = rfr.predict(X_test_pca)
print("r2:", r2_score(y_test, pred_test))
print("MAE:", mean_absolute_error(y_test, pred_test))
print("MSE:", mean_squared_error(y_test, pred_test))
print("MAPE:", mean_absolute_percentage_error(y_test, pred_test))

r2: 0.5208276433611492
MAE: 0.633115899314555
MSE: 0.9496670350765701
MAPE: 92881888568.78589
