En este notebook vamos a crear el modelo eliminando la variable *date*, ya que no tenía mucha correlación a ver si mejora o no la precisión

## Librerias y carga de datos

In [25]:
import pandas as pd
from datetime import datetime
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import ElasticNet, Ridge, Lasso, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor, VotingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings("ignore")

In [5]:
train = pd.read_csv("..\\data\\processed\\train_nodate.csv", index_col=0)

## X e Y

In [8]:
X = train.drop(columns="forks")
y = train["forks"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

## BaseLines

In [9]:
modelos = {
    "LinearRegression": LinearRegression(),
    "ElasticNet": ElasticNet(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "DecisionTree": DecisionTreeRegressor(),
    "RandomForest": RandomForestRegressor(),
    "AdaBoost": AdaBoostRegressor(),
    "GradientBoost": GradientBoostingRegressor(),
    "HistGradientBoosting": HistGradientBoostingRegressor()
}

# Define las métricas a usar
metricas = "r2"

resultados_dict = {}

for modelo in modelos:

    resultados_dict[modelo] = [cross_val_score(modelos[modelo], X_train, y_train, scoring="r2", cv=3).mean()]

pd.DataFrame(resultados_dict).T

Unnamed: 0,0
LinearRegression,0.612144
ElasticNet,0.484654
Ridge,0.612144
Lasso,0.363991
DecisionTree,0.99871
RandomForest,0.999371
AdaBoost,0.768876
GradientBoost,0.941669
HistGradientBoosting,0.992149


In [10]:
VotingR = VotingRegressor(estimators=
                          [
                              ("dt", DecisionTreeRegressor(random_state=42)),
                              ("hgb", HistGradientBoostingRegressor(random_state=42))
                          ])

In [11]:
VotingR.fit(X_train, y_train)

In [12]:
for name, clf in VotingR.named_estimators_.items():
    print(name, "=", clf.score(X_val, y_val))

dt = 0.9986991155728753
hgb = 0.9918523171598714


Elegimos ``DecisionTreeRegressor``.

In [15]:
param_grid = {
    'max_depth': [0, 1, 2, 3, 4, 5],
    'min_samples_leaf': np.arange(1,10),
    'max_features': np.arange(2,12),
    'splitter':["best", "random"], 
    'min_samples_leaf': np.arange(2,5)
    

}

decicsion_tree_r = DecisionTreeRegressor(random_state=42)

grid_search = GridSearchCV(decicsion_tree_r,
                           param_grid,
                           cv=5,
                           scoring='neg_root_mean_squared_error',
                           n_jobs=-1
                          )

grid_search.fit(X_train, y_train)

In [16]:
grid_search.best_params_

{'max_depth': 5, 'max_features': 7, 'min_samples_leaf': 2, 'splitter': 'best'}

## Entrenamiento

In [17]:
dtr = DecisionTreeRegressor(max_depth=5, max_features=7, min_samples_leaf=2, splitter='best', random_state=42)
dtr.fit(X_train, y_train)

## Predicciones y métricas

In [18]:
pred = dtr.predict(X_val)
print("r2:", r2_score(y_val, pred))
print("MAE:", mean_absolute_error(y_val, pred))
print("MSE:", mean_squared_error(y_val, pred))
print("MAPE:", mean_absolute_percentage_error(y_val, pred))

r2: 0.8627739600765343
MAE: 0.3662812569247608
MSE: 0.2712492997607479
MAPE: 65351495095.39893


## Validación

In [19]:
print('train',r2_score(y_train, dtr.predict(X_train)))
print('val',r2_score(y_val, pred))

train 0.8700157052501052
val 0.8627739600765343


## Test

In [22]:
test = pd.read_csv("..\\data\\processed\\test_nodate.csv", index_col=0)
X_test = test.drop(columns="forks")
y_test = test["forks"]

In [23]:
pred_test = dtr.predict(X_test)
print("r2:", r2_score(y_test, pred_test))
print("MAE:", mean_absolute_error(y_test, pred_test))
print("MSE:", mean_squared_error(y_test, pred_test))
print("MAPE:", mean_absolute_percentage_error(y_test, pred_test))

r2: 0.8631349643991343
MAE: 0.3658595082676021
MSE: 0.27125148344624905
MAPE: 127554200078.19661


In [24]:
print('train',r2_score(y_train, dtr.predict(X_train)))
print('val',r2_score(y_val, pred))
print('test',r2_score(y_test, pred_test))

train 0.8700157052501052
val 0.8627739600765343
test 0.8631349643991343


## PCA

In [26]:
param_grid = {
    'n_components': np.arange(2,13)
}

pca = PCA(random_state=42)

grid_search = GridSearchCV(pca,
                           param_grid,
                           cv=5,
                           scoring='neg_root_mean_squared_error',
                           n_jobs=-1
                          )

grid_search.fit(X_train)

In [27]:
grid_search.best_params_

{'n_components': 2}

In [28]:
pca = PCA(n_components=2)
pca.fit(X)
X_train_pca = pca.transform(X_train)

In [31]:
X_val_pca = pca.transform(X_val)

In [29]:
param_grid = {
    'max_depth': [0, 1, 2, 3, 4, 5],
    'min_samples_leaf': np.arange(1,10),
    'max_features': np.arange(2,12),
    'splitter':["best", "random"], 
    'min_samples_leaf': np.arange(2,5)
    

}

decicsion_tree_r = DecisionTreeRegressor(random_state=42)

grid_search = GridSearchCV(decicsion_tree_r,
                           param_grid,
                           cv=5,
                           scoring='neg_root_mean_squared_error',
                           n_jobs=-1
                          )

grid_search.fit(X_train_pca, y_train)

In [30]:
grid_search.best_params_

{'max_depth': 5, 'max_features': 2, 'min_samples_leaf': 2, 'splitter': 'best'}

In [32]:
dtr = DecisionTreeRegressor(max_depth=5, max_features=2, min_samples_leaf=2, splitter='best', random_state=42)
dtr.fit(X_train_pca, y_train)

In [33]:
pred_pca = dtr.predict(X_val_pca)
print("r2:", r2_score(y_val, pred_pca))
print("MAE:", mean_absolute_error(y_val, pred_pca))
print("MSE:", mean_squared_error(y_val, pred_pca))
print("MAPE:", mean_absolute_percentage_error(y_val, pred_pca))

r2: 0.5076670709299644
MAE: 0.6511118817770526
MSE: 0.9731750791168091
MAPE: 40442960195.602806


In [34]:
X_test_pca = pca.transform(X_test)
pred_test_pca = dtr.predict(X_test_pca)
print("r2:", r2_score(y_test, pred_test_pca))
print("MAE:", mean_absolute_error(y_test, pred_test_pca))
print("MSE:", mean_squared_error(y_test, pred_test_pca))
print("MAPE:", mean_absolute_percentage_error(y_test, pred_test_pca))

r2: 0.4869673361531245
MAE: 0.6548469558046801
MSE: 1.0167744487399637
MAPE: 103257936059.29424
