## Librerías

In [1]:
import pandas as pd
from datetime import datetime
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import ElasticNet, Ridge, Lasso, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor, VotingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error

import warnings
# Suppress every warning for the rest of the session (no category or module
# filter is given, so nothing is exempt).
# NOTE(review): this also hides warnings emitted by scikit-learn and pandas,
# which can mask real problems in later cells — consider narrowing the filter.
warnings.filterwarnings("ignore")

## Carga de datos

In [3]:
# Load the processed training set; the first CSV column is used as the index.
# Forward slashes keep this relative path valid on Windows, Linux and macOS
# (the previous backslash-separated path only resolved on Windows).
train = pd.read_csv("../data/processed/train_cluster_nlp_2.csv", index_col=0)

## Train y val

In [4]:
# Separate the target column ("forks") from the predictor columns.
y = train["forks"]
X = train.drop(columns=["forks"])

# Hold out 20% of the rows for validation; the fixed seed makes the split
# reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Baselines

In [5]:
# Baseline candidates, all with default hyperparameters. The tree-based and
# boosting estimators are seeded so that their cross-validation scores are
# reproducible from run to run.
modelos = {
    "LinearRegression": LinearRegression(),
    "ElasticNet": ElasticNet(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "DecisionTree": DecisionTreeRegressor(random_state=42),
    "RandomForest": RandomForestRegressor(random_state=42),
    "AdaBoost": AdaBoostRegressor(random_state=42),
    "GradientBoost": GradientBoostingRegressor(random_state=42),
    "HistGradientBoosting": HistGradientBoostingRegressor(random_state=42),
}

# Scoring metric used for cross-validation (passed to cross_val_score below).
metricas = "r2"

# Mean 3-fold CV score per model. Each value is a one-element list so that
# pd.DataFrame(resultados_dict).T yields one row per model.
resultados_dict = {}

for modelo, estimador in modelos.items():
    puntuaciones = cross_val_score(estimador, X_train, y_train, scoring=metricas, cv=3)
    resultados_dict[modelo] = [puntuaciones.mean()]


pd.DataFrame(resultados_dict).T

Unnamed: 0,0
LinearRegression,0.778901
ElasticNet,0.124238
Ridge,0.77887
Lasso,0.11691
DecisionTree,0.970002
RandomForest,0.982549
AdaBoost,0.902504
GradientBoost,0.965129
HistGradientBoosting,0.979772


In [6]:
# Fit a voting ensemble made of three seeded tree-based regressors.
VotingR = VotingRegressor(
    estimators=[
        ("dt", DecisionTreeRegressor(random_state=42)),
        ("hgb", HistGradientBoostingRegressor(random_state=42)),
        ("rf", RandomForestRegressor(random_state=42)),
    ]
)
VotingR.fit(X_train, y_train)

# Report each fitted member's own score on the validation split (the ensemble
# itself is not scored here).
for nombre in VotingR.named_estimators_:
    estimador = VotingR.named_estimators_[nombre]
    print(nombre, "=", estimador.score(X_val, y_val))

dt = 0.9919142253096631
hgb = 0.9893016743415407
rf = 0.9925271896255468


Elegimos ``DecisionTreeRegressor``.

In [7]:
# Hyperparameter grid for the decision tree.
# - max_depth starts at 1: scikit-learn rejects max_depth=0, so that value
#   only produced failed fits.
# - min_samples_leaf is declared once. It used to appear twice in this dict
#   literal, and Python silently kept only the last value (np.arange(2, 5)),
#   which is the range preserved here.
# - max_features: assumes X_train has at least 11 columns — TODO confirm.
# NOTE(review): the grid caps depth at 5; consider adding None to max_depth so
# the unconstrained tree is also part of the search.
param_grid = {
    'max_depth': [1, 2, 3, 4, 5],
    'min_samples_leaf': np.arange(2, 5),
    'max_features': np.arange(2, 12),
    'splitter': ["best", "random"],
}

decicsion_tree_r = DecisionTreeRegressor(random_state=42)

# Exhaustive 5-fold search; candidates are ranked by negative RMSE (higher is
# better) and fits run in parallel on all available cores.
grid_search = GridSearchCV(decicsion_tree_r,
                           param_grid,
                           cv=5,
                           scoring='neg_root_mean_squared_error',
                           n_jobs=-1
                          )

grid_search.fit(X_train, y_train)

# Best parameter combination found by the search, as a fitted estimator.
dtr = grid_search.best_estimator_

## Predicciones y métricas

In [8]:
# Predict on the validation split and print each regression metric on its own
# line, in the order: R², MAE, MSE, MAPE.
pred = dtr.predict(X_val)
for etiqueta, metrica in (
    ("r2:", r2_score),
    ("MAE:", mean_absolute_error),
    ("MSE:", mean_squared_error),
    ("MAPE:", mean_absolute_percentage_error),
):
    print(etiqueta, metrica(y_val, pred))

r2: 0.9465492035673039
MAE: 0.03132544014263084
MSE: 0.004143215611271657
MAPE: 1.5683223726895559


## Validación

In [9]:
# Compare R² on the training and validation splits to check for overfitting.
for particion, y_real, y_estimado in (
    ('train', y_train, dtr.predict(X_train)),
    ('val', y_val, pred),
):
    print(particion, r2_score(y_real, y_estimado))

train 0.9429558976961069
val 0.9465492035673039


## Test

In [10]:
# Load the processed test set; the first CSV column is used as the index.
# Forward slashes keep this relative path valid on Windows, Linux and macOS
# (the previous backslash-separated path only resolved on Windows).
test = pd.read_csv("../data/processed/test_cluster_nlp_2.csv", index_col=0)

# Separate the target column ("forks") from the predictor columns.
X_test = test.drop(columns="forks")
y_test = test["forks"]

In [11]:
# Predict on the test set and print each regression metric on its own line,
# in the order: R², MAE, MSE, MAPE.
pred_test = dtr.predict(X_test)
for etiqueta, metrica in (
    ("r2:", r2_score),
    ("MAE:", mean_absolute_error),
    ("MSE:", mean_squared_error),
    ("MAPE:", mean_absolute_percentage_error),
):
    print(etiqueta, metrica(y_test, pred_test))

r2: 0.9365318535198617
MAE: 0.03511944024784474
MSE: 0.004583829371476292
MAPE: 0.12207460789835571


In [12]:
# R² of the tuned tree on the train, validation and test splits, one per line.
for particion, y_real, y_estimado in (
    ('train', y_train, dtr.predict(X_train)),
    ('val', y_val, pred),
    ('test', y_test, pred_test),
):
    print(particion, r2_score(y_real, y_estimado))

train 0.9429558976961069
val 0.9465492035673039
test 0.9365318535198617
