## Librerías y carga de datos

In [1]:
import pandas as pd
from datetime import datetime
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import ElasticNet, Ridge, Lasso, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor, VotingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.feature_selection import VarianceThreshold

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the processed training CSV; its first column becomes the DataFrame index.
# Forward slashes keep this relative path valid on Windows, Linux and macOS
# (a backslash-separated path only resolves on Windows).
train = pd.read_csv("../data/processed/train_nlp.csv", index_col=0)

## Train y val

In [3]:
# "forks" is the regression target; every other column is used as a feature.
y = train["forks"]
X = train.drop(columns="forks")

# Hold out 20% of the rows for validation, with a fixed seed so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Baselines

In [4]:
# Baseline regressors with default hyperparameters.
# Every estimator that relies on randomness receives a fixed seed so the
# cross-validation scores are repeatable from one run to the next.
modelos = {
    "LinearRegression": LinearRegression(),
    "ElasticNet": ElasticNet(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "DecisionTree": DecisionTreeRegressor(random_state=42),
    "RandomForest": RandomForestRegressor(random_state=42),
    "AdaBoost": AdaBoostRegressor(random_state=42),
    "GradientBoost": GradientBoostingRegressor(random_state=42),
    "HistGradientBoosting": HistGradientBoostingRegressor(random_state=42),
}

# Scoring metric passed to cross_val_score below.
metricas = "r2"

# Maps each model name to a one-element list holding its mean 3-fold CV score;
# the list wrapper lets pd.DataFrame(resultados_dict) build one column per model.
resultados_dict = {}

for nombre, estimador in modelos.items():
    puntuaciones = cross_val_score(estimador, X_train, y_train, scoring=metricas, cv=3)
    resultados_dict[nombre] = [puntuaciones.mean()]

In [5]:
# One row per model, with a single column (0) holding its mean cross-validated score.
pd.DataFrame.from_dict(resultados_dict, orient="index")

Unnamed: 0,0
LinearRegression,0.6131
ElasticNet,0.482801
Ridge,0.6131
Lasso,0.361249
DecisionTree,0.998778
RandomForest,0.999391
AdaBoost,0.772475
GradientBoost,0.94141
HistGradientBoosting,0.99189


In [6]:
# Group the three best-scoring baselines (decision tree, histogram gradient
# boosting, random forest) in one ensemble; each member gets the same seed.
VotingR = VotingRegressor(
    estimators=[
        ("dt", DecisionTreeRegressor(random_state=42)),
        ("hgb", HistGradientBoostingRegressor(random_state=42)),
        ("rf", RandomForestRegressor(random_state=42)),
    ]
)

In [7]:
# Fit the ensemble on the training split, then print the score of each fitted
# member on the validation split.
VotingR.fit(X_train, y_train)
for alias, ajustado in VotingR.named_estimators_.items():
    print(alias, "=", ajustado.score(X_val, y_val))

dt = 0.999024780903917
hgb = 0.9923784118061686
rf = 0.9996412167052942


Elegimos ``DecisionTreeRegressor``.

In [9]:
# Hyperparameter grid for the decision tree.
# - max_depth starts at 1: 0 is not a valid depth for scikit-learn trees, so
#   every candidate using it could only fail to fit.
# - min_samples_leaf is declared once: the grid used to list the key twice and
#   Python kept only the last value (np.arange(2, 5)), which is preserved here
#   so the effective search space is unchanged.
# NOTE(review): with max_depth capped at 5 the recorded validation R² (~0.854)
# is well below the untuned tree (~0.999); consider adding None or deeper
# values to max_depth.
param_grid = {
    'max_depth': [1, 2, 3, 4, 5],
    'max_features': np.arange(2, 12),
    'splitter': ["best", "random"],
    'min_samples_leaf': np.arange(2, 5),
}

decicsion_tree_r = DecisionTreeRegressor(random_state=42)

# Exhaustive 5-fold search over the grid, scored by negative RMSE and using
# all available cores.
grid_search = GridSearchCV(
    decicsion_tree_r,
    param_grid,
    cv=5,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)

# Best tree found by the search.
dtr = grid_search.best_estimator_

## Predicciones y métricas

In [10]:
# Evaluate the tuned tree on the validation split.
# NOTE(review): the recorded MAPE is astronomically large, presumably because
# y_val contains values at or near zero — confirm before reporting this metric.
pred = dtr.predict(X_val)
for etiqueta, metrica in (
    ("r2:", r2_score),
    ("MAE:", mean_absolute_error),
    ("MSE:", mean_squared_error),
    ("MAPE:", mean_absolute_percentage_error),
):
    print(etiqueta, metrica(y_val, pred))

r2: 0.8538542074026894
MAE: 0.36993315004731153
MSE: 0.2921412604252645
MAPE: 373723609673.2408


## Validación

In [11]:
# Print R² on the training and validation splits side by side to spot overfitting.
for particion, reales, estimados in (
    ('train', y_train, dtr.predict(X_train)),
    ('val', y_val, pred),
):
    print(particion, r2_score(reales, estimados))

train 0.8544354072432467
val 0.8538542074026894


## Tests

In [12]:
# Load the processed test CSV; its first column becomes the DataFrame index.
# Forward slashes keep this relative path valid on Windows, Linux and macOS
# (a backslash-separated path only resolves on Windows).
test = pd.read_csv("../data/processed/test_nlp.csv", index_col=0)

# "forks" is the regression target; every other column is used as a feature.
X_test = test.drop(columns="forks")
y_test = test["forks"]

In [13]:
# Evaluate the tuned tree on the held-out test set.
# NOTE(review): the recorded MAPE is astronomically large, presumably because
# y_test contains values at or near zero — confirm before reporting this metric.
pred_test = dtr.predict(X_test)
for etiqueta, metrica in (
    ("r2:", r2_score),
    ("MAE:", mean_absolute_error),
    ("MSE:", mean_squared_error),
    ("MAPE:", mean_absolute_percentage_error),
):
    print(etiqueta, metrica(y_test, pred_test))

r2: 0.8558114861007087
MAE: 0.3691250018293917
MSE: 0.28682487913214705
MAPE: 180220727985.71765


In [14]:
# Print R² for the training, validation and test sets together for a final comparison.
for particion, reales, estimados in (
    ('train', y_train, dtr.predict(X_train)),
    ('val', y_val, pred),
    ('test', y_test, pred_test),
):
    print(particion, r2_score(reales, estimados))

train 0.8544354072432467
val 0.8538542074026894
test 0.8558114861007087
