In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split,  KFold, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn import set_config
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

from statsmodels.genmod.generalized_linear_model import GLM
import statsmodels.api as sm
from scipy import stats

import warnings

# Notebook-wide settings: silence all warnings, show scikit-learn estimators as
# plain text, print pandas floats with 3 digits of precision, and apply
# seaborn's default plot styling.
warnings.filterwarnings(action="ignore")
set_config(display="text")

pd.set_option("display.precision", 3)
sns.set()

In [13]:
# Load the preprocessed train and test splits from CSV.
# Both files are comma-separated and carry the column names in their first row.
csv_options = {"header": 0, "sep": ","}
df_train = pd.read_csv("../processed_train.csv", **csv_options)
df_test = pd.read_csv("../processed_test.csv", **csv_options)

## Escalado

In [14]:
# Min-max scale the feature columns.
# The scaler is fitted on the training split only and then applied unchanged to
# the test split, so the transform never sees test-set statistics.
scaler = MinMaxScaler()

# Separate the features from the target column ('close_price').
X_train = df_train.drop(columns=['close_price'])
y_train = df_train['close_price']

X_test = df_test.drop(columns=['close_price'])
# Keep the held-out target as well so a final evaluation on the test split is possible.
y_test = df_test['close_price']

# fit_transform / transform return bare arrays; rebuild DataFrames with the
# original column names AND the original row index, so the scaled features stay
# label-aligned with y_train / y_test even if the index is not a default RangeIndex.
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

## Regresión lineal

In [15]:
# Baseline: ordinary least-squares regression trained on the scaled training features.
model = LinearRegression().fit(X_train_scaled, y_train)
model  # last expression, so the fitted estimator's repr is still shown as the cell output

LinearRegression()

In [16]:
# Score the linear baseline on the same data it was fitted on (in-sample metrics only).
train_predictions = model.predict(X_train_scaled)

mae = mean_absolute_error(y_train, train_predictions)
mse = mean_squared_error(y_train, train_predictions)
# Coefficient of determination; same value model.score(X_train_scaled, y_train)
# reports, computed from the predictions already available above.
r2 = r2_score(y_train, train_predictions)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R^2: {r2}")

# NOTE(review): predicting on X_test_scaled was left disabled in this cell, so
# no held-out metrics are produced here.

Mean Squared Error (MSE): 83691295157.84373
Mean Absolute Error (MAE): 135235.61148112785
R^2: 0.44003943770455556


## Random Forest

In [24]:
# Tune a RandomForestRegressor with an exhaustive 5-fold cross-validated grid
# search, then report metrics for the winning configuration.
rf_model = RandomForestRegressor(random_state=42)

param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# NOTE(review): 3 * 4 * 3 * 3 = 108 candidates x 5 folds = 540 forest fits. The
# recorded run of this cell ended in a KeyboardInterrupt; consider a smaller
# grid or a randomized search if the full sweep is too slow.
grid_search_rf = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid_rf,
    cv=5,
    scoring='neg_mean_squared_error',
    refit=True,  # explicit: the best configuration is refitted on the whole training set
    n_jobs=-1,
    verbose=1,
)

grid_search_rf.fit(X_train_scaled, y_train)

best_params_rf = grid_search_rf.best_params_
print(f"Mejores hiperparámetros para Random Forest: {best_params_rf}")

# best_score_ is the mean *negated* MSE over the validation folds; flip the sign.
# This is the out-of-sample estimate the search actually optimised.
cv_mse_rf_best = -grid_search_rf.best_score_
print(f"Random Forest (mejor modelo) - CV MSE: {cv_mse_rf_best}")

# Reuse the estimator the search already refitted on the full training set
# (same hyper-parameters and random_state=42) instead of training an identical
# forest a second time.
best_rf_model = grid_search_rf.best_estimator_

# In-sample predictions and metrics; these are optimistic for a random forest,
# so read them next to the CV MSE printed above.
train_predictions_rf_best = best_rf_model.predict(X_train_scaled)

mse_train_rf_best = mean_squared_error(y_train, train_predictions_rf_best)
mae_train_rf_best = mean_absolute_error(y_train, train_predictions_rf_best)
r2_train_rf_best = r2_score(y_train, train_predictions_rf_best)

print(f"Random Forest (mejor modelo) - Training MSE: {mse_train_rf_best}")
print(f"Random Forest (mejor modelo) - Training MAE: {mae_train_rf_best}")
print(f"Random Forest (mejor modelo) - Training R^2: {r2_train_rf_best}")

Fitting 5 folds for each of 108 candidates, totalling 540 fits


KeyboardInterrupt: 

## Gradient Boosting

In [18]:
from sklearn.ensemble import GradientBoostingRegressor

# Build a 100-estimator gradient-boosting regressor with a fixed seed and fit it
# on the scaled training features.
gb_model = GradientBoostingRegressor(random_state=42, n_estimators=100).fit(X_train_scaled, y_train)

# In-sample predictions: computed on the training split, not on the test split.
train_predictions_gb = gb_model.predict(X_train_scaled)

# Training-set error metrics.
r2_train_gb = r2_score(y_train, train_predictions_gb)
mae_train_gb = mean_absolute_error(y_train, train_predictions_gb)
mse_train_gb = mean_squared_error(y_train, train_predictions_gb)

print(f"Gradient Boosting - Training MSE: {mse_train_gb}")
print(f"Gradient Boosting - Training MAE: {mae_train_gb}")
print(f"Gradient Boosting - Training R^2: {r2_train_gb}")

Gradient Boosting - Training MSE: 56903182163.84643
Gradient Boosting - Training MAE: 97424.60426022355
Gradient Boosting - Training R^2: 0.6192729743186293


## SVM Regressor

## XGBoost

In [22]:
# GridSearchCV is already imported in the notebook's first cell; only xgboost is new here.
import xgboost as xgb

# Tune an XGBRegressor with an exhaustive 5-fold cross-validated grid search
# (4 * 4 * 4 = 64 candidates), then report metrics for the winning configuration.
xgb_model = xgb.XGBRegressor(random_state=42)

param_grid = {
    'n_estimators': [100, 300, 400, 500],
    'learning_rate': [0.01, 0.1, 0.3, 0.5],
    'max_depth': [4, 6, 8, 10]
}

grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    refit=True,  # explicit: the best configuration is refitted on the whole training set
    verbose=1,
    n_jobs=-1,
)

grid_search.fit(X_train_scaled, y_train)

best_params = grid_search.best_params_
print(f"Mejores hiperparámetros: {best_params}")

# best_score_ is the mean *negated* MSE over the validation folds; flip the sign.
# This is the out-of-sample estimate the search actually optimised.
cv_mse_xgb = -grid_search.best_score_
print(f"XGBoost - CV MSE: {cv_mse_xgb}")

# Reuse the estimator the search already refitted on the full training set
# (same hyper-parameters and random_state=42) instead of fitting an identical
# model a second time.
best_xgb_model = grid_search.best_estimator_

# In-sample predictions and metrics; compare with the CV MSE above to gauge overfitting.
train_predictions_xgb = best_xgb_model.predict(X_train_scaled)

mse_train_xgb = mean_squared_error(y_train, train_predictions_xgb)
mae_train_xgb = mean_absolute_error(y_train, train_predictions_xgb)
r2_train_xgb = r2_score(y_train, train_predictions_xgb)

print(f"XGBoost - Training MSE: {mse_train_xgb}")
print(f"XGBoost - Training MAE: {mae_train_xgb}")
print(f"XGBoost - Training R^2: {r2_train_xgb}")

Fitting 5 folds for each of 64 candidates, totalling 320 fits
Mejores hiperparámetros: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 300}
XGBoost - Training MSE: 21223137755.82739
XGBoost - Training MAE: 70925.30443725937
XGBoost - Training R^2: 0.8580005228857667
