In [2]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import RandomizedSearchCV, cross_validate
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA

In [3]:
# Cargar el DataFrame desde el archivo
with open('../Pickles/df_final.pickle', 'rb') as archivo:
    df = pickle.load(archivo)

In [4]:
nombres=df.drop(['CompTotal'], axis=1).columns

In [5]:
df.isnull().values.any()

False

In [6]:
limite_inferior = 17599.36
limite_superior = 120000

df_filtrado = df[(df['CompTotal'] >= limite_inferior) & (df['CompTotal'] <= limite_superior)]

print(f"Tamaño del DataFrame original: {df.shape}")
print(f"Tamaño del DataFrame filtrado: {df_filtrado.shape}")

Tamaño del DataFrame original: (1934, 397)
Tamaño del DataFrame filtrado: (1802, 397)


In [7]:
df = df_filtrado

In [8]:
df['CompTotal'] = np.log1p(df['CompTotal'])

In [9]:
y = df['CompTotal']

In [10]:
df = df.drop(columns=['CompTotal'], axis=1)

In [11]:
columnas = df.columns

In [None]:
# Crear características polinómicas de segundo grado
poly = PolynomialFeatures(degree=2, include_bias=False)  # include_bias=False excluye la columna de unos
X_poly = poly.fit_transform(df)

# Ver las nuevas características
print("Nombres de las características:", poly.get_feature_names_out(input_features=columnas))
print(pd.DataFrame(X_poly, columns=poly.get_feature_names_out()))

# --- 1. Escalar los datos antes de PCA ---
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_poly)

# --- 2. Aplicar PCA para reducción de dimensionalidad ---
pca = PCA(n_components=0.95)  # Mantener el 95% de la varianza
X_pca = pca.fit_transform(X_scaled)

# Mostrar la varianza explicada acumulada
print("Varianza explicada acumulada con PCA:", sum(pca.explained_variance_ratio_))

# --- 3. Dividir los datos en Entrenamiento, Validación y Prueba ---
X_temp, X_test, y_temp, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

Nombres de las características: ['MainBranch' 'EdLevel' 'YearsCode' ...
 'AIToolCurrently Using_Testing code^2'
 'AIToolCurrently Using_Testing code AIToolCurrently Using_Writing code'
 'AIToolCurrently Using_Writing code^2']
      MainBranch  EdLevel  YearsCode  YearsCodePro  PurchaseInfluence  \
0            5.0      5.0        0.0          10.0                0.0   
1            5.0      3.0       15.0          15.0                0.0   
2            5.0      4.0        5.0           3.0                0.0   
3            5.0      4.0        4.0           2.0                0.0   
4            5.0      6.0       15.0          10.0                0.0   
...          ...      ...        ...           ...                ...   
1797         5.0      6.0        2.0           1.0                0.0   
1798         5.0      2.0       10.0           6.0                0.0   
1799         5.0     -1.0        4.0           3.0                1.0   
1800         5.0      4.0        9.0        

In [37]:
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
print(y_val.shape)

(1080, 1142)
(361, 1142)
(361, 1142)
(1080,)
(361,)
(361,)


In [38]:
# Crear un modelo de Random Forest
rf_model = RandomForestRegressor(random_state=42)
# Entrenar el modelo
rf_model.fit(X_train, y_train)

# --- 3. Hacer Predicciones en los Conjuntos de Entrenamiento y Validación ---
# Entrenamiento
y_train_pred_log = rf_model.predict(X_train)
y_train_pred = np.expm1(y_train_pred_log)
y_train_original = np.expm1(y_train)

# Validación
y_val_pred_log = rf_model.predict(X_val)
y_val_pred = np.expm1(y_val_pred_log)
y_val_original = np.expm1(y_val)
# --- 4. Calcular Métricas en la Escala Original ---
# Entrenamiento
mse_train = mean_squared_error(y_train_original, y_train_pred)
rmse_train = np.sqrt(mse_train)
mae_train = mean_absolute_error(y_train_original, y_train_pred)
r2_train = r2_score(y_train_original, y_train_pred)
mape_train = np.mean(np.abs((y_train_original - y_train_pred) / y_train_original)) * 100

# Validación
mse_val = mean_squared_error(y_val_original, y_val_pred)
rmse_val = np.sqrt(mse_val)
mae_val = mean_absolute_error(y_val_original, y_val_pred)
r2_val = r2_score(y_val_original, y_val_pred)
mape_val = np.mean(np.abs((y_val_original - y_val_pred) / y_val_original)) * 100

# --- 5. Imprimir Resultados ---
print("--- TRAIN ---")
print(f"MSE: {mse_train:.3f}")
print(f"RMSE: {rmse_train:.2f}")
print(f"MAE: {mae_train:.3f}")
print(f"R2: {r2_train:.3f}")
print(f"MAPE: {mape_train:.2f}")

print("\n--- VALIDATION ---")
print(f"MSE: {mse_val:.3f}")
print(f"RMSE: {rmse_val:.2f}")
print(f"MAE: {mae_val:.3f}")
print(f"R2: {r2_val:.3f}")
print(f"MAPE: {mape_val:.2f}")

--- TRAIN ---
MSE: 76439157.949
RMSE: 8742.95
MAE: 6093.374
R2: 0.849
MAPE: 11.76

--- VALIDATION ---
MSE: 488893357.902
RMSE: 22110.93
MAE: 16360.758
R2: 0.175
MAPE: 33.12


In [39]:
# Obtener las importancias de características
importances = rf_model.feature_importances_
indices = np.argsort(importances)[::-1]

# Seleccionar las 300 características más importantes
top_features = indices[:300]
X_train_reduced = X_train[:, top_features]
X_val_reduced = X_val[:, top_features]
X_test_reduced = X_test[:, top_features]

In [40]:
# Entrenar el modelo
rf_model.fit(X_train_reduced, y_train)

# --- 3. Hacer Predicciones en los Conjuntos de Entrenamiento y Validación ---
# Entrenamiento
y_train_pred_log = rf_model.predict(X_train_reduced)
y_train_pred = np.expm1(y_train_pred_log)
y_train_original = np.expm1(y_train)

# Validación
y_val_pred_log = rf_model.predict(X_val_reduced)
y_val_pred = np.expm1(y_val_pred_log)
y_val_original = np.expm1(y_val)
# --- 4. Calcular Métricas en la Escala Original ---
# Entrenamiento
mse_train = mean_squared_error(y_train_original, y_train_pred)
rmse_train = np.sqrt(mse_train)
mae_train = mean_absolute_error(y_train_original, y_train_pred)
r2_train = r2_score(y_train_original, y_train_pred)
mape_train = np.mean(np.abs((y_train_original - y_train_pred) / y_train_original)) * 100

# Validación
mse_val = mean_squared_error(y_val_original, y_val_pred)
rmse_val = np.sqrt(mse_val)
mae_val = mean_absolute_error(y_val_original, y_val_pred)
r2_val = r2_score(y_val_original, y_val_pred)
mape_val = np.mean(np.abs((y_val_original - y_val_pred) / y_val_original)) * 100

# --- 5. Imprimir Resultados ---
print("--- TRAIN ---")
print(f"MSE: {mse_train:.3f}")
print(f"RMSE: {rmse_train:.2f}")
print(f"MAE: {mae_train:.3f}")
print(f"R2: {r2_train:.3f}")
print(f"MAPE: {mape_train:.2f}")

print("\n--- VALIDATION ---")
print(f"MSE: {mse_val:.3f}")
print(f"RMSE: {rmse_val:.2f}")
print(f"MAE: {mae_val:.3f}")
print(f"R2: {r2_val:.3f}")
print(f"MAPE: {mape_val:.2f}")

--- TRAIN ---
MSE: 72645306.841
RMSE: 8523.22
MAE: 5898.008
R2: 0.857
MAPE: 11.40

--- VALIDATION ---
MSE: 480111601.859
RMSE: 21911.45
MAE: 16278.656
R2: 0.189
MAPE: 32.82


In [41]:
# Obtener las importancias de características
importances = rf_model.feature_importances_
indices = np.argsort(importances)[::-1]

# Seleccionar las 300 características más importantes
top_features = indices[:500]
X_train_reduced = X_train[:, top_features]
X_val_reduced = X_val[:, top_features]
X_test_reduced = X_test[:, top_features]

In [42]:
# Entrenar el modelo
rf_model.fit(X_train_reduced, y_train)

# --- 3. Hacer Predicciones en los Conjuntos de Entrenamiento y Validación ---
# Entrenamiento
y_train_pred_log = rf_model.predict(X_train_reduced)
y_train_pred = np.expm1(y_train_pred_log)
y_train_original = np.expm1(y_train)

# Validación
y_val_pred_log = rf_model.predict(X_val_reduced)
y_val_pred = np.expm1(y_val_pred_log)
y_val_original = np.expm1(y_val)
# --- 4. Calcular Métricas en la Escala Original ---
# Entrenamiento
mse_train = mean_squared_error(y_train_original, y_train_pred)
rmse_train = np.sqrt(mse_train)
mae_train = mean_absolute_error(y_train_original, y_train_pred)
r2_train = r2_score(y_train_original, y_train_pred)
mape_train = np.mean(np.abs((y_train_original - y_train_pred) / y_train_original)) * 100

# Validación
mse_val = mean_squared_error(y_val_original, y_val_pred)
rmse_val = np.sqrt(mse_val)
mae_val = mean_absolute_error(y_val_original, y_val_pred)
r2_val = r2_score(y_val_original, y_val_pred)
mape_val = np.mean(np.abs((y_val_original - y_val_pred) / y_val_original)) * 100

# --- 5. Imprimir Resultados ---
print("--- TRAIN ---")
print(f"MSE: {mse_train:.3f}")
print(f"RMSE: {rmse_train:.2f}")
print(f"MAE: {mae_train:.3f}")
print(f"R2: {r2_train:.3f}")
print(f"MAPE: {mape_train:.2f}")

print("\n--- VALIDATION ---")
print(f"MSE: {mse_val:.3f}")
print(f"RMSE: {rmse_val:.2f}")
print(f"MAE: {mae_val:.3f}")
print(f"R2: {r2_val:.3f}")
print(f"MAPE: {mape_val:.2f}")

--- TRAIN ---
MSE: 73947216.020
RMSE: 8599.26
MAE: 5932.412
R2: 0.854
MAPE: 11.44

--- VALIDATION ---
MSE: 458143553.489
RMSE: 21404.29
MAE: 15729.818
R2: 0.226
MAPE: 31.48


In [50]:
xgb_reg = XGBRegressor(objective='reg:squarederror', random_state=42, 
                       colsample_bytree= 1.0, learning_rate=0.01, max_depth= 5,
                       n_estimators= 300, reg_alpha= 0, reg_lambda= 1, subsample= 0.8)
# Entrenar el modelo
xgb_reg.fit(X_train, y_train)

# --- 3. Hacer Predicciones en los Conjuntos de Entrenamiento y Validación ---
# Entrenamiento
y_train_pred_log = xgb_reg.predict(X_train)
y_train_pred = np.expm1(y_train_pred_log)
y_train_original = np.expm1(y_train)

# Validación
y_val_pred_log = xgb_reg.predict(X_val)
y_val_pred = np.expm1(y_val_pred_log)
y_val_original = np.expm1(y_val)
# --- 4. Calcular Métricas en la Escala Original ---
# Entrenamiento
mse_train = mean_squared_error(y_train_original, y_train_pred)
rmse_train = np.sqrt(mse_train)
mae_train = mean_absolute_error(y_train_original, y_train_pred)
r2_train = r2_score(y_train_original, y_train_pred)
mape_train = np.mean(np.abs((y_train_original - y_train_pred) / y_train_original)) * 100

# Validación
mse_val = mean_squared_error(y_val_original, y_val_pred)
rmse_val = np.sqrt(mse_val)
mae_val = mean_absolute_error(y_val_original, y_val_pred)
r2_val = r2_score(y_val_original, y_val_pred)
mape_val = np.mean(np.abs((y_val_original - y_val_pred) / y_val_original)) * 100

# --- 5. Imprimir Resultados ---
print("--- TRAIN ---")
print(f"MSE: {mse_train:.3f}")
print(f"RMSE: {rmse_train:.2f}")
print(f"MAE: {mae_train:.3f}")
print(f"R2: {r2_train:.3f}")
print(f"MAPE: {mape_train:.2f}")

print("\n--- VALIDATION ---")
print(f"MSE: {mse_val:.3f}")
print(f"RMSE: {rmse_val:.2f}")
print(f"MAE: {mae_val:.3f}")
print(f"R2: {r2_val:.3f}")
print(f"MAPE: {mape_val:.2f}")

--- TRAIN ---
MSE: 102446457.578
RMSE: 10121.58
MAE: 6927.105
R2: 0.798
MAPE: 13.53

--- VALIDATION ---
MSE: 467303235.710
RMSE: 21617.20
MAE: 16054.512
R2: 0.211
MAPE: 32.18


In [53]:
# Obtener las importancias de características
importances = xgb_reg.feature_importances_
indices = np.argsort(importances)[::-1]

# Seleccionar las 300 características más importantes
top_features = indices[:300]
X_train_reduced = X_train[:, top_features]
X_val_reduced = X_val[:, top_features]
X_test_reduced = X_test[:, top_features]

In [54]:
# Entrenar el modelo
xgb_reg.fit(X_train_reduced, y_train)

# --- 3. Hacer Predicciones en los Conjuntos de Entrenamiento y Validación ---
# Entrenamiento
y_train_pred_log = xgb_reg.predict(X_train_reduced)
y_train_pred = np.expm1(y_train_pred_log)
y_train_original = np.expm1(y_train)

# Validación
y_val_pred_log = xgb_reg.predict(X_val_reduced)
y_val_pred = np.expm1(y_val_pred_log)
y_val_original = np.expm1(y_val)
# --- 4. Calcular Métricas en la Escala Original ---
# Entrenamiento
mse_train = mean_squared_error(y_train_original, y_train_pred)
rmse_train = np.sqrt(mse_train)
mae_train = mean_absolute_error(y_train_original, y_train_pred)
r2_train = r2_score(y_train_original, y_train_pred)
mape_train = np.mean(np.abs((y_train_original - y_train_pred) / y_train_original)) * 100

# Validación
mse_val = mean_squared_error(y_val_original, y_val_pred)
rmse_val = np.sqrt(mse_val)
mae_val = mean_absolute_error(y_val_original, y_val_pred)
r2_val = r2_score(y_val_original, y_val_pred)
mape_val = np.mean(np.abs((y_val_original - y_val_pred) / y_val_original)) * 100

# --- 5. Imprimir Resultados ---
print("--- TRAIN ---")
print(f"MSE: {mse_train:.3f}")
print(f"RMSE: {rmse_train:.2f}")
print(f"MAE: {mae_train:.3f}")
print(f"R2: {r2_train:.3f}")
print(f"MAPE: {mape_train:.2f}")

print("\n--- VALIDATION ---")
print(f"MSE: {mse_val:.3f}")
print(f"RMSE: {rmse_val:.2f}")
print(f"MAE: {mae_val:.3f}")
print(f"R2: {r2_val:.3f}")
print(f"MAPE: {mape_val:.2f}")

--- TRAIN ---
MSE: 125165564.688
RMSE: 11187.74
MAE: 7691.676
R2: 0.753
MAPE: 15.13

--- VALIDATION ---
MSE: 493129136.755
RMSE: 22206.51
MAE: 16698.421
R2: 0.167
MAPE: 33.84


In [64]:
from sklearn.ensemble import GradientBoostingRegressor
# Crear un modelo de GradientBoostingRegressor
gbr_model = GradientBoostingRegressor(random_state=42)

# Entrenar el modelo
gbr_model.fit(X_train, y_train)

# --- 3. Hacer Predicciones en los Conjuntos de Entrenamiento y Validación ---
# Entrenamiento
y_train_pred_log = gbr_model.predict(X_train)
y_train_pred = np.expm1(y_train_pred_log)
y_train_original = np.expm1(y_train)

# Validación
y_val_pred_log = gbr_model.predict(X_val)
y_val_pred = np.expm1(y_val_pred_log)
y_val_original = np.expm1(y_val)

# --- 4. Calcular Métricas en la Escala Original ---
# Entrenamiento
mse_train = mean_squared_error(y_train_original, y_train_pred)
rmse_train = np.sqrt(mse_train)
mae_train = mean_absolute_error(y_train_original, y_train_pred)
r2_train = r2_score(y_train_original, y_train_pred)
mape_train = np.mean(np.abs((y_train_original - y_train_pred) / y_train_original)) * 100

# Validación
mse_val = mean_squared_error(y_val_original, y_val_pred)
rmse_val = np.sqrt(mse_val)
mae_val = mean_absolute_error(y_val_original, y_val_pred)
r2_val = r2_score(y_val_original, y_val_pred)
mape_val = np.mean(np.abs((y_val_original - y_val_pred) / y_val_original)) * 100

# --- 5. Imprimir Resultados ---
print("--- TRAIN ---")
print(f"MSE: {mse_train:.3f}")
print(f"RMSE: {rmse_train:.2f}")
print(f"MAE: {mae_train:.3f}")
print(f"R2: {r2_train:.3f}")
print(f"MAPE: {mape_train:.2f}")

print("\n--- VALIDATION ---")
print(f"MSE: {mse_val:.3f}")
print(f"RMSE: {rmse_val:.2f}")
print(f"MAE: {mae_val:.3f}")
print(f"R2: {r2_val:.3f}")
print(f"MAPE: {mape_val:.2f}")

--- TRAIN ---
MSE: 87545740.509
RMSE: 9356.59
MAE: 6543.268
R2: 0.827
MAPE: 12.72

--- VALIDATION ---
MSE: 447404674.805
RMSE: 21151.94
MAE: 15596.251
R2: 0.245
MAPE: 31.25


In [None]:
# Obtener las importancias de características
importances = gbr_model.feature_importances_
indices = np.argsort(importances)[::-1]

# Seleccionar las 300 características más importantes
top_features = indices[:300]
X_train_reduced = X_train[:, top_features]
X_val_reduced = X_val[:, top_features]
X_test_reduced = X_test[:, top_features]