# **1. Configuración del Ambiente**


---

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
from scipy.stats import randint
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
global df_traffic, resultados, modelo, modelo_clasificacion

# **2. Creación de Modelo de Gradient Boosted Regressor**


---

In [2]:
df_traffic = pd.read_csv('Classifier.csv', sep=';')
df_traffic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12283 entries, 0 to 12282
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   visitNumber         12283 non-null  int64  
 1   browser             12283 non-null  int64  
 2   operatingSystem     12283 non-null  int64  
 3   deviceCategory      12283 non-null  int64  
 4   continent           12283 non-null  int64  
 5   country             12283 non-null  int64  
 6   metro               12283 non-null  int64  
 7   city                12283 non-null  int64  
 8   networkDomain       12283 non-null  int64  
 9   campaign            12283 non-null  int64  
 10  source              12283 non-null  int64  
 11  medium              12283 non-null  int64  
 12  keyword             12283 non-null  int64  
 13  referralPath        12283 non-null  int64  
 14  adContent           12283 non-null  int64  
 15  pageviews           12283 non-null  int64  
 16  boun

In [3]:
# Dividir los datos en conjunto de entrenamiento y prueba
X = df_traffic.drop(columns=['transactionRevenue'])
y = df_traffic['transactionRevenue']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Creación del modelo Gradient Boosted Trees
gbt_model = GradientBoostingRegressor()
gbt_model.fit(X_train, y_train)

y_pred = gbt_model.predict(X_test)

# Evaluación del modelo
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 259.7851527729629


In [4]:
#definiendo el K - número de subconjuntos
cv = KFold(n_splits = 5, shuffle=True)

In [5]:
accuracy_cross_val = cross_val_score(estimator = gbt_model,
                                       X = X_train,
                                       y = y_train,
                                       scoring = 'neg_mean_squared_error',
                                       cv = cv)
print('Accuracy promedio de mse: {}'.format(round(accuracy_cross_val.mean(), 3)))

Accuracy promedio de mse: -410.279


In [6]:
# Selección de hiperparámetros para la regresión lineal
param_gbt = {
    'loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 0.9, 1.0],
    'subsample': [0.8, 1.0],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_depth': [3, 5],
    'min_impurity_decrease': [0.0, 0.1],
    'max_features': ['sqrt', None],
    'alpha': [0.1, 0.5],
    'max_leaf_nodes': [None, 10],
    'ccp_alpha': [0.0, 0.1]
}

In [7]:
# Configurar RandomizedSearchCV
randomized_search_gbt = RandomizedSearchCV(
    estimator=gbt_model,
    param_distributions=param_gbt,
    n_iter=10,  
    cv=5, 
    scoring='neg_mean_squared_error',  
    n_jobs=-1, 
    random_state=42  
)


In [8]:
# Ajustar el modelo
randomized_search_gbt.fit(X_train, y_train)

In [9]:
# Obtener los mejores parámetros y la mejor puntuación
best_params_gbt = randomized_search_gbt.best_params_
best_score_gbt = randomized_search_gbt.best_score_

print("Mejores parámetros:", best_params_gbt)
print("Mejor puntuación:", best_score_gbt)

Mejores parámetros: {'subsample': 1.0, 'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 1, 'min_impurity_decrease': 0.1, 'max_leaf_nodes': None, 'max_features': 'sqrt', 'max_depth': 5, 'loss': 'absolute_error', 'learning_rate': 0.05, 'ccp_alpha': 0.0, 'alpha': 0.5}
Mejor puntuación: -336.80710650363505


In [10]:
# Crear una nueva instancia de GradientBoostingRegressor con los mejores parámetros encontrados
best_gbt_model = GradientBoostingRegressor(**randomized_search_gbt.best_params_)

# Entrenar el modelo con los datos de entrenamiento
best_gbt_model.fit(X_train, y_train)

# Predecir con el modelo entrenado
y_pred = best_gbt_model.predict(X_test)

# Evaluar el modelo
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Calcular R^2
r2 = r2_score(y_test, y_pred)
print("R^2 Score:", r2)

# Calcular RMSE
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

Mean Squared Error: 265.89374670992214
R^2 Score: 0.3807742329394471
Root Mean Squared Error: 16.306248701339072
