# **1. Configuración del Ambiente**


---

In [8]:
# Silence every Python warning for the rest of the session. The filter is
# installed before the library imports below, so warnings raised while those
# libraries are imported are hidden as well.
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
# Display all columns of a DataFrame instead of truncating wide frames.
pd.set_option('display.max_columns', None)
from scipy.stats import randint
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
# NOTE(review): a `global` statement at module/cell level has no effect in
# Python -- names assigned in a cell are already module-level. Only
# `df_traffic` is assigned in the cells visible here; the other names are
# presumably used by later cells -- confirm before removing this line.
global df_traffic, resultados, modelo, modelo_clasificacion

# **2. Creación de Modelos de Regresión**


---

In [9]:
# Read the semicolon-delimited dataset from the working directory and print
# its schema summary (column names, non-null counts and dtypes).
df_traffic = pd.read_csv(filepath_or_buffer='Classifier.csv', delimiter=';')
df_traffic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12283 entries, 0 to 12282
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   visitNumber         12283 non-null  int64  
 1   browser             12283 non-null  int64  
 2   operatingSystem     12283 non-null  int64  
 3   deviceCategory      12283 non-null  int64  
 4   continent           12283 non-null  int64  
 5   country             12283 non-null  int64  
 6   metro               12283 non-null  int64  
 7   city                12283 non-null  int64  
 8   networkDomain       12283 non-null  int64  
 9   campaign            12283 non-null  int64  
 10  source              12283 non-null  int64  
 11  medium              12283 non-null  int64  
 12  keyword             12283 non-null  int64  
 13  referralPath        12283 non-null  int64  
 14  adContent           12283 non-null  int64  
 15  pageviews           12283 non-null  int64  
 16  boun

### **2.1 Regresión Lineal**


---

In [10]:
# Separate the regression target from the predictor columns.
y = df_traffic['transactionRevenue']
X = df_traffic.drop('transactionRevenue', axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit an ordinary least-squares model (fit() returns the estimator itself)
# and predict on the held-out rows.
linear_model = LinearRegression().fit(X_train, y_train)
y_pred = linear_model.predict(X_test)

# Score the predictions with the mean squared error on the test set.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 293.72803510804175


### **2.2 LightGBM**


---

In [11]:
# LightGBM hyper-parameters passed to lgb.train.
# The number of boosting rounds is intentionally NOT listed here: an
# `n_estimators` entry in params is an alias of `num_iterations` and would
# silently override the `num_boost_round` argument of lgb.train, leaving two
# places to keep in sync. It is configured once, in the lgb.train call below.
params = {
    'objective': 'regression',       # Regression task
    'metric': 'mse',                 # Evaluation metric: mean squared error
    'num_leaves': 31,                # Maximum number of leaves per tree
    'learning_rate': 0.1,            # Learning rate (shrinkage)
    'max_depth': -1,                 # Maximum depth of each tree (-1 means no limit)
    'min_child_samples': 20,         # Minimum number of samples required in a leaf
    'reg_alpha': 0.0,                # L1 regularization strength (alpha)
    'reg_lambda': 0.0,               # L2 regularization strength (lambda)
    'bagging_fraction': 0.8,         # Fraction of rows sampled when bagging (row subsampling)
    'feature_fraction': 0.8,         # Fraction of features sampled for each tree (feature subsampling)
    'bagging_freq': 5,               # Bagging frequency (re-sample rows every 5 iterations)
    'force_col_wise': True           # Force column-wise histogram building
}

# Wrap the training split in LightGBM's Dataset container and train for
# 100 boosting rounds (the same count the removed `n_estimators` entry set).
train_data = lgb.Dataset(X_train, label=y_train)
model = lgb.train(params, train_data, num_boost_round=100)

# Predict on the held-out rows.
y_pred = model.predict(X_test)

# Score the predictions with the mean squared error on the test set.
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

[LightGBM] [Info] Total Bins 956
[LightGBM] [Info] Number of data points in the train set: 9826, number of used features: 23
[LightGBM] [Info] Start training from score 1.488793
Mean Squared Error: 213.44606818988592


### **2.3  Gradient-boosted trees**


---

In [12]:
# Rebuild the 80/20 train/test split with a fixed seed so this cell always
# evaluates on the same held-out rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the gradient-boosted trees model. random_state pins the estimator's
# internal randomness so the reported MSE is reproducible between runs.
gbt_model = GradientBoostingRegressor(random_state=42)

# Train the model.
gbt_model.fit(X_train, y_train)

# Predict on the test set.
y_pred = gbt_model.predict(X_test)

# Evaluate the model with the mean squared error on the test set.
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 261.7808248716831


### **2.4 Decision Tree**


---

In [13]:
# Rebuild the 80/20 train/test split with a fixed seed so this cell always
# evaluates on the same held-out rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a single regression tree. random_state fixes the feature permutation
# and tie-breaking between equally good splits, so repeated runs build the
# same tree and report the same MSE.
decision_tree = DecisionTreeRegressor(random_state=42)
decision_tree.fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)

# Compute the mean squared error (MSE) on the test set.
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 678.9123096052095


### **2.5 Random Forest**


---

In [14]:
# Rebuild the 80/20 train/test split with a fixed seed so this cell always
# evaluates on the same held-out rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a random forest. random_state seeds the bootstrap sampling and the
# per-split feature sampling; without it every run trains a different forest
# and reports a different MSE.
random_forest = RandomForestRegressor(random_state=42)
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)

# Compute the mean squared error (MSE) on the test set.
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 225.63661396709807
