# Modelos de Machine Learning: Benchmarks

## Intro

### Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.decomposition import PCA
import math
import altair as alt
import joblib

### Carga de Datos

In [2]:
def load_data(df_in):
    df = pd.read_csv(df_in+'.csv')
    df = df.drop("Unnamed: 0", axis=1)
    return df

# Cargar los datos
df = load_data('df_C_S_v3')
df = df[df['Client'] != 'Boxto']
#df = df[df['Client'] != 'AOV']
df = df.reset_index()

## CTR

### CTR: Elección Output y features 

In [3]:
Variable_Target = 'CTR'

# Calcular la media y la desviación estándar de la variable objetivo y
mean_y, std_y = np.mean(df[Variable_Target]), np.std(df[Variable_Target])
outlier_threshold = 3 * std_y

max_y = mean_y + (3 * std_y)
min_y = mean_y - (3 * std_y)

df = df[df[Variable_Target] <= max_y]
df = df[df[Variable_Target] >= min_y]

In [4]:
X = df.copy()

# Preparar los datos: Features
X = X[['Año','Mes', 'Objective', 'Cost', 'Country', 'Media_type', 'Traffic_source', 'Client','Format_New','Platform','Strategy','Plataforma','Campaign_Type','Ecommerce','Service_Product']]
       #'Tipo Search', 'Bench Gral CPC',       #'Bench Search CPC', 'Bench GralSch CPL', 'Bench Search CPL',       #'Bench GralSch CTR', 'Bench Search CTR', 'Bench GralSch CR',       #'Bench Search AvgCR', 'Tipo FB', 'Bench GralFB CPC', 'Bench FB CPC',
       #'Bench GralFB CPAction', 'Bench FB CPAction', 'Bench GralFB CTR',       #'Bench FB CTR', 'Bench GralFB CR', 'Bench FB AvgCR', 'Tipo YT',       #'Bench GralYT CPV', 'Bench YT CPV', 'Bench GralYT CTR', 'Bench YT CTR',       #'Bench GralYT VR', 'Bench FB AvgVR']]

X = X.reset_index()

categorical_features = ['Objective', 'Country', 'Media_type', 'Traffic_source', 'Client','Format_New','Platform','Strategy','Plataforma','Campaign_Type','Ecommerce','Service_Product']  #,'Tipo Search','Tipo FB','Tipo YT']

# Preprocesamiento de variables categóricas
X_dum = pd.get_dummies(X, columns=categorical_features)

#Elijo el valor a predecir. CPC: La hipótesis es que el valor de COST es conocido y controlable: El output en rigor será Clicks y expresaremos el resultado en CPC Cost/Clicks
y = df[Variable_Target]

In [5]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_Scaled = scaler.fit_transform(X_dum[['Año','Mes','Cost']])

pca = PCA(n_components=2)  # Aquí estamos conservando solo una componente principal
X_pca = pca.fit_transform(X_Scaled)
X_pca = pd.DataFrame(X_pca)

X_dum['X_pca_0'] = X_pca[0]
X_dum['X_pca_1'] = X_pca[1]
X['X_pca_0'] = X_pca[0]
X['X_pca_1'] = X_pca[1]

In [6]:
joblib.dump(scaler, 'scaler_model.joblib')
joblib.dump(pca, 'pca_model.joblib')

['pca_model.joblib']

In [7]:
# Separo en conjuntos de Train y de Test
X_train = X_dum

### XGBoost CTR

MSE_test = 0.028

In [22]:
import xgboost as xgb

In [23]:
xgboost = xgb.XGBRegressor(learning_rate=0.08, max_depth=6, n_estimators=40)

In [24]:
xgboost.fit(X_train.drop('index',axis=1), y)


In [25]:
# save to JSON
xgboost.save_model("model_xgboost_CTR.json")
interval_cpc = 0.074 #CL 83%

### Tuning

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

In [12]:
# Definir los hiperparámetros a ajustar
parameters = {'learning_rate': [0.06, 0.07, 0.08],
              'max_depth': [6, 8, 10],
              'n_estimators': [30, 40, 50, 60]}

In [13]:
# Configurar la validación cruzada
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

In [14]:
# Realizar la búsqueda de parámetros
grid_search = GridSearchCV(estimator=xgboost, param_grid=parameters, cv=kfold, scoring='neg_mean_squared_error', verbose = 3)
grid_result = grid_search.fit(X_train, y)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END learning_rate=0.06, max_depth=6, n_estimators=30;, score=-0.004 total time=   0.0s
[CV 2/5] END learning_rate=0.06, max_depth=6, n_estimators=30;, score=-0.004 total time=   0.0s
[CV 3/5] END learning_rate=0.06, max_depth=6, n_estimators=30;, score=-0.003 total time=   0.0s
[CV 4/5] END learning_rate=0.06, max_depth=6, n_estimators=30;, score=-0.003 total time=   0.0s
[CV 5/5] END learning_rate=0.06, max_depth=6, n_estimators=30;, score=-0.003 total time=   0.0s
[CV 1/5] END learning_rate=0.06, max_depth=6, n_estimators=40;, score=-0.003 total time=   0.0s
[CV 2/5] END learning_rate=0.06, max_depth=6, n_estimators=40;, score=-0.003 total time=   0.0s
[CV 3/5] END learning_rate=0.06, max_depth=6, n_estimators=40;, score=-0.003 total time=   0.0s
[CV 4/5] END learning_rate=0.06, max_depth=6, n_estimators=40;, score=-0.003 total time=   0.0s
[CV 5/5] END learning_rate=0.06, max_depth=6, n_estimators=40;, score=-0.0

In [15]:
# Imprimir los resultados
print("Mejor: %f usando %s" % (grid_result.best_score_, grid_result.best_params_))

Mejor: -0.002651 usando {'learning_rate': 0.08, 'max_depth': 6, 'n_estimators': 60}


In [18]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
X_train_xgb_prueba = X_train.copy()
X_train_xgb_prueba = X_train_xgb_prueba.reset_index()
y_train_xgb_prueba = y.copy()
y_train_xgb_prueba = y_train_xgb_prueba.reset_index()


X_train_xgb_prueba = X_train_xgb_prueba.drop('index', axis=1)
y_train_xgb_prueba = y_train_xgb_prueba.drop('index', axis=1)

In [19]:
mse_test_list_total = dict()
mse_train_list_total = dict()
for i in [20,30,40,50,60,70,80,90,100]:
    print(i)
    mse_test_list = []
    mse_train_list = []
    for train_index, test_index in kf.split(X_train_xgb_prueba):
        X_train_cv, X_test_cv = X_train_xgb_prueba.iloc[train_index], X_train_xgb_prueba.iloc[test_index]
        y_train_cv, y_test_cv = y_train_xgb_prueba.iloc[train_index], y_train_xgb_prueba.iloc[test_index]
        
        # Inicializar el modelo de clasificación
        xgboost_cv = xgb.XGBRegressor(learning_rate=0.08, max_depth=8, n_estimators=i)
    
        # Ajustar el modelo con los datos de entrenamiento
        xgboost_cv.fit(X_train_cv, y_train_cv)
    
        # Hacer predicciones en los datos de prueba
        y_pred_test_cv = xgboost_cv.predict(X_test_cv)
        y_pred_train_cv = xgboost_cv.predict(X_train_cv)
    
        # Calcular la precisión y agregarla a la lista de puntuaciones
        mse_test = mean_squared_error(y_test_cv, y_pred_test_cv)
        mse_train = mean_squared_error(y_train_cv, y_pred_train_cv)
        #print(y_test_cv)
        #print(y_pred_cv)
        mse_test_list.append(mse_test)
        mse_train_list.append(mse_train)
        
    mse_cv_test = np.mean(mse_test_list)
    mse_test_list_total[i] = mse_cv_test
    mse_cv_train = np.mean(mse_train_list)
    mse_train_list_total[i] = mse_cv_train
print(mse_cv_test)
print(mse_cv_train)

df_curve = pd.DataFrame(list(mse_test_list_total.items()), columns=['Index', 'ErrorTest'])
df_curve_2 = pd.DataFrame(list(mse_train_list_total.items()), columns=['Index', 'ErrorTrain'])
df_curve = pd.merge(df_curve,df_curve_2,on='Index',how='left')
#df_curve = pd.DataFrame(mse_test_list_total)

20
30
40
50
60
70
80
90
100
0.0029606906843125855
0.00034134897267280065


In [20]:
df_curve

Unnamed: 0,Index,ErrorTest,ErrorTrain
0,20,0.003232,0.001699
1,30,0.002861,0.001031
2,40,0.002814,0.000755
3,50,0.002828,0.000611
4,60,0.002829,0.000536
5,70,0.002876,0.000469
6,80,0.002914,0.000421
7,90,0.002939,0.000384
8,100,0.002961,0.000341


In [21]:
# Supongamos que tienes un DataFrame llamado 'df' con columnas "indice", "columna_A" y "columna_B"

# Convertir el DataFrame de pandas a un formato aceptado por Altair
df_altair = pd.melt(df_curve, id_vars=['Index'], value_vars=['ErrorTest', 'ErrorTrain'])

# Crear el gráfico de líneas divididas por color en Altair
line_chart = alt.Chart(df_altair).mark_line(point=True).encode(
    x='Index:Q',
    y='value:Q',
    color='variable:N'
).properties(
    width=600,
    height=400,
    title='Gráfico de Líneas para Columnas A y B'
)

# Mostrar el gráfico
line_chart