# Modelos de Machine Learning: Benchmarks

## Intro

## Config

In [1]:
Variable_Target = 'CPV'
version = '4'

nombre_scaler = 'scaler_model_'+Variable_Target+'_v'+version+'.joblib'
nombre_pca = 'pca_model_'+Variable_Target+'_v'+version+'.joblib'

nombre_RF = 'model_rf_'+Variable_Target+'_v'+version+'.joblib'
nombre_XGB = 'model_xgboost_'+Variable_Target+'_v'+version+'.json'
nombre_NN = 'model_NN_'+Variable_Target+'_v'+version+'.joblib'
nombre_scaler_NN = 'scaler_NN_model_'+Variable_Target+'_v'+version+'.joblib'


### Imports

In [2]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.decomposition import PCA
import math
import altair as alt

### Carga de Datos

In [3]:
def load_data(df_in):
    df = pd.read_csv(df_in+'.csv')
    df = df.drop("Unnamed: 0", axis=1)
    return df

# Cargar los datos
df = load_data('df_C_S_v4')
df = df[df['Client'] != 'Boxto']
#df = df[df['Client'] != 'AOV']
df = df.reset_index()
df = df[df['Format_New']=='Video']

### Elección Output y features

La regla empírica establece que:

aproximadamente el 68% de los datos caerán dentro de 1 desviación estándar de la media,

alrededor del 95% de los datos caerán dentro de 2 desviaciones estándar y

aproximadamente el 99.7% de los datos caerán dentro de 3 desviaciones estándar de la media.

In [4]:


# Calcular la media y la desviación estándar de la variable objetivo y
mean_y, std_y = np.mean(df[Variable_Target]), np.std(df[Variable_Target])
outlier_threshold = 3 * std_y

max_y = mean_y + (3 * std_y)
min_y = mean_y - (3 * std_y)

df = df[df[Variable_Target] <= max_y]
df = df[df[Variable_Target] >= min_y]
df.shape

(1582, 56)

In [5]:
X = df.copy()
X = pd.DataFrame(X, columns=['Client','Año','Mes', 'Objective', 'Cost', 'Country', 'Media_type', 'Traffic_source', 'Format_New','Platform','Strategy','Plataforma','Campaign_Type','Ecommerce','Service_Product',#])#,'Client'
                            'Bench Gral CPC','Bench Search CPC','Bench GralSch CPL', 'Bench Search CPL','Bench GralSch CTR', 'Bench Search CTR', 'Bench GralSch CR','Bench Search AvgCR',
                             'Bench GralFB CPC', 'Bench FB CPC','Bench GralFB CPAction', 'Bench FB CPAction', 'Bench GralFB CTR', 'Bench FB CTR', 'Bench GralFB CR', 'Bench FB AvgCR'
                             ,'Bench GralYT CPV', 'Bench YT CPV', 'Bench GralYT CTR', 'Bench YT CTR','Bench GralYT VR', 'Bench FB AvgVR'])



In [6]:
categorical_features = ['Client','Objective', 'Country', 'Media_type', 'Traffic_source','Format_New','Platform','Strategy','Plataforma','Campaign_Type','Ecommerce','Service_Product']  #,'Tipo Search','Tipo FB','Tipo YT'], 'Client'

# Preprocesamiento de variables categóricas
X_dum = pd.get_dummies(X, columns=categorical_features)
X_dum = X_dum.reset_index()

#Elijo el valor a predecir. CPC: La hipótesis es que el valor de COST es conocido y controlable: El output en rigor será Clicks y expresaremos el resultado en CPC Cost/Clicks
y = df[Variable_Target]

In [7]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_Scaled = scaler.fit_transform(X_dum[['Año','Mes','Cost','Bench Gral CPC','Bench Search CPC','Bench GralSch CPL', 'Bench Search CPL','Bench GralSch CTR', 'Bench Search CTR', 'Bench GralSch CR','Bench Search AvgCR', 'Bench GralFB CPC', 'Bench FB CPC',
                            'Bench GralFB CPAction', 'Bench FB CPAction', 'Bench GralFB CTR', 'Bench FB CTR', 'Bench GralFB CR', 'Bench FB AvgCR', 'Bench GralYT CPV', 'Bench YT CPV', 'Bench GralYT CTR', 'Bench YT CTR','Bench GralYT VR', 'Bench FB AvgVR']])

pca = PCA(n_components=2)  # Aquí estamos conservando solo una componente principal
X_pca = pca.fit_transform(X_Scaled)
X_pca = pd.DataFrame(X_pca)

X_dum['X_pca_0'] = X_pca[0]
X_dum['X_pca_1'] = X_pca[1]
X['X_pca_0'] = X_pca[0]
X['X_pca_1'] = X_pca[1]

In [8]:
X_pca

Unnamed: 0,0,1
0,-1.913698,-1.030905
1,-1.908563,-0.992100
2,-1.909143,-0.996482
3,-1.904896,-0.964386
4,-1.910459,-1.006433
...,...,...
1577,4.340344,2.435578
1578,4.348798,2.499472
1579,4.345298,2.473019
1580,4.337612,2.414934


In [9]:
# Separo en conjuntos de Train y de Test
#X_train, X_test, y_train, y_test = train_test_split(X_dum, y, test_size=0.2, random_state=0)
X_train = X_dum
y_train = y

## RandomForest

RMSE = 0.26

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV

In [11]:
rf = RandomForestRegressor(n_estimators=50, max_depth=14, min_samples_split=4, min_samples_leaf=2)

In [12]:
rf.fit(X_train.drop('index',axis=1), y_train)

#### VC

In [15]:
from sklearn.model_selection import KFold

In [16]:
kf_rf = KFold(n_splits=5, shuffle=True, random_state=42)
X_train_rf_prueba = X_train.copy()
X_train_rf_prueba = X_train_rf_prueba.reset_index()
y_train_rf_prueba = y_train.copy()
y_train_rf_prueba = y_train_rf_prueba.reset_index()


X_train_rf_prueba = X_train_rf_prueba.drop('index', axis=1)
y_train_rf_prueba = y_train_rf_prueba.drop('index', axis=1)

In [17]:
y_train_rf_prueba

Unnamed: 0,CPV
0,3.394762
1,0.060510
2,0.022742
3,0.009502
4,0.004582
...,...
1577,0.069231
1578,0.445683
1579,0.084549
1580,0.184157


In [18]:
mse_test_list_total = dict()
mse_train_list_total = dict()
for i in [15,20,30,40,50,70,80,100]:
    print(i)
    mse_test_list = []
    mse_train_list = []
    for train_index, test_index in kf_rf.split(X_train_rf_prueba):
        X_train_cv, X_test_cv = X_train_rf_prueba.iloc[train_index], X_train_rf_prueba.iloc[test_index]
        y_train_cv, y_test_cv = y_train_rf_prueba.iloc[train_index], y_train_rf_prueba.iloc[test_index]
        
        # Inicializar el modelo de clasificación
        rf_cv = RandomForestRegressor(n_estimators=i, max_depth=14, min_samples_split=4, min_samples_leaf=2)
    
        # Ajustar el modelo con los datos de entrenamiento
        rf_cv.fit(X_train_cv, y_train_cv)
    
        # Hacer predicciones en los datos de prueba
        y_pred_test_cv = rf_cv.predict(X_test_cv)
        y_pred_train_cv = rf_cv.predict(X_train_cv)
    
        # Calcular la precisión y agregarla a la lista de puntuaciones
        mse_test = mean_squared_error(y_test_cv, y_pred_test_cv)
        mse_train = mean_squared_error(y_train_cv, y_pred_train_cv)
        #print(y_test_cv)
        #print(y_pred_cv)
        mse_test_list.append(mse_test)
        mse_train_list.append(mse_train)
        
    mse_cv_test = np.mean(mse_test_list)
    mse_test_list_total[i] = mse_cv_test
    mse_cv_train = np.mean(mse_train_list)
    mse_train_list_total[i] = mse_cv_train
print(mse_cv_test)
print(mse_cv_train)

df_curve = pd.DataFrame(list(mse_test_list_total.items()), columns=['Index', 'ErrorTest'])
df_curve_2 = pd.DataFrame(list(mse_train_list_total.items()), columns=['Index', 'ErrorTrain'])
df_curve = pd.merge(df_curve,df_curve_2,on='Index',how='left')
#df_curve = pd.DataFrame(mse_test_list_total)

15


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


20


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


30


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


40


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


50


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


70


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


80


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


100


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


105.2760199321187
26.05564912069601


In [19]:
# Supongamos que tienes un DataFrame llamado 'df' con columnas "indice", "columna_A" y "columna_B"

# Convertir el DataFrame de pandas a un formato aceptado por Altair
df_altair = pd.melt(df_curve, id_vars=['Index'], value_vars=['ErrorTest', 'ErrorTrain'])

# Crear el gráfico de líneas divididas por color en Altair
line_chart = alt.Chart(df_altair).mark_line(point=True).encode(
    x='Index:Q',
    y='value:Q',
    color='variable:N'
).properties(
    width=600,
    height=400,
    title='Gráfico de Líneas para Columnas A y B'
)

# Mostrar el gráfico
line_chart

## XGBoost

RMSE_test = 0.24

### Modelo

In [13]:
import xgboost as xgb

In [14]:
xgboost = xgb.XGBRegressor(learning_rate=0.01, max_depth=10, n_estimators=50)

In [15]:
#xgboost.fit(X_train.drop('index',axis=1), y_train)
xgboost.fit(X_train.drop('index',axis=1), y_train)

### Validación Cruzada

In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

In [28]:
# Definir los hiperparámetros a ajustar
parameters = {'learning_rate': [0.05, 0.075, 0.1],
              'max_depth': [10, 12, 15],
              'n_estimators': [70, 80, 100]}

In [29]:
# Configurar la validación cruzada
#kfold = KFold(n_splits=5, shuffle=True, random_state=42)

In [30]:
# Realizar la búsqueda de parámetros
#grid_search = GridSearchCV(estimator=xgboost, param_grid=parameters, cv=kfold, scoring='neg_mean_squared_error', verbose = 3)
#grid_result = grid_search.fit(X_train, y_train)

In [31]:
# Imprimir los resultados
#print("Mejor: %f usando %s" % (grid_result.best_score_, grid_result.best_params_))

In [32]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
X_train_xgb_prueba = X_train.copy()
X_train_xgb_prueba = X_train_xgb_prueba.reset_index()
y_train_xgb_prueba = y_train.copy()
y_train_xgb_prueba = y_train_xgb_prueba.reset_index()


X_train_xgb_prueba = X_train_xgb_prueba.drop('index', axis=1)
y_train_xgb_prueba = y_train_xgb_prueba.drop('index', axis=1)

In [35]:
mse_test_list_total = dict()
mse_train_list_total = dict()
for i in [15,20,30,40,50,70,80,100]:
    print(i)
    mse_test_list = []
    mse_train_list = []
    for train_index, test_index in kf.split(X_train_xgb_prueba):
        X_train_cv, X_test_cv = X_train_xgb_prueba.iloc[train_index], X_train_xgb_prueba.iloc[test_index]
        y_train_cv, y_test_cv = y_train_xgb_prueba.iloc[train_index], y_train_xgb_prueba.iloc[test_index]
        
        # Inicializar el modelo de clasificación
        xgboost_cv = xgb.XGBRegressor(learning_rate=0.01, max_depth=10, n_estimators=i)
    
        # Ajustar el modelo con los datos de entrenamiento
        xgboost_cv.fit(X_train_cv, y_train_cv)
    
        # Hacer predicciones en los datos de prueba
        y_pred_test_cv = xgboost_cv.predict(X_test_cv)
        y_pred_train_cv = xgboost_cv.predict(X_train_cv)
    
        # Calcular la precisión y agregarla a la lista de puntuaciones
        mse_test = mean_squared_error(y_test_cv, y_pred_test_cv)
        mse_train = mean_squared_error(y_train_cv, y_pred_train_cv)
        #print(y_test_cv)
        #print(y_pred_cv)
        mse_test_list.append(mse_test)
        mse_train_list.append(mse_train)
        
    mse_cv_test = np.mean(mse_test_list)
    mse_test_list_total[i] = mse_cv_test
    mse_cv_train = np.mean(mse_train_list)
    mse_train_list_total[i] = mse_cv_train
print(mse_cv_test)
print(mse_cv_train)

df_curve = pd.DataFrame(list(mse_test_list_total.items()), columns=['Index', 'ErrorTest'])
df_curve_2 = pd.DataFrame(list(mse_train_list_total.items()), columns=['Index', 'ErrorTrain'])
df_curve = pd.merge(df_curve,df_curve_2,on='Index',how='left')
#df_curve = pd.DataFrame(mse_test_list_total)

15
20
30
40
50
70
80
100
110.29437898989985
33.548476268191656


In [36]:
# Supongamos que tienes un DataFrame llamado 'df' con columnas "indice", "columna_A" y "columna_B"

# Convertir el DataFrame de pandas a un formato aceptado por Altair
df_altair = pd.melt(df_curve, id_vars=['Index'], value_vars=['ErrorTest', 'ErrorTrain'])

# Crear el gráfico de líneas divididas por color en Altair
line_chart = alt.Chart(df_altair).mark_line(point=True).encode(
    x='Index:Q',
    y='value:Q',
    color='variable:N'
).properties(
    width=600,
    height=400,
    title='Gráfico de Líneas para Columnas A y B'
)

# Mostrar el gráfico
line_chart

## Redes Neuronales

RMSE: 0.24

In [16]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

In [17]:
X_train_NN = X_train.drop('index',axis=1)
#X_test_NN = X_test#.drop('index',axis=1)

In [18]:
scaler_NN = StandardScaler()
X_train_scaled_NN = scaler_NN.fit_transform(X_train_NN)
#X_test_scaled_NN = scaler.transform(X_test_NN)

In [21]:
# Definir el modelo de redes neuronales para regresión
model_NN = MLPRegressor(hidden_layer_sizes=(100,200,200,200,100,), activation='relu', alpha=0.01, solver='adam',random_state=42, max_iter=65,learning_rate_init=0.0005)

In [22]:
#mse_train_list = []
#mse_test_list = []
for i in range(1,70):
    print(i)

    model_NN.partial_fit(X_train_scaled_NN, y_train)
    # Hacer predicciones en el conjunto de prueba
    #y_pred_train = model.predict(X_train_scaled_NN)
    #y_pred_test = model.predict(X_test_scaled_NN)
    #mse_train = mean_squared_error(y_train, y_pred_train)
    #mse_train_list.append(mse_train)
    #mse_test = mean_squared_error(y_test, y_pred_test)
    #mse_test_list.append(mse_test)

#df_curve = pd.DataFrame()
#df_curve['train'] = mse_train_list
#df_curve['test'] = mse_test_list
#df_curve = df_curve.reset_index()


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69


## Exportar Modelos

In [23]:
import joblib
joblib.dump(scaler, nombre_scaler)  # nombre_scaler = 'scaler_model_CPC_v3.joblib'
joblib.dump(pca, nombre_pca)  # nombre_pca = 'pca_model_CPC_v3.joblib'

joblib.dump(rf, nombre_RF) # nombre_RF = 'model_rf_CPC_v3.joblib'
xgboost.save_model(nombre_XGB) # nombre_XGB = 'model_xgboost_v3.json'
joblib.dump(model_NN, nombre_NN)  # nombre_NN = 'model_NN_CPC_v3.joblib'
joblib.dump(scaler_NN, nombre_scaler_NN)  # nombre_scaler_NN = 'scaler_NN_model_CPC_v3.joblib'

['scaler_NN_model_CPV_v4.joblib']

### Pruebas

In [275]:
variables_modelo= rf.feature_names_in_

In [308]:
data = {
'Año': [2023],
'Mes': [11],
'Objective': ['Purchase'],
'Cost': [250],
'Country': ['USA'],
'Media_type': ['Search'],
'Traffic_source': ['Google'],
'Client': ['Braun'],
'Format_New': ['Video'],
'Platform': ['Google Ads'],
'Strategy': ['Consideration'],
'Plataforma': ['Google Ads'],
'Campaign_Type': ['SEARCH'],
'Ecommerce': ['Si'],
'Service_Product': ['Serv'],
'Bench Gral CPC': [4.22],
'Bench Search CPC': [6.55],
'Bench GralSch CPL':[ 53.52],
'Bench Search CPL':[ 66.02],
'Bench GralSch CTR': [6.11],
'Bench Search CTR': [4.8],
'Bench GralSch CR': [7.04],
'Bench Search AvgCR': [10.22],
'Bench GralFB CPC': [1.86],
'Bench FB CPC': [3.08],
'Bench GralFB CPAction': [18.68],
'Bench FB CPAction': [31.11],
'Bench GralFB CTR': [0.9],
'Bench FB CTR': [0.62],
'Bench GralFB CR': [9.21],
'Bench FB AvgCR': [9.96],
'Bench GralYT CPV': [0.048],
'Bench YT CPV': [0.048],
'Bench GralYT CTR': [0.65],
'Bench YT CTR': [0.33],
'Bench GralYT VR': [31.9],
'Bench FB AvgVR': [31.9]
}

In [309]:
new_data = pd.DataFrame(data)

In [310]:
# Preprocesamiento de variables categóricas
X = pd.get_dummies(new_data, columns=categorical_features)

# Asegurarte de que 'new_data_encoded' tenga las mismas columnas que se utilizaron durante el entrenamiento
for col in variables_modelo:
    if col not in X.columns:
        X[col] = False  # Agregar la columna faltante con valores predeterminados si es necesario  

X = X[variables_modelo]
X.columns = [str(i) for i in X.columns]

In [311]:
X_CPV = X.copy()
X_CPV = X_CPV[scaler_NN.feature_names_in_]

In [312]:
for col in rf.feature_names_in_:
    if col not in X_CPV.columns:
        X_CPV[col] = False  # Agregar la columna faltante con valores predeterminados si es necesario
        print(col)

In [313]:
X_Scaled = scaler.transform(X_CPV[['Año','Mes','Cost','Bench Gral CPC','Bench Search CPC', 'Bench GralSch CPL', 'Bench Search CPL','Bench GralSch CTR', 'Bench Search CTR', 'Bench GralSch CR',
                                      'Bench Search AvgCR','Bench GralFB CPC', 'Bench FB CPC','Bench GralFB CPAction', 'Bench FB CPAction', 'Bench GralFB CTR','Bench FB CTR',
                                      'Bench GralFB CR', 'Bench FB AvgCR', 'Bench GralYT CPV', 'Bench YT CPV', 'Bench GralYT CTR', 'Bench YT CTR','Bench GralYT VR', 'Bench FB AvgVR']])
X_pca = pca.transform(X_Scaled)
X_pca = pd.DataFrame(X_pca)
X_CPV['X_pca_0'] = X_pca[0]
X_CPV['X_pca_1'] = X_pca[1]

X_CPV = X_CPV[scaler_NN.feature_names_in_]
X_CPV.columns = [str(i) for i in X_CPV.columns]
X_NN_CPV = scaler_NN.transform(X_CPV)

In [314]:
def prediccion_modelo(modelo,X):
    return modelo.predict(X)

In [315]:
pred_RF_CPV = prediccion_modelo(rf,X_CPV)[0]
pred_XGB_CPV = prediccion_modelo(xgboost,X_CPV)[0]
pred_NN_CPV = prediccion_modelo(model_NN,X_NN_CPV)[0]

In [316]:
print(pred_RF_CPV)
print(pred_XGB_CPV)
print(pred_NN_CPV)

0.4800753151844797
1.1601338
17.108648407128484
