In [11]:
# Importar las librerías necesarias
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from scipy import sparse
from scipy.sparse import csr_matrix  
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from pandas.api.types import is_numeric_dtype
import gc
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ParameterSampler
from scipy.stats import uniform
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint


In [12]:
# Cargar los datos
files = ["datosTP2/ctr_15.csv", "datosTP2/ctr_16.csv", "datosTP2/ctr_17.csv", "datosTP2/ctr_18.csv", "datosTP2/ctr_19.csv", "datosTP2/ctr_20.csv", "datosTP2/ctr_21.csv"]
combined_data = pd.concat([pd.read_csv(f) for f in files])
# Load the test data
eval_data = pd.read_csv("datosTP2/ctr_test.csv")

Eliminar algunos atributos

In [13]:
#Eliminar las columnas categóricas ruidosas (modifica esta lista según sea necesario)
columns_to_drop = [ 'auction_categorical_6','action_categorical_7', 'auction_boolean_1', 'auction_boolean_2', 'auction_categorical_10', 'auction_categorical_2',  'auction_categorical_1','auction_categorical_3', 'auction_categorical_0', 'creative_categorical_10']  # Especifica las columnas ruidosas que quieras eliminar

combined_data = combined_data.drop(columns=columns_to_drop)

# Eliminar columnas ruidosas del conjunto de test
eval_data = eval_data.drop(columns=columns_to_drop)

Balancear datos (reducción clase mayoritaria)

In [14]:
label_counts = combined_data["Label"].value_counts()
print(label_counts)
min_count = label_counts.min()

balanced_data = pd.DataFrame()

# Iterar sobre cada clase y muestrear
for label in label_counts.index:
    label_data = combined_data[combined_data["Label"] == label]  # Filtrar por clase
    sampled_data = label_data.sample(n=min_count, random_state=2345)  # Muestrear
    balanced_data = pd.concat([balanced_data, sampled_data])  # Concatenar al DataFrame balanceado

# Resetear el índice
balanced_data.reset_index(drop=True, inplace=True)

Label
0    8539062
1     105530
Name: count, dtype: int64


Ingenieria: aplicar Log

In [15]:

# Aplicar transformación logarítmica a algunas variables numéricas
# Especifica las columnas a transformar (modifícalo según tus datos)
columns_to_log_transform = ['auction_bidfloor', 'creative_height', 'creative_width']
'''
# Asegúrate de que no haya valores cero o negativos en las columnas que deseas transformar
for col in columns_to_log_transform:
    combined_data[col] = np.log1p(combined_data[col])
'''

for col in columns_to_log_transform:
    combined_data[col] = np.log1p(combined_data[col])  # np.log1p(x) = log(1 + x)

# Hacer lo mismo para el conjunto de test?

In [16]:
# Feature Engineering 1: Add 'ad_size' (creative_height * creative_width)
combined_data['ad_size'] = combined_data['creative_height'] * combined_data['creative_width']
eval_data['ad_size'] = eval_data['creative_height'] * eval_data['creative_width']

Holdset

In [17]:
# Split the combined data into train and validation sets with a fixed random state
random_seed = 2345
combined_data = balanced_data.sample(frac=8/10, random_state=random_seed)
y = combined_data["Label"]
X = combined_data.drop(columns=["Label"])

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=random_seed)

del combined_data, X, y
gc.collect()
# Ahora X_train y X_val contienen tanto variables numéricas como categóricas.

0

Cambio formato de fecha

In [18]:
# Definir columnas a eliminar después
columns_to_drop = ['auction_time']
# Convertir 'auction_time' de formato UNIX a datetime y crear columnas de fecha y hora

# Para el set de entrenamiento (X_train)
X_train['auction_time'] = pd.to_datetime(X_train['auction_time'], unit='s')
X_train['fecha'] = X_train['auction_time'].dt.date
X_train['hora'] = X_train['auction_time'].dt.time

# Para el set de validación (X_val)
X_val['auction_time'] = pd.to_datetime(X_val['auction_time'], unit='s')
X_val['fecha'] = X_val['auction_time'].dt.date
X_val['hora'] = X_val['auction_time'].dt.time

# Para el set de evaluación (eval_data)
eval_data['auction_time'] = pd.to_datetime(eval_data['auction_time'], unit='s')
eval_data['fecha'] = eval_data['auction_time'].dt.date
eval_data['hora'] = eval_data['auction_time'].dt.time

# Eliminar solo la columna 'auction_time' porque ya no es necesaria
X_train = X_train.drop(columns=columns_to_drop, errors='ignore')
X_val = X_val.drop(columns=columns_to_drop, errors='ignore')
eval_data = eval_data.drop(columns=columns_to_drop, errors='ignore')

Si se corre este no correr One-hot encoding, cambiar donde diga encoded por numeric 

In [19]:
'''
X_train_numeric = X_train.select_dtypes(include='number')
X_val_numeric = X_val.select_dtypes(include='number')
'''

"\nX_train_numeric = X_train.select_dtypes(include='number')\nX_val_numeric = X_val.select_dtypes(include='number')\n"

Ingenieria: One-Hot encoding

In [20]:
# Identificar las variables categóricas (si no lo tienes ya hecho)
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

# Crear el codificador OneHotEncoder con salida dispersa
one_hot_encoder = OneHotEncoder(sparse_output=True, handle_unknown='ignore')

# Crear el ColumnTransformer para aplicar OneHotEncoder a las variables categóricas
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', one_hot_encoder, categorical_features)
    ], remainder='passthrough'  # Mantener las variables numéricas tal cual
)

# Aplicar el One-Hot Encoding a X_train y X_val, en formato disperso
X_train_encoded = preprocessor.fit_transform(X_train)
X_val_encoded = preprocessor.transform(X_val)

# Convertir la salida dispersa en matrices dispersas
X_train_encoded = sparse.csr_matrix(X_train_encoded)
X_val_encoded = sparse.csr_matrix(X_val_encoded)

Random Forest

In [21]:
''' 
# Train the model using a Decision Tree
cls = make_pipeline(SimpleImputer(), DecisionTreeClassifier(random_state=random_seed))

# Define the parameter grid for RandomizedSearchCV
param_dist = {
    'decisiontreeclassifier__max_depth': randint(7, 15),
    'decisiontreeclassifier__min_samples_split': randint(12, 22),
    'decisiontreeclassifier__min_samples_leaf': randint(7, 22)
}

# Initialize the RandomizedSearchCV
random_search = RandomizedSearchCV(cls, param_distributions=param_dist, 
                                   n_iter=14, cv=3, 
                                   scoring='roc_auc', random_state=random_seed, 
                                   verbose=1, n_jobs=-1)

# Fit RandomizedSearchCV
random_search.fit(X_train, y_train)

# Best parameters found
print(f'Best parameters: {random_search.best_params_}')
print(f'Best AUC-ROC: {random_search.best_score_:.4f}')

'''

" \n# Train the model using a Decision Tree\ncls = make_pipeline(SimpleImputer(), DecisionTreeClassifier(random_state=random_seed))\n\n# Define the parameter grid for RandomizedSearchCV\nparam_dist = {\n    'decisiontreeclassifier__max_depth': randint(7, 15),\n    'decisiontreeclassifier__min_samples_split': randint(12, 22),\n    'decisiontreeclassifier__min_samples_leaf': randint(7, 22)\n}\n\n# Initialize the RandomizedSearchCV\nrandom_search = RandomizedSearchCV(cls, param_distributions=param_dist, \n                                   n_iter=14, cv=3, \n                                   scoring='roc_auc', random_state=random_seed, \n                                   verbose=1, n_jobs=-1)\n\n# Fit RandomizedSearchCV\nrandom_search.fit(X_train, y_train)\n\n# Best parameters found\nprint(f'Best parameters: {random_search.best_params_}')\nprint(f'Best AUC-ROC: {random_search.best_score_:.4f}')\n\n"

XG-Boost

In [22]:
import xgboost as xgb
# Entrenamiento y evaluación del modelo XGBoost
xgb_params = {
    'colsample_bytree': 0.75,
    'gamma': 0.5,
    'learning_rate': 0.075,
    'max_depth': 8,
    'min_child_weight': 1,
    'n_estimators': 1000,
    'reg_lambda': 0.5,
    'subsample': 0.75,
}

clf_xgb = xgb.XGBClassifier(
    objective='binary:logistic',
    seed=1234,
    eval_metric='auc',
    **xgb_params
)

# Entrenamiento del modelo
clf_xgb.fit(X_train_encoded, y_train, verbose=True, eval_set=[(X_val_encoded, y_val)])

[0]	validation_0-auc:0.82001
[1]	validation_0-auc:0.83763
[2]	validation_0-auc:0.83896
[3]	validation_0-auc:0.84223
[4]	validation_0-auc:0.84207
[5]	validation_0-auc:0.84251
[6]	validation_0-auc:0.84509
[7]	validation_0-auc:0.84561
[8]	validation_0-auc:0.84616
[9]	validation_0-auc:0.84667
[10]	validation_0-auc:0.84717
[11]	validation_0-auc:0.84796
[12]	validation_0-auc:0.84845
[13]	validation_0-auc:0.84892
[14]	validation_0-auc:0.84879
[15]	validation_0-auc:0.84943
[16]	validation_0-auc:0.84960
[17]	validation_0-auc:0.85005
[18]	validation_0-auc:0.85008
[19]	validation_0-auc:0.85053
[20]	validation_0-auc:0.85096
[21]	validation_0-auc:0.85101
[22]	validation_0-auc:0.85123
[23]	validation_0-auc:0.85123
[24]	validation_0-auc:0.85179
[25]	validation_0-auc:0.85188
[26]	validation_0-auc:0.85201
[27]	validation_0-auc:0.85220
[28]	validation_0-auc:0.85258
[29]	validation_0-auc:0.85280
[30]	validation_0-auc:0.85344
[31]	validation_0-auc:0.85382
[32]	validation_0-auc:0.85413
[33]	validation_0-au

KeyboardInterrupt: 

Random Search (xgboost)

In [None]:
''' 

params = {'max_depth': list(range(5, 10)),
          'learning_rate': uniform(scale = 0.2),
          'gamma': uniform(scale = 2),
          'reg_lambda': uniform(scale = 5),        # Parámetro de regularización.
          'subsample': uniform(0.5, 0.5),          # Entre 0.5 y 1.
          'min_child_weight': uniform(scale = 5),
          'colsample_bytree': uniform(0.75, 0.25), # Entre 0.75 y 1.
          'n_estimators': list(range(300, 800))
         }

best_score = 0
best_estimator = None
iterations = 10
i = 0
for g in ParameterSampler(params, n_iter = iterations, random_state = 2345):
    print(g)
    clf_xgb = xgb.XGBClassifier(objective = 'binary:logistic', seed = 2345, eval_metric = 'auc', **g)
    clf_xgb.fit(X_train_encoded, y_train, eval_set = [(X_val_encoded, y_val)], verbose = 100)

    y_pred = clf_xgb.predict_proba(X_val_encoded)[:, 1] # Obtenemos la probabilidad de una de las clases (cualquiera).
    auc_roc = sklearn.metrics.roc_auc_score(y_val, y_pred)
    # Guardamos si es mejor.
    if auc_roc > best_score:
        print(f'Mejor valor de ROC-AUC encontrado: {auc_roc}')
        best_score = auc_roc
        best_grid = g
        best_estimator = clf_xgb


print('ROC-AUC: %0.5f' % best_score)
print('Grilla:', best_grid)

'''

Test

In [12]:
# Asegúrate de que eval_data contenga tanto variables categóricas como numéricas
# Filtrar las columnas categóricas en el conjunto de test
eval_data_categorical = eval_data.select_dtypes(include=['object', 'category']).columns.tolist()

# Si en el conjunto de test faltan columnas categóricas que estaban en el entrenamiento, deberías añadirlas con valores NaN
missing_cols = set(categorical_features) - set(eval_data_categorical)
for col in missing_cols:
    eval_data[col] = np.nan  # Añadir la columna faltante

# Ahora aplicar la misma transformación que en el conjunto de entrenamiento
eval_data_encoded = preprocessor.transform(eval_data)  # Usar preprocessor.transform, no fit_transform

# Convertir a formato disperso si es necesario  
eval_data_encoded = sparse.csr_matrix(eval_data_encoded)

# Hacer predicciones con el modelo
y_preds_eval = clf_xgb.predict_proba(eval_data_encoded)[:, 1]

In [13]:
# Crear el archivo de envío
submission_df = pd.DataFrame({"id": eval_data["id"], "Label": y_preds_eval})
submission_df["id"] = submission_df["id"].astype(int)  # Asegúrate de que "id" sea entero
submission_df.to_csv("basic_model2.csv", sep=",", index=False)  # Guarda el archivo sin el índice