In [1]:
# Importar las librerías necesarias
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pandas.api.types import is_numeric_dtype
import gc
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score


In [2]:
# Load the daily training files (ctr_15 .. ctr_21) and stack them into one frame.
files = [f"datosTP2/ctr_{day}.csv" for day in range(15, 22)]
# ignore_index=True gives the stacked frame a single 0..n-1 index. Without it each
# file restarts at 0 and the result carries duplicate index labels, which makes any
# later label-based lookup or alignment ambiguous.
combined_data = pd.concat([pd.read_csv(path) for path in files], ignore_index=True)

In [3]:
# Read the evaluation set; predictions for the submission are produced on this frame.
test_path = "datosTP2/ctr_test.csv"
eval_data = pd.read_csv(test_path)

In [4]:
# Feature engineering: 'ad_size' is the product creative_height * creative_width.
# The same column is added to the training frame and to the evaluation frame so
# both keep an identical feature set.
for frame in (combined_data, eval_data):
    frame['ad_size'] = frame['creative_height'] * frame['creative_width']

In [5]:
# Subsample the combined data, then split the subsample into train / validation.
# Both steps share one seed so the partition is reproducible.
random_seed = 2345

# Keep a random 80% of the rows; the remaining 20% is discarded, not held out.
combined_data = combined_data.sample(frac=0.8, random_state=random_seed)

# Separate the target column from the predictors.
X = combined_data.drop(columns=["Label"])
y = combined_data["Label"]

# Hold out 20% of the subsample for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_seed,
)

# X_train and X_val still contain both numeric and categorical columns here.
# Release the intermediates that are no longer needed to reduce memory use.
del combined_data, X, y
gc.collect()

7

In [6]:
# Count (frequency) encoding of the categorical columns.
# Relies on X_train and X_val produced by the train/validation split.

# Categorical columns are detected on the training split only.
categorical_features = list(X_train.select_dtypes(include=['object', 'category']).columns)

# For every categorical column add '<column>_count': how many times each value
# appears in X_train. The training counts are applied to both splits, so a value
# that never occurs in X_train maps to NaN in X_val.
for col in categorical_features:
    encoded_name = col + '_count'
    train_frequency = X_train[col].value_counts()
    for split in (X_train, X_val):
        split[encoded_name] = split[col].map(train_frequency)

# Feature matrices without the raw categorical columns, re-indexed from 0.
X_train_final, X_val_final = (
    split.drop(columns=categorical_features).reset_index(drop=True)
    for split in (X_train, X_val)
)


In [7]:
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV

# Pipeline: impute missing values -> project onto principal components -> random forest.
pipeline = make_pipeline(
    SimpleImputer(),  # default strategy replaces NaNs with the column mean
    # Keep 10 principal components (tune as needed). random_state seeds the
    # randomized SVD solver that PCA may auto-select on large inputs, so repeated
    # runs yield the same components, like every other seeded step in the notebook.
    PCA(n_components=10, random_state=42),
    RandomForestClassifier(random_state=42, n_jobs=-1),
)

# Search space; keys follow make_pipeline's '<lowercased class name>__<param>' naming.
param_distributions = {
    'randomforestclassifier__n_estimators': [50, 100],   # number of trees
    'randomforestclassifier__max_depth': [4, 6, 8],      # maximum tree depth
    'randomforestclassifier__min_samples_split': [2, 5],
    'randomforestclassifier__min_samples_leaf': [1, 2],
}

# Randomized search: 10 sampled configurations, 3-fold CV, ranked by ROC AUC.
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions,
    n_iter=10,
    cv=3,
    verbose=1,
    n_jobs=-1,
    scoring='roc_auc',
    random_state=42,
)


In [8]:
# Fit the hyper-parameter search on the training split.
# Use X_train_final (raw categorical columns dropped, count encodings kept):
# X_train still holds the object/category columns, which the mean-strategy
# SimpleImputer at the head of the pipeline cannot process.
random_search.fit(X_train_final, y_train)


Fitting 3 folds for each of 10 candidates, totalling 30 fits




In [None]:
# Evaluate the tuned model on the validation split.
# Score exactly the columns (and column order) the search was fitted on, so the
# raw categorical columns still present in X_val are never passed to the pipeline.
model_columns = list(random_search.feature_names_in_)
y_val_preds = random_search.predict_proba(X_val[model_columns])[:, 1]
val_auc_roc = roc_auc_score(y_val, y_val_preds)
print(f'Validation AUC-ROC: {val_auc_roc:.4f}')

In [None]:
# Predict on the evaluation set.
# The evaluation frame needs the same '<column>_count' features as the training
# data. They are built from the training-split counts; values never seen in
# X_train map to NaN and are filled by the pipeline's imputer.
eval_count_columns = {
    feature + '_count': eval_data[feature].map(X_train[feature].value_counts())
    for feature in categorical_features
}

# Keep only the columns the fitted search expects, in the order it saw them.
# This also leaves out 'id' and the raw categorical columns.
model_columns = list(random_search.feature_names_in_)
eval_data_pca = eval_data.assign(**eval_count_columns)[model_columns]
y_preds = random_search.predict_proba(eval_data_pca)[:, 1]

In [None]:
# Build the submission: one row per evaluation id (cast to int) with its
# predicted value from y_preds, written as a comma-separated file without the index.
submission_df = pd.DataFrame(
    {
        "id": eval_data["id"].astype(int),
        "Label": y_preds,
    }
)
submission_df.to_csv("random_forest_pca_submission.csv", sep=",", index=False)