## Load Data

In [0]:
import os
import numpy as np

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    roc_auc_score,
    classification_report,
    average_precision_score,
    confusion_matrix,
    accuracy_score,
    precision_recall_fscore_support
)

from sklearn.base import clone
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

In [0]:

# Create the Spark session, or reuse the one that is already active
_session_builder = SparkSession.builder.appName("ifood-case")
spark = _session_builder.getOrCreate()
spark

In [0]:
# Catalog / schema / volume names used to build the /Volumes path below
catalog = "workspace"
schema = "default"
volume_name = "ifood"

# User running the notebook, as reported by Spark SQL's current_user()
_current_user_rows = spark.sql('select current_user() as user').collect()
user_id = _current_user_rows[0]["user"]

# Root folder of the volume (keeps its trailing slash)
catalog_volume_path = f"/Volumes/{catalog}/{schema}/{volume_name}/"

In [0]:
# Directory that holds the processed dataset. os.path.join is used because
# catalog_volume_path already ends with "/", so plain string concatenation
# produced a doubled slash ("...//data/processed").
processed_path = os.path.join(catalog_volume_path, "data", "processed")

In [0]:
# Load the processed dataset into pandas.
# Every failure mode raises immediately: continuing with an empty DataFrame
# only postpones the error to a confusing KeyError in the cells below.
if not os.path.exists(processed_path):
    raise FileNotFoundError(
        f"Diretório {processed_path} não encontrado. "
        "Execute o Notebook 1 primeiro para gerar os dados processados."
    )

dataset_path = os.path.join(processed_path, "final_dataset.parquet")

# Only existence is required: spark.read.parquet accepts both a directory of
# part files and a single parquet file, so a directory check would wrongly
# skip the latter.
if not os.path.exists(dataset_path):
    raise FileNotFoundError(
        f"Dataset {dataset_path} não encontrado. "
        "Execute o Notebook 1 primeiro para gerar os dados processados."
    )

try:
    df_spark = spark.read.parquet(dataset_path)
    df = df_spark.toPandas()
except Exception as e:
    # Surface the failure with context instead of silently keeping an empty df.
    raise RuntimeError(f"Erro ao carregar dataset completo: {e}") from e

# Row count is taken from the pandas frame to avoid a second Spark job.
row_count = len(df)
print(f"Dataset carregado: {row_count} linhas")

print(f"\nShape: {df.shape}")
df.head()

In [0]:
# List the columns available in the loaded dataset
df.columns

## Target Distribution

In [0]:
# Class balance of the target: absolute counts first, then the positive share
converged_target = df["converged"]

print("Target Distribution:")
print(converged_target.value_counts())
print(f"\nConversion rate: {converged_target.mean():.2%}")

## Preprocess

In [0]:
# Show the columns again as a reference for the drop list in the next cell
df.columns

In [0]:

# Identifier columns are kept in a separate frame, outside the model features.
# Both frames get a fresh 0..n-1 index below, so they stay row-aligned without
# any further re-indexing.
id_columns = df[["account_id", "offer_id"]].reset_index(drop=True)

# Columns removed before modelling: identifiers, event counters, and raw
# columns that are replaced by another representation.
cols_drop = [
    "account_id",
    "offer_id",
    "received",
    "viewed",
    "completed",
    "count_completed",
    "age_group",  # we will use age_bin (proved better in previous correlations)
    "offer_type",  # we will use encoding
    "gender",  # we will use encoding
    "registered_on",
    "date_registered_on",
    "year_month",
    "bucket_id",
    "channels",  # we will use encoding
]

# Only drop what is actually present so the cell tolerates schema changes.
cols_drop = [c for c in cols_drop if c in df.columns]
df_model = df.drop(columns=cols_drop, errors="ignore").reset_index(drop=True)

# Categorical features to one-hot encode. dict.fromkeys removes duplicates
# while preserving order; columns already dropped above are filtered out.
categorical_candidates = ["offer_type", "gender", "age_bin", "age_group"]
categorical_cols = [
    c for c in dict.fromkeys(categorical_candidates) if c in df_model.columns
]

# Encoding: driven by the filtered list, so a missing column no longer raises
# a KeyError and the list is the single place to edit.
if categorical_cols:
    df_model = pd.get_dummies(df_model, columns=categorical_cols, drop_first=True)

print(f"Null Values (Before Fill): {df_model.isnull().sum().sum()}")
# Remaining nulls are replaced with 0 across all columns.
df_model = df_model.fillna(0)

print(f"\nShape: {df_model.shape}")
print(f"Columns {len(df_model.columns)} colunas")
df_model.head()

## Split Data

In [0]:
# Separate the target column from the feature matrix
target_col = "converged"
y = df_model[target_col]
X = df_model.drop(columns=[target_col])

print(f"X: {X.shape}")
print(f"y: {y.shape}")
print(f"converged = True in y: {y.mean():.2%}")

# 70/30 hold-out split, stratified on the target, with a fixed seed
split_options = {"test_size": 0.30, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"X_train: {X_train.shape[0]} linhas")
print(f"X_test: {X_test.shape[0]} linhas")
print(f"Converged = True in y_train: {y_train.mean():.2%}")
print(f"Converged = True in y_test: {y_test.mean():.2%}")

## Train Models

Testing multiple models

In [0]:
def _get_score_vector(model, X):
    if hasattr(model, "predict_proba"):
        return model.predict_proba(X)[:, 1]
    if hasattr(model, "decision_function"):
        return model.decision_function(X)
    return model.predict(X)


def evaluate_model(model_name, estimator, X_train, y_train, X_test, y_test):
    """Fit a fresh clone of ``estimator`` and evaluate it on the test split.

    Parameters
    ----------
    model_name : label stored under the ``"model"`` key of the metrics dict.
    estimator : unfitted estimator; it is cloned, so the original is not fitted.
    X_train, y_train : data used to fit the clone.
    X_test, y_test : data used to compute every metric.

    Returns
    -------
    tuple
        ``(metrics, report, cm, fitted_model)`` where ``metrics`` is a dict
        with ROC AUC, PR AUC, precision, recall, F1 and accuracy, ``report``
        is the text classification report, ``cm`` the confusion matrix and
        ``fitted_model`` the fitted clone.
    """
    fitted_model = clone(estimator)
    fitted_model.fit(X_train, y_train)

    # Hard labels feed the threshold-based metrics; scores feed the ranking ones.
    predicted_labels = fitted_model.predict(X_test)
    predicted_scores = _get_score_vector(fitted_model, X_test)

    metrics = {"model": model_name}
    metrics["roc_auc"] = roc_auc_score(y_test, predicted_scores)
    metrics["pr_auc"] = average_precision_score(y_test, predicted_scores)

    prec, rec, f1_value, _support = precision_recall_fscore_support(
        y_test, predicted_labels, average="binary", zero_division=0
    )
    metrics.update(precision=prec, recall=rec, f1=f1_value)
    metrics["accuracy"] = accuracy_score(y_test, predicted_labels)

    report = classification_report(y_test, predicted_labels)
    cm = confusion_matrix(y_test, predicted_labels)

    return metrics, report, cm, fitted_model

# Candidate estimators, keyed by the display name used in the results table.
# Every model that accepts a seed uses random_state=42.
model_grid = {
    "Logistic Regression": LogisticRegression(max_iter=2000, n_jobs=-1, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(
        n_estimators=300,
        max_depth=14,
        random_state=42,
        n_jobs=-1,
        class_weight="balanced_subsample"
    ),
    "XGBoost": XGBClassifier(random_state=42, eval_metric='logloss', n_jobs=-1),
    # verbose=0 silences CatBoost's per-iteration training log, which would
    # otherwise flood the output of the training cell.
    "CatBoost": CatBoostClassifier(random_state=42, verbose=0),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
}

# Containers filled by the training loop, one entry per model.
model_results = []
trained_models = {}
classification_reports = {}
confusion_matrices = {}



In [0]:
# Start from empty containers so that re-running this cell does not append
# duplicate rows to model_results (the cell is now idempotent).
model_results = []
trained_models = {}
classification_reports = {}
confusion_matrices = {}

# Fit and evaluate every candidate on the same train/test split.
for name, estimator in model_grid.items():
    print("=" * 70)
    print(f"Model: {name}")
    metrics, report, cm, fitted = evaluate_model(
        name, estimator, X_train, y_train, X_test, y_test
    )
    model_results.append(metrics)
    trained_models[name] = fitted
    classification_reports[name] = report
    confusion_matrices[name] = cm

    print(f"ROC AUC: {metrics['roc_auc']:.4f}")
    print(f"PR AUC: {metrics['pr_auc']:.4f}")
    print("\nClassification Report:")
    print(report)
    print("Confusion Matrix:")
    print(cm)

In [0]:
# One row per model, best ROC AUC first, with a clean 0..n-1 index
metrics_frame = pd.DataFrame(model_results)
ranked_frame = metrics_frame.sort_values(by="roc_auc", ascending=False)
models_metrics = ranked_frame.reset_index(drop=True)

models_metrics

## Analyse Results

## ML Explanation (SHAP)

## Next Steps:
- Hyperparameter tuning
- Cross Validation
- Create pipeline
- A/B test to validate the strategy
- Listen to concerns and strategies from stakeholders (adapt strategy).
- More features analysis and explainability.
- API to provide explanations for each score/recommendation.

