In [1]:
# Load the IPython autoreload extension so the %autoreload magic is available.
%load_ext autoreload

In [2]:
# Mode 2: pick up edits to imported modules (e.g. the ifood_case package) live,
# without restarting the kernel.
%autoreload 2

## Imports

In [3]:
import pyspark
from ifood_case.feature_engineering import FeatureEngineering
from ifood_case.model_trainer import LGBMTrainer
from ifood_case.evaluator import Evaluator
from ifood_case.utils import find_optimal_threshold
import pyspark.sql.functions as F
import warnings

In [4]:
# Silence every Python warning for the rest of the session.
# NOTE(review): with no category/module argument this filter matches all
# warnings, including deprecation notices from the libraries used below —
# consider narrowing it once the noisy warnings are identified.
warnings.filterwarnings('ignore')

## Initialize Spark

In [5]:
# Obtain the Spark session for this notebook: reuse an already-running session
# if there is one, otherwise start a new one named "ifood-case".
spark = (
    pyspark.sql.SparkSession.builder
    .appName("ifood-case")
    .getOrCreate()
)

25/09/25 15:37:56 WARN Utils: Your hostname, solid resolves to a loopback address: 127.0.0.2; using 192.168.0.39 instead (on interface enp5s0)
25/09/25 15:37:56 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/25 15:37:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Variables

In [6]:
OFFER_COST = 0.50 # Cost of sending one offer to someone. Synthetic and symbolic value.

## Load Data

In [7]:
# Read the three pre-processed Parquet datasets into Spark DataFrames.
# The paths are relative, so the notebook must be run from a directory where
# ../data/processed exists.
offers = spark.read.parquet("../data/processed/offers_processed")
transactions = spark.read.parquet("../data/processed/transactions_processed")
profile = spark.read.parquet("../data/processed/profile_processed")

## Modelling

### Feature Engineering

In [8]:
# Build the feature-engineering helper from the three loaded DataFrames.
fe = FeatureEngineering(offers, transactions, profile)

In [9]:
# Run the feature pipeline. It returns the modelling DataFrame together with
# two collections, presumably the names of its numerical and categorical
# feature columns — TODO confirm against FeatureEngineering.transform.
df, numerical_columns, categorical_columns = fe.transform()

In [10]:
# Print the column names and types of the engineered dataset as a sanity check.
df.printSchema()

root
 |-- account_id: string (nullable = true)
 |-- offer_id: string (nullable = true)
 |-- time_received: double (nullable = true)
 |-- target: integer (nullable = false)
 |-- total_spend_before: double (nullable = true)
 |-- transaction_count_before: long (nullable = true)
 |-- avg_ticket_before: double (nullable = true)
 |-- max_ticket_before: double (nullable = true)
 |-- min_ticket_before: double (nullable = true)
 |-- offers_viewed_count_before: long (nullable = true)
 |-- offers_completed_count_before: long (nullable = true)
 |-- customer_conversion_rate_before: double (nullable = true)
 |-- last_offer_viewed_type: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: long (nullable = true)
 |-- age_group: string (nullable = true)
 |-- credit_card_limit: double (nullable = true)
 |-- month_sin: double (nullable = true)
 |-- month_cos: double (nullable = true)
 |-- dayofweek_sin: double (nullable = true)
 |-- dayofweek_cos: double (nullable = true)
 |-- offer_t

### Model Train

In [11]:
# Set up the trainer with the engineered dataset and the feature column
# collections; "target" is presumably the name of the label column (the schema
# above has an integer `target` column) — confirm against LGBMTrainer.
lgbm_trainer = LGBMTrainer(df, numerical_columns, categorical_columns, "target")

In [None]:
# Train the model. The call returns four objects, unpacked here as the
# train/test features and train/test labels that the evaluation cells reuse.
# NOTE(review): presumably the train/test split happens inside train() — confirm.
x_train, x_test, y_train, y_test = lgbm_trainer.train()

25/09/25 15:38:02 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

### Model Evaluate

In [None]:
# Create the evaluator around the held-out test labels.
evaluator = Evaluator(y_test)

In [None]:
# Predictions for the test features, as returned by the trainer's predict().
y_pred = lgbm_trainer.predict(x_test)

In [None]:
# Output of predict_proba() for the test features; reused below for the
# evaluation report and the threshold search.
y_pred_proba = lgbm_trainer.predict_proba(x_test)

In [None]:
# Build the evaluation report from both prediction outputs; the result is
# indexed by metric name in the cells that follow.
metrics = evaluator.report(y_pred, y_pred_proba)

In [None]:
# Display the report entry for the actual conversion rate on the test set (%).
metrics["actual_conversion_rate_test (%)"]

In [None]:
# Display the report entry for the model's conversion rate (%).
metrics["model_conversion_rate (%)"]

In [None]:
# Display the classification report stored in the evaluation results.
metrics["classification_report"]

#### Uplift

In [None]:
# Value attributed to one conversion: the mean of `max_ticket_before` over the
# training rows whose label is 1. Rows and column are selected in a single
# .loc call.
# NOTE(review): this uses the customer's *max* prior ticket as the value proxy;
# confirm that is intended rather than `avg_ticket_before`.
avg_conversion_value = x_train.loc[y_train == 1, 'max_ticket_before'].mean()

In [None]:
# Run the threshold search with the evaluator, the predict_proba output, the
# per-conversion value computed above and the per-offer cost constant.
# The result is only stored here; this cell displays nothing.
results_df = find_optimal_threshold(evaluator, y_pred_proba, avg_conversion_value, OFFER_COST)