In [1]:
# Load IPython's autoreload extension (configured in the next cell).
%load_ext autoreload

In [2]:
# Mode 2: re-import modules before each cell runs, so edits to the local
# project package are picked up without restarting the kernel.
%autoreload 2

## Imports

In [3]:
import pyspark
from ifood_case.feature_engineering import FeatureEngineering
import pyspark.sql.functions as F

## Initialize Spark

In [4]:
# Create the notebook's SparkSession, or reuse one that is already running
# in this process (getOrCreate), under the application name "ifood-case".
spark = (
    pyspark.sql.SparkSession.builder
    .appName("ifood-case")
    .getOrCreate()
)

25/09/24 07:48:35 WARN Utils: Your hostname, solid resolves to a loopback address: 127.0.0.2; using 192.168.0.39 instead (on interface enp5s0)
25/09/24 07:48:35 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/24 07:48:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Load Data

In [5]:
# Read the three processed Parquet datasets. The paths are relative, so they
# resolve against the directory the notebook kernel was started from.
offers = spark.read.parquet("../data/processed/offers_processed")
transactions = spark.read.parquet("../data/processed/transactions_processed")
profile = spark.read.parquet("../data/processed/profile_processed")

## Modelling

### Feature Engineering

In [6]:
# Print the column names and types of the transactions input before it is
# handed to the feature-engineering step.
transactions.printSchema()

root
 |-- account_id: string (nullable = true)
 |-- event: string (nullable = true)
 |-- time_since_test_start: double (nullable = true)
 |-- offer_completed: string (nullable = true)
 |-- offer_received_viewed: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- reward: double (nullable = true)



In [7]:
# Instantiate the project's FeatureEngineering class with the offers,
# transactions and profile DataFrames loaded above.
fe = FeatureEngineering(offers, transactions, profile)

In [8]:
# Run the feature-engineering transform; `df` is the Spark DataFrame that the
# following cells inspect.
df = fe.transform()

In [9]:
# List the columns (and their types) of the engineered dataset.
df.printSchema()

root
 |-- account_id: string (nullable = true)
 |-- offer_id: string (nullable = true)
 |-- time_received: double (nullable = true)
 |-- target: integer (nullable = false)
 |-- total_spend_before: double (nullable = true)
 |-- transaction_count_before: long (nullable = true)
 |-- avg_ticket_before: double (nullable = true)
 |-- max_ticket_before: double (nullable = true)
 |-- min_ticket_before: double (nullable = true)
 |-- offers_viewed_count_before: long (nullable = true)
 |-- offers_completed_count_before: long (nullable = true)
 |-- customer_conversion_rate_before: double (nullable = true)
 |-- last_offer_viewed_type: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: long (nullable = true)
 |-- credit_card_limit: double (nullable = true)
 |-- month_sin: double (nullable = true)
 |-- month_cos: double (nullable = true)
 |-- dayofweek_sin: double (nullable = true)
 |-- dayofweek_cos: double (nullable = true)
 |-- discount_value: long (nullable = true)
 |-- dura

In [10]:
# Preview the first 5 engineered rows; truncate=False prints full cell values
# (e.g. the complete account/offer ids) instead of shortening them.
df.show(5, truncate=False)

25/09/24 07:48:41 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+--------------------------------+--------------------------------+-------------+------+------------------+------------------------+-----------------+-----------------+-----------------+--------------------------+-----------------------------+-------------------------------+----------------------+------+----+-----------------+-----------------------+-----------------------+-------------------+--------------------+--------------+--------+---------+-------------+-----+------+------+---+
|account_id                      |offer_id                        |time_received|target|total_spend_before|transaction_count_before|avg_ticket_before|max_ticket_before|min_ticket_before|offers_viewed_count_before|offers_completed_count_before|customer_conversion_rate_before|last_offer_viewed_type|gender|age |credit_card_limit|month_sin              |month_cos              |dayofweek_sin      |dayofweek_cos       |discount_value|duration|min_value|offer_type   |email|social|mobile|web|
+-------------------

In [11]:
# Print the schema of the offers input for comparison with `df`.
# NOTE(review): `channels` is an array of strings here, while `df` has separate
# email/social/mobile/web columns -- presumably derived from this array inside
# FeatureEngineering; confirm.
offers.printSchema()

root
 |-- channels: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- discount_value: long (nullable = true)
 |-- duration: double (nullable = true)
 |-- id: string (nullable = true)
 |-- min_value: long (nullable = true)
 |-- offer_type: string (nullable = true)



In [12]:
# Print the schema of the customer profile input.
profile.printSchema()

root
 |-- age: long (nullable = true)
 |-- credit_card_limit: double (nullable = true)
 |-- gender: string (nullable = true)
 |-- id: string (nullable = true)
 |-- registered_on: string (nullable = true)
 |-- age_group: string (nullable = true)



In [13]:
# Preview the profile rows with untruncated values; with no row count given,
# show() prints its default of 20 rows.
# NOTE(review): in the output below, rows with age == 118 have NULL gender and
# credit_card_limit yet still fall in the "51+" age_group -- looks like a
# placeholder age for missing profiles; confirm how these rows are handled
# downstream.
profile.show(truncate=False)

+---+-----------------+------+--------------------------------+-------------+---------+
|age|credit_card_limit|gender|id                              |registered_on|age_group|
+---+-----------------+------+--------------------------------+-------------+---------+
|118|NULL             |NULL  |68be06ca386d4c31939f3a4f0e3dd783|20170212     |51+      |
|55 |112000.0         |F     |0610b486422d4921ae7d2bf64640c50b|20170715     |51+      |
|118|NULL             |NULL  |38fe809add3b4fcf9315a9694bb96ff5|20180712     |51+      |
|75 |100000.0         |F     |78afa995795e4d85b5d9ceeca43f5fef|20170509     |51+      |
|118|NULL             |NULL  |a03223e636434f42ac4c3df47e8bac43|20170804     |51+      |
|68 |70000.0          |M     |e2127556f4f64592b11af22de27a7932|20180426     |51+      |
|118|NULL             |NULL  |8ec6ce2a7e7949b1bf142def7d0e0586|20170925     |51+      |
|118|NULL             |NULL  |68617ca6246f4fbc85e91a2a49552598|20171002     |51+      |
|65 |53000.0          |M     |38