# Imports



In [1]:
import os
import sys
from glob import glob

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Required for Spark to find the Python executable: point both the workers
# and the driver at the interpreter that is running this kernel.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Required to use the correct Java version. The path below is a
# machine-specific Homebrew location, so it is only applied when it actually
# exists; elsewhere any JAVA_HOME already present in the environment is left
# untouched instead of being replaced by a directory that is not there.
HOMEBREW_JAVA_HOME = '/opt/homebrew/opt/openjdk@17'
if os.path.isdir(HOMEBREW_JAVA_HOME):
    os.environ['JAVA_HOME'] = HOMEBREW_JAVA_HOME

# Start Spark App

In [None]:
# Configure the application name, then obtain the session
# (getOrCreate reuses an already-running session when there is one).
session_builder = SparkSession.builder.appName("interview-pre")
spark = session_builder.getOrCreate()

# Read Purchases JSON files

In [3]:
# Collect every purchases file; sorted so the read order is deterministic
# (glob returns matches in arbitrary order).
json_files = sorted(glob("../data/purchases_*.json"))

# Fail early with a clear message instead of handing Spark an empty path list.
if not json_files:
    raise FileNotFoundError("No files match ../data/purchases_*.json")

# @TODO
# Specify the schema explicitly: it is not complex, and doing so
# makes the JSON reading process lighter (no inference pass).

# multiLine=True: a JSON record may span several lines instead of
# being restricted to one record per line.
df_purchases = spark.read.json(json_files, multiLine=True)

In [None]:
# Preview the first 10 purchase rows.
df_purchases.show(n=10)

In [None]:
# Print the column names and types of the purchases DataFrame as read from JSON.
df_purchases.printSchema()

## Cast column types

In [5]:
# Re-type `timestamp` as a Spark timestamp and derive a `date` column from it.
cast_columns = {
    'timestamp': F.col('timestamp').cast('timestamp'),
    'date': F.to_date('timestamp'),
}
df_purchases_columns_cast = df_purchases.withColumns(cast_columns)

In [None]:
# Print the schema after the cast to check the resulting column types.
df_purchases_columns_cast.printSchema()

In [None]:
# Preview 3 rows of the re-typed purchases.
df_purchases_columns_cast.show(n=3)

## Fill missing values

In [6]:
# Replacement values for nulls, per column: a missing channel is labelled
# "No Channel" and a missing discount code becomes an empty string.
null_replacements = {
    "channel": "No Channel",
    "discount_code": "",
}
df_purchases_fillna = df_purchases_columns_cast.fillna(null_replacements)

In [None]:
# Preview the first 10 rows after the null replacement.
df_purchases_fillna.show(n=10)

# Read currency conversion CSV

In [7]:
# Relative path of the currency conversion CSV used by the cells below.
csv_currency_path = "../data/currency_conversion.csv"

In [None]:
# Read first 10 lines so I can see what to expect of the file content
with open(csv_currency_path) as f:
    for _ in range(10):
        print(f.readline(),
              end='')  # If a file is just one line long and it is huge, this call will still read the entire file in memory

In [None]:
# Bounded peek: read at most 2048 characters (the file is opened in text
# mode), no matter how long the individual lines are.
with open(csv_currency_path) as currency_file:
    text = currency_file.read(2048)
print(text)

In [8]:
# Load the conversion table; the first CSV line supplies the column names.
df_conversion = spark.read.csv(
    path=csv_currency_path,
    header=True,
)

In [None]:
# Preview the first 10 rows of the conversion table.
df_conversion.show(n=10)

## Extract rate from CSV

In [9]:
# Look up the USD row of the conversion table.
# first() returns None when no row matches the filter.
usd_row = (
    df_conversion
    .filter(F.col('currency') == 'USD')
    .select('conversion_rate_to_brl')
    .first()
)

# Fail with an explicit message instead of an opaque TypeError when the
# USD row (or its rate) is missing.
if usd_row is None or usd_row['conversion_rate_to_brl'] is None:
    raise ValueError("No USD conversion rate found in the currency conversion data")

# The CSV was read without schema inference, so the rate is a string here.
conversion_rate_to_brl = float(usd_row['conversion_rate_to_brl'])

# Add converted values

In [10]:
# Attach the rate that was applied and the purchase amount converted to BRL,
# rounded to 2 decimal places.
brl_columns = {
    'brl_conversion_rate': F.lit(conversion_rate_to_brl),
    'purchase_amount_brl': F.round(F.col('purchase_amount_usd') * conversion_rate_to_brl, 2),
}
df_purchases_brl = df_purchases_fillna.withColumns(brl_columns)


In [None]:
# Print the schema to confirm the two BRL columns were added.
df_purchases_brl.printSchema()

# Read customers file

In [38]:
# Spark expects its own datetime pattern syntax (yyyy-MM-dd) for dateFormat,
# not Python strftime directives such as %Y-%m-%d.
df_customers = spark.read.csv(
    '../data/sql_customers.csv',
    header=True,
    inferSchema=True,
    dateFormat="yyyy-MM-dd",
)
# Show the inferred column types.
df_customers.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- signup_date: timestamp (nullable = true)



## Cast timestamp to date

In [34]:
# Replace signup_date with its calendar-date value.
signup_as_date = F.to_date('signup_date')
df_customer_cast = df_customers.withColumn('signup_date', signup_as_date)

In [36]:
# Preview 2 customer rows after the date conversion.
df_customer_cast.show(n=2)

+---+---------------+-----------+
| id|           name|signup_date|
+---+---------------+-----------+
|  1|   Keith Gordon| 2024-09-29|
|  2|Jonathan Bolton| 2025-02-05|
+---+---------------+-----------+
only showing top 2 rows


In [41]:
# Print the schema to check the type of signup_date after the conversion.
df_customer_cast.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- signup_date: date (nullable = true)



# Join customers with purchases

In [54]:
# Spot-check a single customer (id 18) before joining.
customer_18 = df_customer_cast.where(F.col("id") == 18).first()
print(customer_18)

18


In [66]:
# Left join: every purchase is kept, and customer details are attached when
# the purchase's customer_id matches a customer id.
customers_lookup = df_customer_cast.select("id", "name", "signup_date")
join_condition = df_purchases_brl.customer_id == df_customer_cast.id

df_purchases_customers = (
    df_purchases_brl
    .join(customers_lookup, on=join_condition, how="left")
    .drop("id")              # duplicate of customer_id after the join
    .fillna({'name': ''})    # purchases without a matching customer get an empty name
)

In [67]:
# Preview 2 rows of the joined result.
df_purchases_customers.show(n=2)

+----------+-----------+-------------+-------------------+-------------------+----------+-------------------+-------------------+------------+-----------+
|   channel|customer_id|discount_code|purchase_amount_usd|          timestamp|      date|brl_conversion_rate|purchase_amount_brl|        name|signup_date|
+----------+-----------+-------------+-------------------+-------------------+----------+-------------------+-------------------+------------+-----------+
|     store|         45|     remember|             374.03|2025-04-21 16:40:36|2025-04-21|                5.3|            1982.36|            |       NULL|
|No Channel|         18|             |              245.4|2025-02-08 19:42:46|2025-02-08|                5.3|            1300.62|Kristin Dunn| 2023-12-27|
+----------+-----------+-------------+-------------------+-------------------+----------+-------------------+-------------------+------------+-----------+
only showing top 2 rows


# Group by customer and date (groupby, agg)

In [68]:
# One row per (customer_id, name, date): number of purchases and the total
# amount spent in USD and in BRL.
df_total_per_customer = df_purchases_customers.groupby(F.col('customer_id'), F.col('name'), F.col('date')).agg(
    # Column name fixed: it was misspelled "nr_purachases".
    F.count("*").alias("nr_purchases"),
    F.sum("purchase_amount_usd").alias("total_amount_usd"),
    F.sum("purchase_amount_brl").alias("total_amount_brl")
)


In [69]:
# Preview 5 rows of the aggregated totals.
df_total_per_customer.show(n=5)

+-----------+------------+----------+-------------+----------------+----------------+
|customer_id|        name|      date|nr_purachases|total_amount_usd|total_amount_brl|
+-----------+------------+----------+-------------+----------------+----------------+
|          6|   Karen Ray|2025-01-15|            1|          376.07|         1993.17|
|         47|            |2024-07-10|            1|          305.42|         1618.73|
|         14|Katelyn Dunn|2024-12-08|            1|          403.69|         2139.56|
|         29|            |2025-03-14|            1|          380.77|         2018.08|
|         28|            |2024-07-22|            1|           93.48|          495.44|
+-----------+------------+----------+-------------+----------------+----------------+
only showing top 5 rows


# Write to Parquet (partitioned)

In [None]:
# Persist the aggregates as Parquet partitioned by the `date` column,
# overwriting any existing data at the output path.
partitioned_writer = df_total_per_customer.write.partitionBy('date')
partitioned_writer.parquet("../data/output", mode="overwrite")