# Imports

In [None]:
import pandas as pd
from glob import glob

# Read Purchases JSON files

In [None]:
# Sort the matches: glob() returns paths in arbitrary, filesystem-dependent
# order, and the frames are later concatenated with ignore_index=True, so an
# unsorted list would make the row order vary between machines and runs.
json_files = sorted(glob('../data/purchases_*.json'))

# One DataFrame per purchases file, in the same order as json_files.
df_purchases_list = [pd.read_json(file) for file in json_files]

In [None]:
# Stack the per-file frames into a single table; ignore_index=True discards
# each file's own row labels in favour of one fresh 0..n-1 index.
df_purchases = pd.concat(objs=df_purchases_list, ignore_index=True)

In [None]:
# Preview the first 10 rows of the combined purchases frame.
df_purchases.head(10)

In [None]:
# Check the inferred column dtypes; the `.dt.date` step further down requires
# `timestamp` to be a datetime column.
df_purchases.dtypes

## Fill missing values

In [136]:
# Fill missing values per column, then derive a calendar-date column.
# `assign` is given a callable so that `date` is computed from the dataframe
# returned by `fillna()`; writing `df_purchases['timestamp'].dt.date` instead
# would reference the original frame rather than the filled one.
df_purchases_fillna = (
    df_purchases
    .fillna(value={'channel': 'No Channel', 'discount_code': ''})
    .assign(date=lambda filled: filled['timestamp'].dt.date)
)

In [135]:
# Display the frame after filling missing values and adding the `date` column.
df_purchases_fillna

Unnamed: 0,customer_id,purchase_amount_usd,timestamp,discount_code,channel,date
0,15,127.71,2025-03-18 14:03:59,,No Channel,2025-03-18
1,1,147.23,2025-01-08 02:27:08,,No Channel,2025-01-08
2,44,354.33,2024-07-13 08:36:35,news,web,2024-07-13
3,50,251.06,2024-09-26 21:06:29,though,store,2024-09-26
4,30,63.10,2024-08-29 15:34:50,line,web,2024-08-29
...,...,...,...,...,...,...
495,45,329.82,2024-08-24 12:14:46,that,No Channel,2024-08-24
496,41,87.95,2025-05-11 03:32:22,,web,2025-05-11
497,19,171.71,2024-07-27 03:33:29,,No Channel,2024-07-27
498,8,192.92,2025-05-25 05:04:18,window,No Channel,2025-05-25


# Read currency conversion CSV

In [None]:
# Load the currency conversion table; later cells read its `currency` and
# `conversion_rate_to_brl` columns.
df_conversion = pd.read_csv(
    '../data/currency_conversion.csv',
)

In [None]:
# Check dtypes; `conversion_rate_to_brl` is later multiplied with the USD
# amounts, so it needs to be numeric.
df_conversion.dtypes

In [None]:
# Display the conversion table.
df_conversion

In [None]:
# .loc with a boolean mask and a column label: selects the USD row(s) and the
# rate column in a single indexing operation.
usd_rates = df_conversion.loc[df_conversion['currency'] == 'USD', 'conversion_rate_to_brl']

# Fail with a clear message instead of the generic out-of-bounds IndexError
# that `.iloc[0]` raises when no row matches.
if usd_rates.empty:
    raise ValueError("No 'USD' row found in the currency conversion table")

# Despite the `_col` suffix, this is a single scalar rate (first matching row).
rate_col = usd_rates.iloc[0]

# There are several ways to extract a value from a specific row and column.
# The version applied here was chosen for performance and readability.
# Other ways could be:
# df[df['currency'] == 'USD']['conversion_rate_to_brl'].iloc[0] -> Two operations but still fast
# df.query("currency == 'USD'")['conversion_rate_to_brl'].iloc[0] -> String parsing overhead, two operations
# df[df['currency'] == 'USD']['conversion_rate_to_brl'].values[0] -> Converts to numpy array unnecessarily

In [None]:
# Show the extracted USD -> BRL rate (a scalar, despite the `_col` suffix).
rate_col

# Add converted values

In [137]:
# Add the BRL columns with a single `assign()` call. Compared with copying the
# dataframe and then setting the new columns one by one, `assign()`:
#   - is declarative: the added columns are listed in one place;
#   - chains cleanly with other dataframe methods;
#   - returns a new dataframe and leaves `df_purchases_fillna` unmodified.
df_purchases_brl = df_purchases_fillna.assign(
    brl_conversion_rate=rate_col,
    # Callable form: evaluated against the frame being assigned to.
    purchase_amount_brl=lambda purchases: (purchases['purchase_amount_usd'] * rate_col).round(2),
)

In [None]:
# Display purchases with the added `brl_conversion_rate` and
# `purchase_amount_brl` columns.
df_purchases_brl

# Read customers file

In [None]:
# Load the customers table, parsing `signup_date` as a datetime while reading.
df_customers = pd.read_csv(
    '../data/sql_customers.csv',
    parse_dates=['signup_date'],
)

In [None]:
# Display the customers frame. (Swap in `df_customers.dtypes` to confirm that
# `signup_date` was parsed as a datetime.)
df_customers

# Join customers with purchases

In [141]:
# Attach customer attributes to every purchase.
# - how="left" keeps purchases whose customer_id has no match in df_customers.
# - validate="many_to_one" makes the merge raise if `id` is duplicated in
#   df_customers; without it, a duplicated customer would silently duplicate
#   purchase rows and inflate any totals computed from the result.
# - `name` is filled with "" for unmatched customers so that the later groupby
#   on `name` keeps those rows (groupby excludes NaN keys by default).
#   `signup_date` is not in the fill mapping and stays NaT for those rows.
df_purchases_customers = (
    df_purchases_brl
    .merge(
        df_customers,
        left_on="customer_id",
        right_on="id",
        how="left",
        validate="many_to_one",
    )
    .drop(columns="id")  # right-hand join key; customer_id already carries it
    .fillna({'name': ""})
)

In [142]:
# Display purchases enriched with customer `name` and `signup_date`; purchases
# without a matching customer show an empty name and NaT.
df_purchases_customers

Unnamed: 0,customer_id,purchase_amount_usd,timestamp,discount_code,channel,date,brl_conversion_rate,purchase_amount_brl,name,signup_date
0,15,127.71,2025-03-18 14:03:59,,No Channel,2025-03-18,5.3,676.86,John Gonzalez,2025-04-03
1,1,147.23,2025-01-08 02:27:08,,No Channel,2025-01-08,5.3,780.32,Keith Gordon,2024-09-29
2,44,354.33,2024-07-13 08:36:35,news,web,2024-07-13,5.3,1877.95,,NaT
3,50,251.06,2024-09-26 21:06:29,though,store,2024-09-26,5.3,1330.62,,NaT
4,30,63.10,2024-08-29 15:34:50,line,web,2024-08-29,5.3,334.43,,NaT
...,...,...,...,...,...,...,...,...,...,...
495,45,329.82,2024-08-24 12:14:46,that,No Channel,2024-08-24,5.3,1748.05,,NaT
496,41,87.95,2025-05-11 03:32:22,,web,2025-05-11,5.3,466.14,,NaT
497,19,171.71,2024-07-27 03:33:29,,No Channel,2024-07-27,5.3,910.06,Aaron Park,2023-10-06
498,8,192.92,2025-05-25 05:04:18,window,No Channel,2025-05-25,5.3,1022.48,James Taylor,2023-08-21


# Group by customer and date (groupby, agg)

In [160]:
# Sum the USD and BRL amounts for each (customer_id, name, date) combination,
# i.e. one row per customer per purchase date. as_index=False keeps the
# grouping keys as ordinary columns instead of moving them into the index.
df_total_per_customer = df_purchases_customers.groupby(
    ['customer_id', 'name', 'date'],
    as_index=False,
).agg(
    total_amount_usd=("purchase_amount_usd", "sum"),
    total_amount_brl=("purchase_amount_brl", "sum"),
)

In [161]:
# Display the totals: one row per (customer_id, name, date) group.
df_total_per_customer

Unnamed: 0,customer_id,name,date,total_amount_usd,total_amount_brl
0,1,Keith Gordon,2024-08-05,273.11,1447.48
1,1,Keith Gordon,2024-08-10,335.90,1780.27
2,1,Keith Gordon,2024-09-20,388.59,2059.53
3,1,Keith Gordon,2024-10-10,135.17,716.40
4,1,Keith Gordon,2025-01-08,147.23,780.32
...,...,...,...,...,...
488,50,,2024-12-06,10.14,53.74
489,50,,2024-12-22,345.25,1829.82
490,50,,2025-02-20,457.06,2422.42
491,50,,2025-05-19,222.59,1179.73


In [163]:
# Spot-check the aggregated rows for a single customer (id 44).
df_total_per_customer.loc[df_total_per_customer["customer_id"].eq(44)]

Unnamed: 0,customer_id,name,date,total_amount_usd,total_amount_brl
413,44,,2024-06-27,134.75,714.18
414,44,,2024-07-13,354.33,1877.95
415,44,,2024-07-20,315.85,1674.0
416,44,,2024-07-24,323.61,1715.13
417,44,,2024-12-09,70.12,371.64
418,44,,2024-12-22,112.76,597.63
419,44,,2024-12-24,350.31,1856.64
420,44,,2024-12-30,22.21,117.71
421,44,,2025-01-14,36.67,194.35
422,44,,2025-02-13,155.77,825.58
