### Cleaning Overview
- Input: Raw GA4 event-level Parquet snapshot
- Objective: Prepare analytics-ready dataset
- Key cleaning steps:
  - Null handling
  - Deduplication
  - Type standardization
  - Feature derivation
- Output: Clean Parquet for downstream analysis


In [1]:
# importing necessary libraries

import pandas as pd
from pathlib import Path
import numpy as np

In [None]:
# Data-source toggle: a sample file is included in the repo, so set
# use_sample = True to run the notebook against it instead of the raw snapshot.

use_sample = False

# Rely on truthiness instead of comparing against the literal True.
if use_sample:
    RAW_DATA_PATH = Path("../data/sample")
    raw_file = RAW_DATA_PATH / "fact_events_raw_sample.parquet"
else:
    RAW_DATA_PATH = Path("../data/raw")
    raw_file = RAW_DATA_PATH / "fact_events_raw.parquet"

# Directory that receives the cleaned Parquet file at the end of the notebook.
CLEAN_DATA_PATH = Path("../data/cleaned")


# Load the selected raw file and print each column's dtype and non-null count.
df = pd.read_parquet(raw_file)
df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2251084 entries, 0 to 2251083
Data columns (total 14 columns):
 #   Column                Non-Null Count    Dtype              
---  ------                --------------    -----              
 0   user_id               2251084 non-null  object             
 1   event_name            2251084 non-null  object             
 2   transaction_id        4786 non-null     object             
 3   event_date            2251084 non-null  datetime64[ns]     
 4   event_ts              2251084 non-null  datetime64[us, UTC]
 5   engagement_time_msec  1583200 non-null  Int64              
 6   session_id            2251084 non-null  Int64              
 7   device                2251084 non-null  object             
 8   country               2251084 non-null  object             
 9   region                2251084 non-null  object             
 10  city                  2251084 non-null  object             
 11  traffic_sname         2251084 non-nul

In [4]:
# Null handling and deduplication.
# Missing engagement times are replaced with 0; afterwards only the first row
# is kept for each (user_id, event_name, event_ts) combination.

dedup_keys = ['user_id', 'event_name', 'event_ts']
df = (
    df
    .assign(engagement_time_msec=df['engagement_time_msec'].fillna(0))
    .drop_duplicates(subset=dedup_keys)
)

In [8]:
# Normalize the traffic attribution columns: nulls become empty strings, then
# the text is lower-cased and surrounding whitespace is removed.

traffic_columns = ['traffic_smedium', 'traffic_source', 'traffic_sname']
for traffic_col in traffic_columns:
    filled = df[traffic_col].fillna('')
    df[traffic_col] = filled.str.lower().str.strip()


# Deriving categories for the marketing channel.
# Each label maps to a boolean mask over the normalized columns. The insertion
# order is preserved in `conditions` / `choices`, so when they are passed to
# np.select the first matching rule wins.

medium = df['traffic_smedium']
source = df['traffic_source']
sname = df['traffic_sname']
deleted_marker = '(data deleted)'

channel_rules = {
    # privacy: any of the three columns carries the deletion marker
    'unknown_privacy': (
        medium.eq(deleted_marker)
        | source.eq(deleted_marker)
        | sname.eq(deleted_marker)
    ),
    # direct traffic: medium '(none)' together with source '(direct)'
    'direct': medium.eq('(none)') & source.eq('(direct)'),
    # organic traffic
    'organic': medium.eq('organic'),
    # paid
    'paid': medium.eq('cpc'),
    # referral
    'referral': medium.eq('referral'),
}

conditions = list(channel_rules.values())
choices = list(channel_rules.keys())

In [9]:
# Label every row with its marketing channel; rows that satisfy none of the
# conditions are labelled 'others'.

channel_labels = np.select(condlist=conditions, choicelist=choices, default='others')
df['marketing_channel'] = channel_labels


In [10]:
# Cast the listed columns to the pandas category dtype and add boolean flags.

category_columns = ['event_name', 'device', 'country', 'marketing_channel']
df = df.astype({name: 'category' for name in category_columns})

# is_purchase: the row has a transaction_id.
# is_engaged_event: the row has a positive engagement time.
df = df.assign(
    is_purchase=df['transaction_id'].notna(),
    is_engaged_event=df['engagement_time_msec'].gt(0),
)

In [None]:
# Export the cleaned events as a Parquet file inside CLEAN_DATA_PATH.
# NOTE(review): the output file name is the same whether use_sample is True or
# False, so a sample run would overwrite a full cleaned export — confirm this
# is intended.

OUTPUT_PATH = CLEAN_DATA_PATH / "fact_events_cleaned.parquet"

# to_parquet does not create missing directories, so make sure the target
# folder exists before writing (no-op when it is already there).
CLEAN_DATA_PATH.mkdir(parents=True, exist_ok=True)

df.to_parquet(OUTPUT_PATH, index=False)