In [None]:
import pandas as pd
from google.cloud import bigquery
from pathlib import Path

In [None]:
# Create the BigQuery client that the query cell below uses to run both extracts.
# NOTE(review): no arguments are passed, so project and credentials presumably
# come from the environment — confirm the kernel is authenticated before running.

client = bigquery.Client()

In [None]:
# Set up the raw-data output directory and load the SQL text for both extracts.
# All paths are relative to the notebook's working directory.

RAW_DATA_PATH = Path("../data/raw")
# Create the output folder up front; exist_ok keeps the cell re-runnable.
RAW_DATA_PATH.mkdir(parents=True, exist_ok=True)

EVENTS_SQL_PATH = Path("../sql/fact_events.sql")
PRODUCTS_SQL_PATH = Path("../sql/fact_products.sql")

# Decode explicitly as UTF-8: a bare open() falls back to the platform's locale
# encoding, which can fail or mis-decode non-ASCII characters in the SQL files
# on systems whose default encoding is not UTF-8.
events_sql = EVENTS_SQL_PATH.read_text(encoding="utf-8")
products_sql = PRODUCTS_SQL_PATH.read_text(encoding="utf-8")

In [None]:
# Run each SQL script on BigQuery and pull its result set into a pandas DataFrame.

def _run_query(sql):
    """Submit `sql` through the shared client; return (query job, result DataFrame)."""
    job = client.query(sql)
    return job, job.to_dataframe()


# The events query is submitted and fully downloaded before the products query starts.
job1, events_df = _run_query(events_sql)
job2, products_df = _run_query(products_sql)

In [None]:
# Normalise 'event_date' for the next stage. This is required because pandas
# does not recognise the dbdate type that the column arrives with.

for frame in (events_df, products_df):
    # Overwrite the column in place with its pandas datetime equivalent.
    frame["event_date"] = pd.to_datetime(frame["event_date"])

In [None]:
# Persist the full raw extracts as parquet files in the raw-data folder.

OUTPUT_FILE_PATH1 = RAW_DATA_PATH.joinpath("fact_events_raw.parquet")
OUTPUT_FILE_PATH2 = RAW_DATA_PATH.joinpath("fact_products_raw.parquet")

# Events are written first, then products; the DataFrame index is not stored.
for frame, destination in (
    (events_df, OUTPUT_FILE_PATH1),
    (products_df, OUTPUT_FILE_PATH2),
):
    frame.to_parquet(destination, index=False)

In [None]:
# Write a small, reproducible sample of each dataset for committing to the repo.

SAMPLE_PATH = Path("../data/sample")
# Create the sample folder up front; exist_ok keeps the cell re-runnable.
SAMPLE_PATH.mkdir(parents=True, exist_ok=True)

SAMPLE_OUTPUT1 = SAMPLE_PATH / "fact_events_raw_sample.parquet"
SAMPLE_OUTPUT2 = SAMPLE_PATH / "fact_products_raw_sample.parquet"

SAMPLE_SIZE = 2000  # maximum number of rows written to each sample file
SAMPLE_SEED = 42    # fixed seed so the same rows are drawn on every run

# DataFrame.sample raises ValueError when asked for more rows than the frame
# holds (sampling is without replacement), so cap the request at the frame's
# length. Frames with at least SAMPLE_SIZE rows yield the same sample as before.
events_df.sample(
    n=min(SAMPLE_SIZE, len(events_df)), random_state=SAMPLE_SEED
).to_parquet(SAMPLE_OUTPUT1, index=False)
products_df.sample(
    n=min(SAMPLE_SIZE, len(products_df)), random_state=SAMPLE_SEED
).to_parquet(SAMPLE_OUTPUT2, index=False)