In [27]:
import os, pathlib, datetime as dt
from dotenv import load_dotenv
import zipfile
import numpy as np
import pandas as pd

In [28]:


# Step 1: Find project root dynamically (look for folder containing .env)
current_dir = os.getcwd()
project_root = current_dir

while not os.path.isfile(os.path.join(project_root, ".env")):
    # Go one directory up
    parent_dir = os.path.abspath(os.path.join(project_root, ".."))
    if parent_dir == project_root:
        raise FileNotFoundError(".env file not found in any parent directory")
    project_root = parent_dir

# Step 2: Load .env from project root
load_dotenv(os.path.join(project_root, ".env"))

RAW = pathlib.Path(os.getenv('DATA_DIR_RAW', 'data/raw'))
PROC = pathlib.Path(os.getenv('DATA_DIR_PROCESSED', 'data/processed'))

RAW.mkdir(parents=True, exist_ok=True)
PROC.mkdir(parents=True, exist_ok=True)
print('RAW ->', RAW.resolve())
print('PROC ->', PROC.resolve())

RAW -> /Users/aditya/Documents/bootcamp_aditya_shah/project/data_storage/data/raw
PROC -> /Users/aditya/Documents/bootcamp_aditya_shah/project/data_storage/data/processed


In [29]:
path = os.getenv("DOWNLOAD_DATA_ACQUISITION_DIRECTORY") + "/USDZAR-2025-07.zip"
print('Path to zip file:', path)

Path to zip file: /Users/aditya/Documents/bootcamp_aditya_shah/project/data_acquisition/data/USDZAR-2025-07.zip


In [30]:
df = pd.read_csv(path, header= None)
print(df.head())


         0                      1        2        3
0  USD/ZAR  20250701 00:00:00.078  17.7147  17.7272
1  USD/ZAR  20250701 00:00:00.141  17.7123  17.7338
2  USD/ZAR  20250701 00:00:00.156  17.7167  17.7267
3  USD/ZAR  20250701 00:00:00.234  17.7185  17.7267
4  USD/ZAR  20250701 00:00:00.234  17.7185  17.7272


In [31]:
df.columns = ["exchange", "date_time", "bid", "ask"]
df.head(5)

Unnamed: 0,exchange,date_time,bid,ask
0,USD/ZAR,20250701 00:00:00.078,17.7147,17.7272
1,USD/ZAR,20250701 00:00:00.141,17.7123,17.7338
2,USD/ZAR,20250701 00:00:00.156,17.7167,17.7267
3,USD/ZAR,20250701 00:00:00.234,17.7185,17.7267
4,USD/ZAR,20250701 00:00:00.234,17.7185,17.7272


In [32]:

# TODO: Save CSV
csv_path = RAW / "USD_ZAR_2025_07.csv"
df.to_csv(csv_path, index=False)
csv_path

# TODO: Save Parquet
pq_path = PROC / f"USD_ZAR_2025_07.parquet"
try:
    df.to_parquet(pq_path)
except Exception as e:
    print('Parquet engine not available. Install pyarrow or fastparquet to complete this step.')
    pq_path = None
pq_path

PosixPath('/Users/aditya/Documents/bootcamp_aditya_shah/project/data_storage/data/processed/USD_ZAR_2025_07.parquet')

In [33]:


def validate_loaded(original: pd.DataFrame, reloaded: pd.DataFrame) -> dict:
    """
    Validate that the reloaded DataFrame matches expected structure and dtypes.
    """
    checks = {
        # Shape check
        'shape_equal': original.shape == reloaded.shape,
        
        # Column checks
        'has_columns': list(reloaded.columns) == ["exchange", "date_time", "bid", "ask"],
        
        # Type checks
        'exchange_is_string': pd.api.types.is_string_dtype(reloaded['exchange']) if 'exchange' in reloaded.columns else False,
        'date_time_is_datetime': pd.api.types.is_datetime64_any_dtype(reloaded['date_time']) if 'date_time' in reloaded.columns else False,
        'bid_is_numeric': pd.api.types.is_numeric_dtype(reloaded['bid']) if 'bid' in reloaded.columns else False,
        'ask_is_numeric': pd.api.types.is_numeric_dtype(reloaded['ask']) if 'ask' in reloaded.columns else False,
    }
    return checks


In [34]:
if pq_path:
    try:
        df_pq = pd.read_parquet(pq_path)
        validate_loaded(df, df_pq)
    except Exception as e:
        print('Parquet read failed:', e)