In [1]:
import os, pathlib, datetime as dt
import pandas as pd
import numpy as np
from dotenv import load_dotenv

# Pull any DATA_DIR_* overrides from a local .env file into the environment.
load_dotenv()

# Data locations: environment override first, repo-relative default otherwise.
RAW_DIR = pathlib.Path(os.environ.get("DATA_DIR_RAW", "data/raw"))
PROC_DIR = pathlib.Path(os.environ.get("DATA_DIR_PROCESSED", "data/processed"))

# Create both folders up front so later cells can write without checking.
for _data_dir in (RAW_DIR, PROC_DIR):
    _data_dir.mkdir(parents=True, exist_ok=True)

print("RAW_DIR:", RAW_DIR.resolve())
print("PROC_DIR:", PROC_DIR.resolve())

RAW_DIR: C:\Users\My PC\bootcamp_dimil_patel\homework\homework5\data\raw
PROC_DIR: C:\Users\My PC\bootcamp_dimil_patel\homework\homework5\data\processed


Creating Random DataFrame

In [2]:
# Fixed seed so "Restart Kernel -> Run All" regenerates exactly the same
# synthetic data (the global np.random state is unseeded and differs per run).
SEED = 42
rng = np.random.default_rng(SEED)

n = 180  # ~6 months of daily data
dates = pd.date_range('2024-01-01', periods=n, freq='D')
# Ages drawn from N(30, 6), then clipped into the 18-42 range.
age = rng.normal(30, 6, size=n).clip(18, 42)
income = rng.lognormal(mean=10.82, sigma=0.3, size=n)  # lognormal skew

# Assemble
df = pd.DataFrame({
    'date': dates,
    'age': age.round(0),
    'income': income.round(2),
})
df.head()

Unnamed: 0,date,age,income
0,2024-01-01,26.0,26897.08
1,2024-01-02,37.0,46257.21
2,2024-01-03,36.0,25075.84
3,2024-01-04,36.0,33176.24
4,2024-01-05,33.0,35312.61


In [3]:
# Raw copy: plain CSV, written without the positional index column.
csv_path = RAW_DIR / "Income_Data.csv"
df.to_csv(csv_path, index=False)
print("Saved CSV →", csv_path)

# Processed copy: Parquet. pandas delegates to whichever engine is installed,
# so a failure here is reported and the Parquet part of the demo is skipped.
parq_path = PROC_DIR / "Income_Data.parquet"
try:
    df.to_parquet(parq_path)
except Exception as e:
    print("Parquet save failed (engine missing?). Skipping Parquet demo.")
    print("Error:", e)
else:
    print("Saved Parquet →", parq_path)

Saved CSV → data\raw\Income_Data.csv
Saved Parquet → data\processed\Income_Data.parquet


Reload and validate

In [22]:
def validate_loaded(original: pd.DataFrame, reloaded: pd.DataFrame, cols=('date','age','income')):
    """Compare a reloaded frame against the one that was written out.

    Parameters
    ----------
    original : pd.DataFrame
        Frame as it was before being saved.
    reloaded : pd.DataFrame
        Frame read back from disk.
    cols : iterable of str
        Column names that must all exist in ``reloaded``.

    Returns
    -------
    dict
        Boolean results keyed by check name. ``shape_equal`` and
        ``cols_present`` are always present; ``age_is_float``,
        ``income_is_numeric`` and ``date_is_datetime`` are added only when
        the corresponding column exists in ``reloaded``.
    """
    # (column, result key, dtype predicate) for each dtype sanity check.
    dtype_probes = (
        ('age', 'age_is_float', pd.api.types.is_float_dtype),
        ('income', 'income_is_numeric', pd.api.types.is_numeric_dtype),
        ('date', 'date_is_datetime', pd.api.types.is_datetime64_any_dtype),
    )

    absent = [name for name in cols if name not in reloaded.columns]
    report = {
        'shape_equal': original.shape == reloaded.shape,
        'cols_present': not absent,
    }
    for column, key, probe in dtype_probes:
        if column in reloaded.columns:
            report[key] = probe(reloaded[column])
    return report

In [23]:
# CSV stores dates as text, so ask pandas to parse the column back to datetime.
df_csv = pd.read_csv(csv_path, parse_dates=['date'])
print('CSV validation:', validate_loaded(df, df_csv))

# The save cell skips Parquet when no engine is installed; mirror that here
# instead of crashing on a missing engine or a file that was never written.
try:
    df_parq = pd.read_parquet(parq_path)
except (ImportError, FileNotFoundError) as e:
    print('Parquet validation skipped:', e)
else:
    print('Parquet validation:', validate_loaded(df, df_parq))

CSV validation: {'shape_equal': True, 'cols_present': True, 'age_is_float': True, 'income_is_numeric': True, 'date_is_datetime': True}
Parquet validation: {'shape_equal': True, 'cols_present': True, 'age_is_float': True, 'income_is_numeric': True, 'date_is_datetime': True}


Write and Read Utilities

In [16]:
from typing import Union

def ensure_dir(path: Union[str, pathlib.Path]) -> None:
    """Create the parent directory of ``path`` if it does not exist yet.

    ``path`` is treated as a file path: only its parent is created, never
    ``path`` itself. Accepts either a string or a ``pathlib.Path``; missing
    intermediate directories are created and an existing directory is left
    untouched.
    """
    # Coerce first so plain strings work as well as Path objects.
    pathlib.Path(path).parent.mkdir(parents=True, exist_ok=True)

def detect_format(path: Union[str, pathlib.Path]):
    """Infer the storage format from the end of ``path`` (case-insensitive).

    Returns ``'csv'`` for ``.csv`` and ``'parquet'`` for ``.parquet``,
    ``.pq`` or ``.parq``.

    Raises
    ------
    ValueError
        If ``path`` ends with none of the recognised extensions.
    """
    lowered = str(path).lower()
    known_endings = (
        ('csv', ('.csv',)),
        ('parquet', ('.parquet', '.pq', '.parq')),
    )
    for fmt, endings in known_endings:
        if lowered.endswith(endings):
            return fmt
    raise ValueError('Unsupported format for: ' + str(path))

def write_df(df: pd.DataFrame, path: Union[str, pathlib.Path]):
    """Write ``df`` to ``path`` as CSV or Parquet, chosen from the extension.

    The parent directory is created if needed. CSV output omits the index.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to persist.
    path : str or pathlib.Path
        Destination file; its extension selects the format.

    Returns
    -------
    pathlib.Path
        The destination path.

    Raises
    ------
    ValueError
        If the extension is not a supported format.
    RuntimeError
        If the Parquet write fails. The message distinguishes a missing
        engine from any other failure; the original error is chained.
    """
    path = pathlib.Path(path)
    ensure_dir(path)
    fmt = detect_format(path)
    if fmt == 'csv':
        df.to_csv(path, index=False)
    elif fmt == 'parquet':
        try:
            df.to_parquet(path)
        except ImportError as e:
            # pandas raises ImportError when neither engine can be imported.
            raise RuntimeError('Parquet engine not available. Install pyarrow or fastparquet.') from e
        except Exception as e:
            # Keep RuntimeError for callers, but do not blame the engine for
            # unrelated failures (permissions, unsupported dtypes, ...).
            raise RuntimeError(f'Failed to write Parquet file {path}: {e}') from e
    return path
def read_df(path: Union[str, pathlib.Path]):
    """Read a CSV or Parquet file into a DataFrame, chosen from the extension.

    For CSV files, a ``date`` column (if present in the header) is parsed as
    datetime; other columns get pandas' default inference.

    Raises
    ------
    ValueError
        If the extension is not a supported format.
    RuntimeError
        If the Parquet read fails. The message distinguishes a missing
        engine from any other failure; the original error is chained.
    """
    path = pathlib.Path(path)
    fmt = detect_format(path)
    if fmt == 'csv':
        # Read only the header first: parse_dates needs to know whether the
        # 'date' column exists before the full read.
        header = pd.read_csv(path, nrows=0).columns
        if 'date' in header:
            return pd.read_csv(path, parse_dates=['date'])
        return pd.read_csv(path)
    elif fmt == 'parquet':
        try:
            return pd.read_parquet(path)
        except ImportError as e:
            # pandas raises ImportError when neither engine can be imported.
            raise RuntimeError('Parquet engine not available. Install pyarrow or fastparquet.') from e
        except Exception as e:
            # Keep RuntimeError for callers, but report the real cause
            # (missing file, corrupt data, ...) instead of blaming the engine.
            raise RuntimeError(f'Failed to read Parquet file {path}: {e}') from e
# Round-trip the frame through the helpers: CSV first, then Parquet.
testcsv = RAW_DIR / "Income_Data_util.csv"
testpq = PROC_DIR / "Income_Data_util.parquet"

df2 = read_df(write_df(df, testcsv))
print('Reloaded CSV via util, shape:', df2.shape)

# The helpers raise RuntimeError on Parquet failures; report and move on.
try:
    write_df(df, testpq)
    df3 = read_df(testpq)
except RuntimeError as e:
    print('Parquet util demo skipped:', e)
else:
    print('Reloaded Parquet via util, shape:', df3.shape)

Reloaded CSV via util, shape: (180, 3)
Reloaded Parquet via util, shape: (180, 3)
