# Phase 1A: Data Understanding (Raw Data Inspection) 

## Objective
- Understand the structure of the raw vendor, purchase, and sales data.
- Create statistically representative samples for downstream analysis.

In [9]:
# --------------------------------------------
# Phase 1A: Data Understanding
# Inspect raw CSV structure without loading
# entire large files into memory
# --------------------------------------------

import pandas as pd
import os

# Path to the raw data directory. The path is relative, so it is resolved
# against the kernel's current working directory when files are opened.
DATA_PATH = "../Data"

def inspect_csv(file_path, rows=5):
    """Print a short preview of a CSV file without reading it in full.

    Only the header and the first ``rows`` data rows are parsed, so the cost
    does not grow with the size of the file, and the file is closed as soon as
    the read returns (no partially consumed chunk reader is left behind).

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file to preview.
    rows : int, default 5
        Number of leading data rows to print. Must be non-negative.

    Returns
    -------
    None
        The file name, the preview rows and the column list are printed.
    """
    print(f"\nInspecting file: {os.path.basename(file_path)}")
    # nrows stops the parser after `rows` records instead of loading a
    # 100k-row chunk just to display its head.
    preview = pd.read_csv(file_path, nrows=rows)
    print(preview)
    print("\nColumns:", list(preview.columns))

# Preview every CSV in the raw data directory. os.listdir returns entries in
# arbitrary order, so the listing is sorted to keep the printed output
# reproducible from run to run.
for file in sorted(os.listdir(DATA_PATH)):
    if file.endswith(".csv"):
        inspect_csv(os.path.join(DATA_PATH, file))


Inspecting file: begin_inventory.csv
         InventoryId  Store          City  Brand                  Description  \
0  1_HARDERSFIELD_58      1  HARDERSFIELD     58  Gekkeikan Black & Gold Sake   
1  1_HARDERSFIELD_60      1  HARDERSFIELD     60       Canadian Club 1858 VAP   
2  1_HARDERSFIELD_62      1  HARDERSFIELD     62     Herradura Silver Tequila   
3  1_HARDERSFIELD_63      1  HARDERSFIELD     63   Herradura Reposado Tequila   
4  1_HARDERSFIELD_72      1  HARDERSFIELD     72         No. 3 London Dry Gin   

    Size  onHand  Price   startDate  
0  750mL       8  12.99  2024-01-01  
1  750mL       7  10.99  2024-01-01  
2  750mL       6  36.99  2024-01-01  
3  750mL       3  38.99  2024-01-01  
4  750mL       6  34.99  2024-01-01  

Columns: ['InventoryId', 'Store', 'City', 'Brand', 'Description', 'Size', 'onHand', 'Price', 'startDate']

Inspecting file: end_inventory.csv
         InventoryId  Store          City  Brand                  Description  \
0  1_HARDERSFIELD_58   

# Phase 1B: Smart Sampling for Large Datasets

In [11]:
# --------------------------------------------
# Phase 1B: Smart Sampling
# Sample large datasets safely using chunks
# --------------------------------------------

import pandas as pd
import os

# Raw input directory and the directory that receives the sampled CSVs
# (both are relative paths, resolved against the current working directory).
DATA_PATH = "../Data"
OUTPUT_PATH = "../Sampled"

# Create the output directory up front; exist_ok=True means an already
# existing directory is not an error, so the cell can be re-run safely.
os.makedirs(OUTPUT_PATH, exist_ok=True)

def chunk_sample_csv(input_file, output_file, frac, chunksize=100_000, random_state=42):
    """Write a random row sample of a CSV file, reading it chunk by chunk.

    The input is streamed in chunks of ``chunksize`` rows, a fraction ``frac``
    of every chunk is drawn without replacement, and the sampled rows are
    concatenated and written to ``output_file`` without the index column.

    Parameters
    ----------
    input_file : str or path-like
        CSV file to sample from.
    output_file : str or path-like
        Destination CSV for the sampled rows.
    frac : float
        Fraction of each chunk to keep (passed to ``DataFrame.sample``).
    chunksize : int, default 100_000
        Number of rows read per chunk.
    random_state : int or None, default 42
        Base seed. Chunk ``i`` is sampled with seed ``random_state + i``;
        ``None`` disables seeding, giving a non-reproducible sample.

    Returns
    -------
    None
        The sample is written to disk and a confirmation line is printed.
    """
    sampled_chunks = []

    for chunk_index, chunk in enumerate(pd.read_csv(input_file, chunksize=chunksize)):
        # Reusing one fixed seed for every chunk would pick the same row
        # positions in each equal-length chunk. Offsetting the seed by the
        # chunk index keeps the run reproducible while sampling each chunk
        # independently.
        seed = None if random_state is None else random_state + chunk_index
        sampled_chunks.append(chunk.sample(frac=frac, random_state=seed))

    sampled_df = pd.concat(sampled_chunks, ignore_index=True)
    sampled_df.to_csv(output_file, index=False)
    print(f"Saved: {output_file}")

In [13]:
# Purchases: draw a 15% sample into the output directory.
chunk_sample_csv(
    input_file=f"{DATA_PATH}/purchases.csv",
    output_file=f"{OUTPUT_PATH}/purchases_sampled.csv",
    frac=0.15,
)

Saved: ../Sampled/purchases_sampled.csv


In [15]:
# Sales: draw a 10% sample into the output directory.
chunk_sample_csv(
    input_file=f"{DATA_PATH}/sales.csv",
    output_file=f"{OUTPUT_PATH}/sales_sampled.csv",
    frac=0.10,
)

Saved: ../Sampled/sales_sampled.csv


In [17]:
# Inventory: both snapshots are sampled at the same 30% rate.

# Opening snapshot
chunk_sample_csv(
    input_file=f"{DATA_PATH}/begin_inventory.csv",
    output_file=f"{OUTPUT_PATH}/begin_inventory_sampled.csv",
    frac=0.30,
)

# Closing snapshot
chunk_sample_csv(
    input_file=f"{DATA_PATH}/end_inventory.csv",
    output_file=f"{OUTPUT_PATH}/end_inventory_sampled.csv",
    frac=0.30,
)

Saved: ../Sampled/begin_inventory_sampled.csv
Saved: ../Sampled/end_inventory_sampled.csv


In [19]:
# Purchase Prices: small enough to keep in full, so the file is read once and
# written back out unsampled under the "_sampled" name.
pd.read_csv(filepath_or_buffer=f"{DATA_PATH}/purchase_prices.csv").to_csv(
    path_or_buf=f"{OUTPUT_PATH}/purchase_prices_sampled.csv",
    index=False,
)

In [21]:
# Vendor Invoice: draw a 25% sample into the output directory.
chunk_sample_csv(
    input_file=f"{DATA_PATH}/vendor_invoice.csv",
    output_file=f"{OUTPUT_PATH}/vendor_invoice_sampled.csv",
    frac=0.25,
)

Saved: ../Sampled/vendor_invoice_sampled.csv
