## Helper cleaning.py
The helper functions were later moved into the `/src/` folder.

In [2]:
import sys
from pathlib import Path

# Make the project root (the parent of 'src') importable so `src.cleaning` resolves.
PROJECT_ROOT = Path("..").resolve()  # assumes the kernel's working directory is /notebooks
# Guard the append so re-running this cell does not pile up duplicate sys.path entries.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

from src.cleaning import fill_missing_median, drop_missing, normalize_data

## Load Data

In [3]:
import os # interacts with your operating system (e.g., get environment variables).
import pathlib # a modern and cleaner way to handle file paths.
from pathlib import Path
import datetime 
import pandas as pd
from dotenv import load_dotenv # loads environment variables from a .env file into Python’s environment.
import numpy as np

In [4]:
PROJECT_ROOT = Path("..").resolve()  # absolute path is more stable than a relative one
ENV_PATH = PROJECT_ROOT / ".env"

# override=True: values already cached in os.environ are replaced by what is on disk in .env.
# NOTE(review): ENV_PATH is not passed to load_dotenv here, so the library's default
# .env lookup is used — confirm whether load_dotenv(ENV_PATH, override=True) was intended.
load_dotenv(override=True)
RAW = (PROJECT_ROOT / os.getenv("DATA_DIR_RAW", "data/raw")).resolve()
PROC = (PROJECT_ROOT / os.getenv("DATA_DIR_PROCESSED", "data/processed")).resolve()

# Refuse data directories that escape the project. Compare path components rather than
# string prefixes: a sibling such as "<root>_backup/data" starts with the same text as
# the root but is not inside it.
for _label, _data_dir in (("RAW", RAW), ("PROC", PROC)):
    if _data_dir != PROJECT_ROOT and PROJECT_ROOT not in _data_dir.parents:
        raise ValueError(f"{_label} path is outside project root! → {_data_dir}")

RAW.mkdir(parents=True, exist_ok=True)
PROC.mkdir(parents=True, exist_ok=True)
print('RAW ->', RAW.resolve())  # .resolve() returns the absolute path with symlinks and ".." / "." components cleaned up
print('PROC ->', PROC.resolve())

RAW -> /Users/fd/gitlocal/bootcamp_mingjia_jin/project/data/raw
PROC -> /Users/fd/gitlocal/bootcamp_mingjia_jin/project/data/processed


In [5]:
# Read the sample CSV from the raw-data folder; "Date" is parsed to datetime at load time
file_path = RAW / "sample_20250824-223858.csv"
df = pd.read_csv(file_path, parse_dates=["Date"])
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2015-01-02,27.8475,27.860001,26.8375,27.3325,24.261044,212818400
1,2015-01-05,27.0725,27.1625,26.352501,26.5625,23.577574,257142000
2,2015-01-06,26.635,26.8575,26.157499,26.565001,23.579796,263188400
3,2015-01-07,26.799999,27.049999,26.674999,26.9375,23.910435,160423600
4,2015-01-08,27.307501,28.0375,27.174999,27.9725,24.829128,237458000


## Apply cleaning functions

In [6]:
# Step 1: impute missing values with the median-based helper from src.cleaning
df_filled = df.pipe(fill_missing_median)

In [7]:
# Step 2: drop columns with too many missing values (threshold 0.5)
# NOTE(review): this runs after the imputation step, so numeric columns presumably have
# no NaNs left for the threshold to act on — confirm the intended order of steps 1 and 2.
df_dropped = df_filled.pipe(drop_missing, thresh=0.5)

In [8]:
# Step 3: put the numeric columns on a common scale via the normalization helper
df_cleaned = df_dropped.pipe(normalize_data)

In [9]:
# Show the cleaned frame as this cell's output (the rendered table elides the middle rows)
df_cleaned

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2015-01-02,-1.089266,-1.093366,-1.099694,-1.096824,-1.101457,1.459293
1,2015-01-05,-1.100432,-1.103306,-1.106752,-1.107905,-1.111235,2.110654
2,2015-01-06,-1.106735,-1.107652,-1.109589,-1.107869,-1.111204,2.199510
3,2015-01-07,-1.104358,-1.104909,-1.102058,-1.102508,-1.106473,0.689320
4,2015-01-08,-1.097046,-1.090837,-1.094782,-1.087613,-1.093330,1.821386
...,...,...,...,...,...,...,...
2671,2025-08-18,1.847702,1.831713,1.858475,1.832696,1.854724,-1.117464
2672,2025-08-19,1.841651,1.828151,1.847415,1.827947,1.850003,-1.089155
2673,2025-08-20,1.822921,1.793949,1.795316,1.762465,1.784908,-1.047106
2674,2025-08-21,1.769470,1.737660,1.766356,1.746490,1.769027,-1.218203


## Save cleaned dataset to /data/processed/

In [10]:
def ts():
    """Return the current local time as a ``YYYYMMDD-HHMMSS`` string for file names."""
    now = datetime.datetime.now()
    return f"{now:%Y%m%d-%H%M%S}"

# Take one timestamp up front so everything written by this cell shares the same suffix
timestamp = ts()

# Write the cleaned frame into the processed-data folder, leaving the index out of the file
csv_path = PROC.joinpath(f"aapl_cleaned_{timestamp}.csv")
df_cleaned.to_csv(csv_path, index=False)

print(f"✅ Cleaned CSV saved to: {csv_path}")

✅ Cleaned CSV saved to: /Users/fd/gitlocal/bootcamp_mingjia_jin/project/data/processed/aapl_cleaned_20250825-163141.csv


## Compare original vs cleaned data
Comparison: Original vs Cleaned Data

We compare the original and cleaned datasets to understand the impact of preprocessing:

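- The number of rows and columns stayed the same: no column exceeded the 50% missing-value threshold, so none were dropped.
- Median imputation guarantees zero missing values in numeric columns; this sample had no missing values to begin with, so imputation changed nothing here.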
- All numeric columns were standardized using z-score normalization.

In [11]:
from IPython.display import display # Displayed things more beautifully

df_raw = df  # second name for the loaded frame (an alias, not a copy)

# 1. Shapes of both frames, printed on consecutive lines
for label, frame in (("Original shape:", df_raw), ("Cleaned shape: ", df_cleaned)):
    print(label, frame.shape)

# 2. Per-column NaN counts, before and after cleaning, in one table
print("\n Missing values per column (Before vs After):")
missing_before = df_raw.isna().sum()
missing_after = df_cleaned.isna().sum()
missing_df = pd.DataFrame({"Before": missing_before, "After": missing_after})
display(missing_df)

# 3. Summary statistics for each frame
for header, frame in (
    ("\n Descriptive statistics (original):", df_raw),
    ("\n Descriptive statistics (cleaned):", df_cleaned),
):
    print(header)
    display(frame.describe())

# 4. Columns present in the original frame but absent from the cleaned one
dropped_cols = df_raw.columns.difference(df_cleaned.columns)
if dropped_cols.empty:
    print("\n No columns were dropped.")
else:
    print("\n Dropped columns due to missing values:", list(dropped_cols))


Original shape: (2676, 7)
Cleaned shape:  (2676, 7)

 Missing values per column (Before vs After):


Unnamed: 0,Before,After
Date,0,0
Open,0,0
High,0,0
Low,0,0
Close,0,0
Adj Close,0,0
Volume,0,0



 Descriptive statistics (original):


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
count,2676,2676.0,2676.0,2676.0,2676.0,2676.0,2676.0
mean,2020-04-26 03:05:39.013453056,103.452562,104.584179,102.40368,103.54522,101.249866,113517000.0
min,2015-01-02 00:00:00,22.5,22.9175,22.3675,22.584999,20.624048,23234700.0
25%,2017-08-28 18:00:00,38.699375,38.951875,38.355,38.71875,36.278358,67642580.0
50%,2020-04-27 12:00:00,79.172501,79.900002,78.557499,79.423752,76.879837,96469050.0
75%,2022-12-20 06:00:00,165.512497,167.284996,164.204998,165.660004,163.180233,138456600.0
max,2025-08-22 00:00:00,258.190002,260.100006,257.630005,259.019989,258.103729,648825200.0
std,,69.422133,70.185564,68.728512,69.497927,69.910315,68060340.0



 Descriptive statistics (cleaned):


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
count,2676,2676.0,2676.0,2676.0,2676.0,2676.0,2676.0
mean,2020-04-26 03:05:39.013453056,-1.699355e-16,8.496774e-17,0.0,8.496774e-17,-8.496774e-17,-1.274516e-16
min,2015-01-02 00:00:00,-1.166309,-1.1638,-1.164744,-1.165148,-1.153491,-1.326752
25%,2017-08-28 18:00:00,-0.9329199,-0.9353002,-0.932083,-0.9329571,-0.9295288,-0.6741517
50%,2020-04-27 12:00:00,-0.3498106,-0.3517645,-0.347027,-0.3471467,-0.348655,-0.2505296
75%,2022-12-20 06:00:00,0.8941174,0.8935247,0.899377,0.8939316,0.8860201,0.3665022
max,2025-08-22 00:00:00,2.229352,2.216195,2.258965,2.237532,2.244063,7.86667
std,,1.000187,1.000187,1.000187,1.000187,1.000187,1.000187



 No columns were dropped.


## Document all assumptions clearly

### Assumptions Made During Data Cleaning

In the cleaning process, we made the following assumptions to ensure consistency, robustness, and reusability of our pipeline:

---

#### 1. Missing Value Imputation
- **Assumption:** For all numeric columns, missing values are **not missing completely at random**, and therefore carry some statistical meaning.
- **Action:** We impute missing numeric values using the **median** of each column, which is **robust to outliers** and helps preserve distribution shape.

---

#### 2. Column Removal Based on Missing Ratio
- **Assumption:** Columns with more than **50% missing values** are **not trustworthy** and provide little usable information.
- **Action:** We drop any column where more than 50% of the values are missing. This helps reduce noise and dimensionality.

---

#### 3. Feature Scaling (Normalization)
- **Assumption:** The dataset may be used for machine learning or statistical models that are **sensitive to feature scale** (e.g., k-NN, regression).
- **Action:** We standardize all numeric features using **z-score normalization** (mean = 0, std = 1) to put them on the same scale.

---

#### 4. Date Parsing
- **Assumption:** The `Date` column is essential for time-based operations and must be parsed correctly.
- **Action:** We convert the `Date` column to `datetime` format at load time to ensure temporal operations can be applied properly.

---

#### 5. Reproducibility
- **Assumption:** Data pipelines should be repeatable and trackable across time.
- **Action:** We add a **timestamp** to all saved output filenames to ensure version control and easy comparison across runs.

---

These assumptions are based on standard data preprocessing practices and aim to balance data retention, statistical rigor, and pipeline clarity.
