In [1]:
# !pip install scikit-learn

In [2]:
import os, pathlib, datetime as dt
import pandas as pd
import numpy as np
import sys
from sklearn.preprocessing import MinMaxScaler, StandardScaler
sys.path.append(os.getcwd())
from src.cleaning import *
from dotenv import load_dotenv

load_dotenv()

RAW_DIR=pathlib.Path((os.getenv("DATA_DIR_RAW","data/raw")))
PROCESS_DIR=pathlib.Path(os.getenv("PROCESS_DIR_RAW","data/processed"))
NOTE_DIR=pathlib.Path(os.getenv("NOTE_DIR_RAW","notebook"))
SRC_DIR=pathlib.Path(os.getenv("SRC_DIR_RAW","src"))
RAW_DIR.mkdir(parents=True, exist_ok=True)
print("RAW_DIR:",RAW_DIR.resolve())
PROCESS_DIR.mkdir(parents=True, exist_ok=True)
print("PROCESS_DIR:",PROCESS_DIR.resolve())
NOTE_DIR.mkdir(parents=True, exist_ok=True)
print("RAW_DIR:",NOTE_DIR.resolve())
SRC_DIR.mkdir(parents=True, exist_ok=True)
print("RAW_DIR:",SRC_DIR.resolve())

RAW_DIR: C:\Users\My PC\bootcamp_dimil_patel\homework\homework6\data\raw
PROCESS_DIR: C:\Users\My PC\bootcamp_dimil_patel\homework\homework6\data\processed
RAW_DIR: C:\Users\My PC\bootcamp_dimil_patel\homework\homework6\notebook
RAW_DIR: C:\Users\My PC\bootcamp_dimil_patel\homework\homework6\src


In [3]:
csv_path = 'data/raw/unclean_data.csv'
os.makedirs('data', exist_ok=True)

if not os.path.exists(csv_path):
    df_demo = pd.DataFrame({
        'numeric_col': [10, None, 40, 55, 70],
        'category_col': ['A', 'B', 'A', 'B', 'C'],
        'price': ['$100', '$200', '$150', None, '$250'],
        'date_str': ['2025-08-01','2025-08-02',None,'2025-08-04','2025-08-05'],
        'category': ['Electronics','Furniture','Toys','Clothing',None]
    })
    df_demo.to_csv(csv_path, index=False)
    print(f"Demo CSV created at {csv_path}")
else:
    print(f"CSV already exists at {csv_path}")

Demo CSV created at data/raw/unclean_data.csv


Load Raw Dataset

In [4]:
df = pd.read_csv('data/unclean_data.csv')
df_clean=df
df.head()


Unnamed: 0,numeric_col,category_col,price,date_str,category
0,10.0,A,$100,2025-08-01,Electronics
1,,B,$200,2025-08-02,Furniture
2,40.0,A,$150,,Toys
3,55.0,B,,2025-08-04,Clothing
4,70.0,C,$250,2025-08-05,


Data Type Corrections

In [5]:
if 'price' in df.columns:
    df_clean['price'] = df_clean['price'].str.replace('$','').astype(float)
if 'date_str' in df.columns:
    df_clean['date_str'] = pd.to_datetime(df_clean['date_str'], errors='coerce')
if 'category' in df.columns:
    df_clean['category'] = df_clean['category'].str.lower().astype('category')
df_clean

Unnamed: 0,numeric_col,category_col,price,date_str,category
0,10.0,A,100.0,2025-08-01,electronics
1,,B,200.0,2025-08-02,furniture
2,40.0,A,150.0,NaT,toys
3,55.0,B,,2025-08-04,clothing
4,70.0,C,250.0,2025-08-05,


In [8]:
df_clean['date_str']=df_clean['date_str'].ffill()
df_clean = fill_missing_median(df_clean)
df_clean = drop_missing(df_clean, threshold=1)
df_clean = normalize_data(df_clean, method='minmax')
cleancsv_path = 'data/processed/clean_data.csv'
df_clean.to_csv(cleancsv_path, index=False)
df_clean

Unnamed: 0,numeric_col,category_col,price,date_str,category
0,0.0,A,0.0,2025-08-01,electronics
1,0.833333,B,1.0,2025-08-02,furniture
2,0.666667,A,0.5,2025-08-02,toys
3,1.0,B,0.75,2025-08-04,clothing


Comparison between Unclean vs clean

In [9]:
print(f"Unclean shape:       {df.shape}")
print(f"Clean shape: {df_clean.shape}")

Unclean shape:       (5, 5)
Clean shape: (4, 5)


In [10]:
print("=== Missing Values Unclean ===")
print(df.isna().sum().sum())
print("=== Missing Values Clean ===")
print(df_clean.isna().sum().sum())

=== Missing Values Unclean ===
3
=== Missing Values Clean ===
0
