# Homework 6 — Stage 06: Data Preprocessing
**Name:** Panwei Hu  
**Date:** 2025-08-19
## Objectives
- Implement comprehensive data cleaning functions
- Handle missing values with multiple strategies
- Apply data normalization and scaling techniques
- Create a reusable preprocessing pipeline
- Test with real-world messy data scenarios

## Focus Areas
- **Missing Value Handling**: Multiple imputation strategies
- **Data Normalization**: Scaling and standardization techniques
- **Outlier Detection**: Statistical and robust methods
- **Feature Engineering**: Data transformation and encoding
- **Pipeline Creation**: Modular, reusable preprocessing functions


In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Dict, Union, Optional, Tuple
import warnings
warnings.filterwarnings('ignore')

# Announce the notebook and prepare the working folders
print("📊 Homework 6: Data Preprocessing")
print("=" * 50)

# Folder locations, resolved relative to the notebook's working directory
raw_dir = 'data/raw'
processed_dir = 'data/processed'

# Make sure both folders exist; exist_ok=True makes re-running this cell safe
for folder in (raw_dir, processed_dir):
    os.makedirs(folder, exist_ok=True)

# Report where the folders ended up on disk
print("✅ Directory structure created:")
print(f"   Raw data: {os.path.abspath(raw_dir)}")
print(f"   Processed data: {os.path.abspath(processed_dir)}")

# Plot defaults shared by every figure in the notebook
plt.style.use('default')
plt.rcParams['figure.figsize'] = (12, 6)
sns.set_palette("husl")


📊 Homework 6: Data Preprocessing
✅ Directory structure created:
   Raw data: /Users/panweihu/Desktop/Desktop_m1/NYU_mfe/bootcamp/camp4/bootcamp_bill_panwei_hu/homework/homework6/data/raw
   Processed data: /Users/panweihu/Desktop/Desktop_m1/NYU_mfe/bootcamp/camp4/bootcamp_bill_panwei_hu/homework/homework6/data/processed


In [2]:
# Generate Sample Dataset with Missing Values and Outliers
#
# Builds a synthetic 1000-row frame, injects outliers and missing values, and
# writes it to <raw_dir>/comprehensive_dataset.csv.
print("🔧 Creating comprehensive test dataset...")

# Set random seed for reproducibility. Every np.random call below draws from
# this one global stream, so the ORDER of the calls determines the data.
np.random.seed(42)

# Create a more comprehensive dataset
n_samples = 1000
n_outliers = 50  # rows that receive an injected outlier (half income, half age)

# Generate base data
data = {
    # Numeric columns with different patterns
    'age': np.random.normal(35, 12, n_samples),
    'income': np.random.lognormal(10.5, 0.8, n_samples),
    'score': np.random.beta(2, 5, n_samples),
    # Cast to float up front: NaNs are written into this column below, and
    # assigning NaN into an int64 column relies on implicit upcasting.
    'years_experience': np.random.poisson(8, n_samples).astype(float),
    'rating': np.random.uniform(1, 5, n_samples),

    # Categorical columns
    'city': np.random.choice(['New York', 'San Francisco', 'Chicago', 'Austin', 'Boston'], n_samples,
                            p=[0.3, 0.25, 0.2, 0.15, 0.1]),
    'department': np.random.choice(['Engineering', 'Sales', 'Marketing', 'HR', 'Finance'], n_samples),
    'education': np.random.choice(['Bachelor', 'Master', 'PhD', 'High School'], n_samples,
                                 p=[0.4, 0.35, 0.15, 0.1])
}

# Create DataFrame
df = pd.DataFrame(data)

# Add some outliers intentionally: the first half of the sampled rows get an
# inflated income, the second half an implausibly high age.
outlier_indices = np.random.choice(df.index, size=n_outliers, replace=False)
income_outlier_rows = outlier_indices[:n_outliers // 2]
age_outlier_rows = outlier_indices[n_outliers // 2:]
df.loc[income_outlier_rows, 'income'] *= 5  # High income outliers
df.loc[age_outlier_rows, 'age'] = np.random.uniform(80, 95, len(age_outlier_rows))  # Age outliers

# Introduce missing values with different patterns (fraction of rows per column)
missing_patterns = {
    'age': 0.15,  # 15% missing randomly
    'income': 0.10,  # 10% missing
    'score': 0.20,  # 20% missing
    'years_experience': 0.05,  # 5% missing
    'city': 0.08,  # 8% missing
    'education': 0.12  # 12% missing
}

for col, missing_rate in missing_patterns.items():
    missing_indices = np.random.choice(df.index, size=int(len(df) * missing_rate), replace=False)
    df.loc[missing_indices, col] = np.nan

# Add a column with mostly missing values.
# Kept as a single assignment on purpose: Python evaluates the right-hand side
# (the normal draw) before the row selection on the left, and splitting the
# statement carelessly would swap the two random draws and change the data.
df['optional_field'] = np.nan
df.loc[np.random.choice(df.index, size=50, replace=False), 'optional_field'] = np.random.normal(100, 20, 50)

# Save to raw data
csv_path = os.path.join(raw_dir, 'comprehensive_dataset.csv')
df.to_csv(csv_path, index=False)

# Tally nulls once and reuse the result for both summary lines
missing_by_column = df.isnull().sum()
print(f"✅ Dataset created and saved to {csv_path}")
print(f"   Shape: {df.shape}")
print(f"   Missing values: {missing_by_column.sum()}")
print(f"   Columns with missing data: {missing_by_column[missing_by_column > 0].to_dict()}")

# Display basic info
print(f"\n📊 Dataset Overview:")
df.info()  # info() prints its own report and returns None; wrapping it in print() adds a stray "None"
print(f"\n📈 First 5 rows:")
df.head()


🔧 Creating comprehensive test dataset...
✅ Dataset created and saved to data/raw/comprehensive_dataset.csv
   Shape: (1000, 9)
   Missing values: 1650
   Columns with missing data: {'age': 150, 'income': 100, 'score': 200, 'years_experience': 50, 'city': 80, 'education': 120, 'optional_field': 950}

📊 Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               850 non-null    float64
 1   income            900 non-null    float64
 2   score             800 non-null    float64
 3   years_experience  950 non-null    float64
 4   rating            1000 non-null   float64
 5   city              920 non-null    object 
 6   department        1000 non-null   object 
 7   education         880 non-null    object 
 8   optional_field    50 non-null     float64
dtypes: float64(6), object(3)
memory usage: 70.4+ KB
None

Unnamed: 0,age,income,score,years_experience,rating,city,department,education,optional_field
0,40.96057,111244.342997,,8.0,2.071007,San Francisco,Finance,High School,
1,33.340828,76092.649284,,12.0,3.287596,Chicago,HR,,
2,,38089.894737,0.276197,10.0,4.858742,Chicago,HR,Bachelor,
3,53.276358,21643.286173,0.094254,8.0,4.502227,San Francisco,Marketing,Master,
4,32.19016,63486.25158,0.411845,11.0,4.503736,Chicago,Engineering,Master,


In [3]:
# Read the generated CSV back in and summarise how much of it is missing
df_raw = pd.read_csv('data/raw/comprehensive_dataset.csv')
print(f"📊 Dataset loaded: {df_raw.shape}")

# Per-column null tally and its share of all rows, expressed as a percentage
print("\n🔍 Missing Value Analysis:")
missing_counts = df_raw.isnull().sum()
missing_pct = missing_counts.div(len(df_raw)).mul(100)

# Keep only columns that actually have gaps, worst offenders first
missing_summary = (
    pd.DataFrame({
        'Missing_Count': missing_counts,
        'Missing_Percentage': missing_pct,
    })
    .loc[lambda frame: frame['Missing_Count'] > 0]
    .sort_values('Missing_Percentage', ascending=False)
)
print(missing_summary)

print("\n📈 Dataset Info:")
df_raw.info()

print("\n📋 First few rows:")
df_raw.head()


📊 Dataset loaded: (1000, 9)

🔍 Missing Value Analysis:
                  Missing_Count  Missing_Percentage
optional_field              950                95.0
score                       200                20.0
age                         150                15.0
education                   120                12.0
income                      100                10.0
city                         80                 8.0
years_experience             50                 5.0

📈 Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               850 non-null    float64
 1   income            900 non-null    float64
 2   score             800 non-null    float64
 3   years_experience  950 non-null    float64
 4   rating            1000 non-null   float64
 5   city              920 non-null    object 
 6   department        1000 non

Unnamed: 0,age,income,score,years_experience,rating,city,department,education,optional_field
0,40.96057,111244.342997,,8.0,2.071007,San Francisco,Finance,High School,
1,33.340828,76092.649284,,12.0,3.287596,Chicago,HR,,
2,,38089.894737,0.276197,10.0,4.858742,Chicago,HR,Bachelor,
3,53.276358,21643.286173,0.094254,8.0,4.502227,San Francisco,Marketing,Master,
4,32.19016,63486.25158,0.411845,11.0,4.503736,Chicago,Engineering,Master,


In [4]:
# Import our custom cleaning module and test it
import sys
sys.path.append('src')

try:
    from cleaning import DataCleaner, fill_missing_median, fill_missing_mean, normalize_data, detect_outliers
    print("📦 ✅ Custom cleaning module imported successfully")
except ImportError as e:
    print(f"❌ Import error: {e}")
    print("Creating basic cleaning functions inline...")

    # Basic inline function if import fails.
    # NOTE: only fill_missing_median gets a stand-in here; DataCleaner,
    # fill_missing_mean, normalize_data and detect_outliers stay undefined
    # when the import fails.
    def fill_missing_median(df, columns):
        """Return a copy of ``df`` with NaNs in numeric ``columns`` replaced by the column median.

        Parameters
        ----------
        df : pandas.DataFrame
            Input frame; it is never modified.
        columns : iterable of str
            Column names to impute. Names that are absent from ``df`` and
            columns that are not numeric (or are boolean) are skipped.

        Returns
        -------
        pandas.DataFrame
            A new frame with the selected numeric columns imputed.
        """
        df_clean = df.copy()
        for col in columns:
            if col not in df_clean.columns:
                continue
            series = df_clean[col]
            # Accept every numeric dtype (int32, float32, nullable Int64, ...),
            # not just int64/float64; booleans are excluded on purpose.
            if not pd.api.types.is_numeric_dtype(series) or pd.api.types.is_bool_dtype(series):
                continue
            # Assign the filled column back instead of calling
            # fillna(inplace=True) on a column selection, which does not
            # update the parent frame under pandas Copy-on-Write.
            df_clean[col] = series.fillna(series.median())
        return df_clean

# Load the dataset
df_raw = pd.read_csv('data/raw/comprehensive_dataset.csv')

# Tally nulls per column once; the grand total and the per-column view below
# are both derived from this Series.
missing_summary = df_raw.isnull().sum()
print(f"📊 Loaded dataset: {df_raw.shape}")
print(f"Missing values: {missing_summary.sum()}")

# Show basic statistics
print("\n📈 Dataset Info:")
df_raw.info()  # info() prints its own report and returns None; print() around it adds a stray "None"
print("\n📊 Missing data by column:")
# Last expression of the cell: rendered as the cell output
missing_summary[missing_summary > 0]


📦 ✅ Custom cleaning module imported successfully
📊 Loaded dataset: (1000, 9)
Missing values: 1650

📈 Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               850 non-null    float64
 1   income            900 non-null    float64
 2   score             800 non-null    float64
 3   years_experience  950 non-null    float64
 4   rating            1000 non-null   float64
 5   city              920 non-null    object 
 6   department        1000 non-null   object 
 7   education         880 non-null    object 
 8   optional_field    50 non-null     float64
dtypes: float64(6), object(3)
memory usage: 70.4+ KB
None

📊 Missing data by column:


age                 150
income              100
score               200
years_experience     50
city                 80
education           120
optional_field      950
dtype: int64

In [5]:
# Import our custom cleaning module
import sys
sys.path.append('src')
from cleaning import DataCleaner, fill_missing_median, fill_missing_mean, normalize_data, detect_outliers

print("📦 Imported custom cleaning module")

# Read the raw CSV produced by the generation cell
df_raw = pd.read_csv('data/raw/comprehensive_dataset.csv')
print(f"📊 Loaded dataset: {df_raw.shape}")

# Ask the project's DataCleaner for a missing-data report and print the
# headline figures it returns
missing_analysis = DataCleaner.analyze_missing_data(df_raw)
print("\n🔍 Missing Data Analysis:")
print(f"   Total missing values: {missing_analysis['total_missing']}")
print(f"   Columns with missing data: {missing_analysis['columns_with_missing']}")

# List only the columns whose missing share is above zero
print("\n📊 Missing percentages by column:")
for col, pct in missing_analysis['missing_percentages'].items():
    if not pct > 0:
        continue
    print(f"   {col}: {pct:.1f}%")


📦 Imported custom cleaning module
📊 Loaded dataset: (1000, 9)

🔍 Missing Data Analysis:
   Total missing values: 1650
   Columns with missing data: ['age', 'income', 'score', 'years_experience', 'city', 'education', 'optional_field']

📊 Missing percentages by column:
   age: 15.0%
   income: 10.0%
   score: 20.0%
   years_experience: 5.0%
   city: 8.0%
   education: 12.0%
   optional_field: 95.0%


In [6]:
# Import our custom cleaning module
import sys
sys.path.append('src')
from cleaning import DataCleaner, fill_missing_median, fill_missing_mean, normalize_data, detect_outliers

# NOTE(review): this cell repeats the preceding missing-data analysis cell
# line for line; it reloads the same CSV and prints the same report.
print("📦 Imported custom cleaning module")

# Reload the raw dataset from disk
raw_csv = 'data/raw/comprehensive_dataset.csv'
df_raw = pd.read_csv(raw_csv)
print(f"📊 Loaded dataset: {df_raw.shape}")

# Run the DataCleaner report and pull out the pieces that get printed
missing_analysis = DataCleaner.analyze_missing_data(df_raw)
total_missing = missing_analysis['total_missing']
affected_columns = missing_analysis['columns_with_missing']
print("\n🔍 Missing Data Analysis:")
print(f"   Total missing values: {total_missing}")
print(f"   Columns with missing data: {affected_columns}")

# Per-column percentages, skipping columns with nothing missing
print("\n📊 Missing percentages by column:")
for col, pct in missing_analysis['missing_percentages'].items():
    if pct > 0:
        line = f"   {col}: {pct:.1f}%"
        print(line)


📦 Imported custom cleaning module
📊 Loaded dataset: (1000, 9)

🔍 Missing Data Analysis:
   Total missing values: 1650
   Columns with missing data: ['age', 'income', 'score', 'years_experience', 'city', 'education', 'optional_field']

📊 Missing percentages by column:
   age: 15.0%
   income: 10.0%
   score: 20.0%
   years_experience: 5.0%
   city: 8.0%
   education: 12.0%
   optional_field: 95.0%


In [7]:
# Generate Sample Dataset with Missing Values and Outliers
#
# NOTE(review): this cell repeats the dataset-generation cell near the top of
# the notebook. Because the seed is reset to 42 and the random calls run in
# the same order, it rebuilds the same frame and overwrites the CSV in place.
print("🔧 Creating comprehensive test dataset...")

# Set random seed for reproducibility. All np.random calls below share this
# global stream, so their order determines the generated data.
np.random.seed(42)

# Create a more comprehensive dataset
n_samples = 1000
n_outliers = 50  # rows that receive an injected outlier (half income, half age)

# Generate base data
data = {
    # Numeric columns with different patterns
    'age': np.random.normal(35, 12, n_samples),
    'income': np.random.lognormal(10.5, 0.8, n_samples),
    'score': np.random.beta(2, 5, n_samples),
    # Stored as float from the start: NaNs are assigned into this column
    # below, and writing NaN into an int64 column relies on implicit upcasting.
    'years_experience': np.random.poisson(8, n_samples).astype(float),
    'rating': np.random.uniform(1, 5, n_samples),

    # Categorical columns
    'city': np.random.choice(['New York', 'San Francisco', 'Chicago', 'Austin', 'Boston'], n_samples,
                            p=[0.3, 0.25, 0.2, 0.15, 0.1]),
    'department': np.random.choice(['Engineering', 'Sales', 'Marketing', 'HR', 'Finance'], n_samples),
    'education': np.random.choice(['Bachelor', 'Master', 'PhD', 'High School'], n_samples,
                                 p=[0.4, 0.35, 0.15, 0.1])
}

# Create DataFrame
df = pd.DataFrame(data)

# Add some outliers intentionally: first half of the sampled rows -> income,
# second half -> age.
outlier_indices = np.random.choice(df.index, size=n_outliers, replace=False)
half = n_outliers // 2
df.loc[outlier_indices[:half], 'income'] *= 5  # High income outliers
df.loc[outlier_indices[half:], 'age'] = np.random.uniform(80, 95, n_outliers - half)  # Age outliers

# Introduce missing values with different patterns (fraction of rows per column)
missing_patterns = {
    'age': 0.15,  # 15% missing randomly
    'income': 0.10,  # 10% missing
    'score': 0.20,  # 20% missing
    'years_experience': 0.05,  # 5% missing
    'city': 0.08,  # 8% missing
    'education': 0.12  # 12% missing
}

for col, missing_rate in missing_patterns.items():
    missing_indices = np.random.choice(df.index, size=int(len(df) * missing_rate), replace=False)
    df.loc[missing_indices, col] = np.nan

# Add a column with mostly missing values.
# Left as one statement deliberately: the right-hand side (normal draw) is
# evaluated before the row selection on the left, and that order is part of
# what makes the output reproducible.
df['optional_field'] = np.nan
df.loc[np.random.choice(df.index, size=50, replace=False), 'optional_field'] = np.random.normal(100, 20, 50)

# Save to raw data
csv_path = os.path.join(raw_dir, 'comprehensive_dataset.csv')
df.to_csv(csv_path, index=False)

# Count nulls once; both summary lines below reuse the result
null_counts = df.isnull().sum()
print(f"✅ Dataset created and saved to {csv_path}")
print(f"   Shape: {df.shape}")
print(f"   Missing values: {null_counts.sum()}")
print(f"   Columns with missing data: {null_counts[null_counts > 0].to_dict()}")

# Display basic info
print(f"\n📊 Dataset Overview:")
df.info()  # prints directly and returns None, so it must not be wrapped in print()
print(f"\n📈 First 5 rows:")
df.head()


🔧 Creating comprehensive test dataset...
✅ Dataset created and saved to data/raw/comprehensive_dataset.csv
   Shape: (1000, 9)
   Missing values: 1650
   Columns with missing data: {'age': 150, 'income': 100, 'score': 200, 'years_experience': 50, 'city': 80, 'education': 120, 'optional_field': 950}

📊 Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               850 non-null    float64
 1   income            900 non-null    float64
 2   score             800 non-null    float64
 3   years_experience  950 non-null    float64
 4   rating            1000 non-null   float64
 5   city              920 non-null    object 
 6   department        1000 non-null   object 
 7   education         880 non-null    object 
 8   optional_field    50 non-null     float64
dtypes: float64(6), object(3)
memory usage: 70.4+ KB
None

Unnamed: 0,age,income,score,years_experience,rating,city,department,education,optional_field
0,40.96057,111244.342997,,8.0,2.071007,San Francisco,Finance,High School,
1,33.340828,76092.649284,,12.0,3.287596,Chicago,HR,,
2,,38089.894737,0.276197,10.0,4.858742,Chicago,HR,Bachelor,
3,53.276358,21643.286173,0.094254,8.0,4.502227,San Francisco,Marketing,Master,
4,32.19016,63486.25158,0.411845,11.0,4.503736,Chicago,Engineering,Master,
