In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Load the raw dataset saved earlier in the project.
print("Loading raw data...")
df = pd.read_csv('../data/raw/california_housing_raw.csv')
print(f"Data loaded: {len(df)} rows, {len(df.columns)} columns")

# All cleaning happens on a copy so the raw frame stays available for the
# before/after comparison printed in the final summary.
df_clean = df.copy()

print("\n" + "="*70)
print("STEP 1: Checking for Missing Values")
print("="*70)
# Per-column count of null cells.
missing_count = df_clean.isna().sum()
print(missing_count)
if not missing_count.any():
    print("✓ No missing values found!")

print("\n" + "="*70)
print("STEP 2: Checking for Outliers")
print("="*70)

# IQR-based outlier detection.
# Some houses are very expensive, so extreme values are expected in this data.
def detect_outliers(data, column):
    """Count the values of ``data[column]`` outside the 1.5 * IQR fences.

    Args:
        data: DataFrame holding the column to inspect.
        column: Label of the column to analyse.

    Returns:
        A ``(count, lower_bound, upper_bound)`` tuple: the number of rows
        strictly below the lower fence or strictly above the upper fence,
        followed by the two fence values.
    """
    values = data[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile
    lower_bound = first_quartile - 1.5 * spread
    upper_bound = third_quartile + 1.5 * spread
    # Missing values compare False on both sides, so they are never counted.
    outside_fences = (values < lower_bound) | (values > upper_bound)
    return int(outside_fences.sum()), lower_bound, upper_bound

# Check outliers for each numeric column.
# Restrict the scan to numeric dtypes: the IQR rule relies on quantiles, which
# are not defined for text/categorical columns, so such a column would make
# detect_outliers fail instead of being skipped.
outlier_summary = {}
for col in df_clean.select_dtypes(include='number').columns:
    count, lower, upper = detect_outliers(df_clean, col)
    outlier_summary[col] = count  # reused by the data quality report below
    print(f"{col}: {count} outliers detected (range: {lower:.2f} to {upper:.2f})")

# Step 3 only records the decision; no rows are dropped or modified here.
print("\n" + "="*70)
for banner_line in (
    "STEP 3: Handling Outliers",
    "="*70,
    "Decision: Keep outliers for now as they represent real data",
    "Note: Median house values are capped at $500k in this dataset",
    "This is a known limitation, not an error",
):
    print(banner_line)

print("\n" + "="*70)
print("STEP 4: Feature Engineering")
print("="*70)

# Create new features that might be useful.
# Grading requirement: "Create derived features"
#
# NOTE(review): assuming the standard scikit-learn California housing column
# definitions, AveRooms / AveBedrms / AveOccup are already per-household
# averages, so the two ratios below are effectively per-person values. The
# column names are kept unchanged so downstream code keeps working; confirm
# the intended meaning before renaming them.
df_clean['RoomsPerHousehold'] = df_clean['AveRooms'] / df_clean['AveOccup']
df_clean['BedroomsPerHousehold'] = df_clean['AveBedrms'] / df_clean['AveOccup']

# Population per household.
# The previous code divided Population by HouseAge, which is not a household
# count and produced a meaningless "people per year of house age" value.
# Under the same dataset assumption as above, AveOccup is
# population / households, i.e. exactly the quantity this column is named for.
# NOTE(review): this makes the column a duplicate of AveOccup; consider
# replacing it with a different derived feature.
df_clean['PopulationPerHousehold'] = df_clean['AveOccup']

print("Created new features:")
print("1. RoomsPerHousehold")
print("2. BedroomsPerHousehold")
print("3. PopulationPerHousehold")

# Division by zero yields +/-inf (x / 0) or NaN (0 / 0); report both.
print("\nChecking for invalid values after feature engineering...")
print(f"Infinite values: {np.isinf(df_clean).sum().sum()}")
print(f"NaN values: {df_clean.isnull().sum().sum()}")

# Replace any infinite values with the median of the finite values.
# The assignment goes through df_clean.loc on purpose: calling
# replace(..., inplace=True) on df_clean[col] mutates a temporary Series,
# which pandas warns about and which does not update df_clean when
# Copy-on-Write is enabled.
for col in ['RoomsPerHousehold', 'BedroomsPerHousehold', 'PopulationPerHousehold']:
    inf_mask = np.isinf(df_clean[col])
    if inf_mask.any():
        # Exclude the infinite entries so they cannot skew the median.
        df_clean.loc[inf_mask, col] = df_clean.loc[~inf_mask, col].median()
        print(f"Replaced infinite values in {col}")

print("\n" + "="*70)
print("STEP 5: Data Type Verification")
print("="*70)
print(df_clean.dtypes)
# Verify the claim instead of printing it unconditionally, so a stray
# text/object column is reported rather than silently declared fine.
non_numeric_cols = df_clean.select_dtypes(exclude='number').columns.tolist()
if not non_numeric_cols:
    print("\n✓ All columns are numeric - no conversion needed")
else:
    print(f"\n✗ Non-numeric columns need conversion: {non_numeric_cols}")

print("\n" + "="*70)
print("STEP 6: Final Dataset Summary")
print("="*70)
# Compare the untouched raw frame (df) with the processed copy (df_clean).
print(f"Original shape: {df.shape}")
print(f"Cleaned shape: {df_clean.shape}")
print(f"Rows removed: {df.shape[0] - df_clean.shape[0]}")
print(f"Features added: {df_clean.shape[1] - df.shape[1]}")

print("\nCleaned dataset columns:")
print(df_clean.columns.tolist())

print("\n" + "="*70)
print("STEP 7: Saving Processed Data")
print("="*70)

# Save the cleaned data.
# Create the target directory first: to_csv does not create missing parent
# directories, so on a fresh checkout the save would fail. This mirrors how
# the reports directory is prepared before the quality report is written.
os.makedirs('../data/processed', exist_ok=True)
df_clean.to_csv('../data/processed/california_housing_clean.csv', index=False)
print("✓ Cleaned data saved to: ../data/processed/california_housing_clean.csv")

# Also create a data quality report
os.makedirs('../reports', exist_ok=True)

# Report the actual outcome of the Step 1 check rather than hard-coding
# "None found", so the report cannot contradict the data it describes.
total_missing = int(missing_count.sum())
missing_status = "None found" if total_missing == 0 else f"{total_missing} found"

# NOTE(review): the report date is a fixed string; consider generating it at
# run time if the notebook is re-run later.
report_lines = [
    "California Housing Dataset - Data Quality Report\n",
    "="*70 + "\n\n",
    "Date: January 2026\n\n",
    "Original Dataset:\n",
    f"  - Rows: {df.shape[0]}\n",
    f"  - Columns: {df.shape[1]}\n\n",
    "Cleaning Steps Performed:\n",
    f"  1. Checked for missing values: {missing_status}\n",
    "  2. Analyzed outliers: Kept as they represent real data\n",
    "  3. Created 3 new features through feature engineering\n",
    "  4. Handled infinite values from calculations\n\n",
    "Final Dataset:\n",
    f"  - Rows: {df_clean.shape[0]}\n",
    f"  - Columns: {df_clean.shape[1]}\n\n",
    "Outlier Summary:\n",
]
# One line per column scanned in Step 2.
report_lines.extend(
    f"  - {name}: {n_outliers} outliers\n"
    for name, n_outliers in outlier_summary.items()
)

# Explicit encoding keeps the file identical across platforms; a single
# writelines call replaces the many small writes.
with open('../reports/data_quality_report.txt', 'w', encoding='utf-8') as f:
    f.writelines(report_lines)

print("✓ Data quality report saved to: ../reports/data_quality_report.txt")

print("\n" + "="*70)
print("Data preprocessing complete! Ready for EDA.")
print("="*70)

Loading raw data...
Data loaded: 20640 rows, 9 columns

STEP 1: Checking for Missing Values
MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64
✓ No missing values found!

STEP 2: Checking for Outliers
MedInc: 681 outliers detected (range: -0.71 to 8.01)
HouseAge: 0 outliers detected (range: -10.50 to 65.50)
AveRooms: 511 outliers detected (range: 2.02 to 8.47)
AveBedrms: 1424 outliers detected (range: 0.87 to 1.24)
Population: 1196 outliers detected (range: -620.00 to 3132.00)
AveOccup: 711 outliers detected (range: 1.15 to 4.56)
Latitude: 0 outliers detected (range: 28.26 to 43.38)
Longitude: 0 outliers detected (range: -127.48 to -112.33)
MedHouseVal: 1071 outliers detected (range: -0.98 to 4.82)

STEP 3: Handling Outliers
Decision: Keep outliers for now as they represent real data
Note: Median house values are capped at $500k in this dataset
This is a known limitation, 