# 02 - Data Preprocessing

This notebook cleans the raw data, engineers features, and prepares the result for anomaly detection.

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.data_loader import load_raw_data, save_processed_data
from src.preprocessing import (
    handle_missing_values,
    encode_categorical_features,
    extract_features,
    scale_features,
    preprocess_data
)

# Use seaborn's 'whitegrid' style for the plots drawn in this notebook.
sns.set_style('whitegrid')
# IPython magic (not plain Python): render matplotlib figures inline in the notebook.
%matplotlib inline

## Load Raw Data

In [None]:
# Load the raw dataset through the project loader (called with its defaults)
# and report its dimensions.
df_raw = load_raw_data()
print(f"Raw data shape: {df_raw.shape}")
# Keep this as the cell's last expression so the notebook displays the first rows.
df_raw.head()

## Handle Missing Values

In [None]:
# Count missing cells across the whole raw frame before any cleaning.
missing_before = df_raw.isnull().sum().sum()
print(f"Total missing values: {missing_before}")

# Handle missing values (choose strategy: 'drop', 'mean', 'median', 'mode')
df_clean = handle_missing_values(df_raw, strategy='drop')

missing_after = df_clean.isnull().sum().sum()
# Guard the retention ratio: an empty raw frame would otherwise raise
# ZeroDivisionError when computing the percentage.
retained_pct = len(df_clean) / len(df_raw) * 100 if len(df_raw) else 0.0
print("\nAfter handling:")
print(f"  Total missing values: {missing_after}")
print(f"  Records remaining: {len(df_clean)} ({retained_pct:.1f}%)")

## Feature Engineering

In [None]:
# Extract and engineer features
df_features = extract_features(df_clean)

# List the columns that extract_features added, in DataFrame column order.
# A plain set difference iterates in arbitrary order, so the printed list
# would change from run to run.
existing_cols = set(df_clean.columns)
new_cols = [col for col in df_features.columns if col not in existing_cols]

print("Features added:")
for col in new_cols:
    print(f"  - {col}")

In [None]:
# Visualize engineered features
if 'medication_count' in df_features.columns:
    plt.figure(figsize=(10, 4))

    # (subplot position, column, title, x-axis label) for each histogram panel
    panel_specs = (
        (1, 'medication_count', 'Distribution of Medication Count', 'Number of Medications'),
        (2, 'diagnosis_count', 'Distribution of Diagnosis Count', 'Number of Diagnoses'),
    )
    for slot, column, title, x_label in panel_specs:
        # The right-hand panel is optional: skip it when its column is absent.
        if column not in df_features.columns:
            continue
        plt.subplot(1, 2, slot)
        df_features[column].hist(bins=20, edgecolor='black')
        plt.title(title)
        plt.xlabel(x_label)
        plt.ylabel('Frequency')

    plt.tight_layout()
    plt.show()

## Encode Categorical Features

In [None]:
# Encode categorical variables; the helper hands back the encoded frame
# together with the encoders it produced.
df_encoded, encoders = encode_categorical_features(df_features)

print(f"Encoded {len(encoders)} categorical features:")
# Only the first 5 encoder names are listed to keep the output short.
preview_cols = list(encoders.keys())[:5]
for col in preview_cols:
    print(f"  - {col}")

## Feature Scaling

In [None]:
# Scale numerical features; the helper returns the scaled frame and the scaler object.
# NOTE(review): the message below says StandardScaler — confirm that this is
# what src.preprocessing.scale_features actually uses by default.
df_scaled, scaler = scale_features(df_encoded)

print("Features scaled using StandardScaler")
print(f"Final preprocessed data shape: {df_scaled.shape}")

## Complete Preprocessing Pipeline

In [None]:
# Run the full preprocessing pipeline in one call, starting again from the raw
# frame (the step-by-step results above are not reused here).
df_processed, artifacts = preprocess_data(df_raw, handle_missing='drop', scale=True)

print("\n=== Preprocessing Complete ===")
print(f"Original data: {df_raw.shape}")
print(f"Processed data: {df_processed.shape}")
# NOTE(review): this prints the keys of the returned artifacts mapping; nothing
# in this cell writes them to disk — the pickle step happens in a later cell.
print(f"\nArtifacts saved: {list(artifacts.keys())}")

## Save Processed Data

In [None]:
# Save to data/processed/
save_processed_data(df_processed, filename='processed_data.csv')

# Also save artifacts for later use
import pickle
from pathlib import Path

# The path is relative to the notebook's working directory.
artifacts_path = Path('..') / 'data' / 'processed' / 'preprocessing_artifacts.pkl'
# Make sure the target directory exists so the write below cannot fail with
# FileNotFoundError on a fresh checkout; this is a no-op when it is already there.
artifacts_path.parent.mkdir(parents=True, exist_ok=True)
with open(artifacts_path, 'wb') as f:
    # NOTE: pickle files must only ever be loaded from trusted sources.
    pickle.dump(artifacts, f)

print(f"Saved preprocessing artifacts to: {artifacts_path}")

## Summary Statistics

In [None]:
# Final data summary: descriptive statistics of the processed frame, displayed
# by the notebook because this is the cell's last expression.
df_processed.describe()

## Next Steps

Proceed to `03_anomaly_detection.ipynb` to apply anomaly detection algorithms.