# Data Drift Simulation Framework

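This notebook generates controlled data drift scenarios for testing model monitoring systems. Starting from a baseline housing dataset, it applies a feature shift to `MedInc` and a distribution change to `HouseAge`, saves the drifted datasets, and plots them against the baseline. The resulting scenarios can be used to validate monitoring capabilities and establish performance baselines.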

In [None]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# add src directory to path
sys.path.append('../src')

from drift_simulation import create_drift_scenario

## Baseline Data Loading

Load the preprocessed baseline dataset that serves as the reference distribution for drift simulation and comparison analysis.

In [None]:
# read the reference (baseline) dataset that the drift scenarios are derived from
baseline_path = '../data/processed/housing_baseline.csv'
baseline_data = pd.read_csv(baseline_path)

# report (rows, columns), then preview the first rows as the cell output
print(f"baseline data shape: {baseline_data.shape}")
baseline_data.head()

## Feature Drift Scenario Generation

Create controlled drift scenarios using systematic feature transformations to simulate real-world data distribution changes. These scenarios enable comprehensive testing of monitoring system sensitivity and accuracy.

In [None]:
# scenario 1: shift the MedInc (median income) feature
# NOTE(review): no random seed is set in the visible cells; if
# create_drift_scenario is stochastic, its output will vary between runs — confirm.
income_shift_params = {
    'drift_type': 'feature_shift',
    'feature': 'MedInc',
    'shift_magnitude': 0.5,
}
drift_data_1 = create_drift_scenario(baseline_data, **income_shift_params)

# summary statistics of the column before and after the drift
print("original median income:", baseline_data['MedInc'].describe(), sep="\n")
print("\ndrifted median income:", drift_data_1['MedInc'].describe(), sep="\n")

In [None]:
# scenario 2: change the distribution of the HouseAge feature
house_age_params = {
    'drift_type': 'distribution_change',
    'feature': 'HouseAge',
    'shift_magnitude': 0.3,
}
drift_data_2 = create_drift_scenario(baseline_data, **house_age_params)

# summary statistics of the column before and after the drift
print("original house age distribution:", baseline_data['HouseAge'].describe(), sep="\n")
print("\ndrifted house age distribution:", drift_data_2['HouseAge'].describe(), sep="\n")

## Drift Scenario Persistence

Save generated drift scenarios for downstream monitoring analysis and reproducible experimental workflows.

In [None]:
# write each drifted frame to its own CSV under ../data/processed/,
# leaving the DataFrame index out of the file
for scenario_frame, csv_path in (
    (drift_data_1, '../data/processed/housing_drift_scenario_1.csv'),
    (drift_data_2, '../data/processed/housing_drift_scenario_2.csv'),
):
    scenario_frame.to_csv(csv_path, index=False)

print("drift scenarios saved successfully")

## Drift Visualization Analysis

Generate comparative visualizations to illustrate the magnitude and characteristics of induced drift across different features and target distributions.

In [None]:
# Overlay baseline vs. drifted histograms on a 2x2 grid, one panel per
# (scenario, column) pair. Top row: the feature each scenario was generated
# for; bottom row: the MedHouseVal target under each scenario.
N_BINS = 30
HIST_ALPHA = 0.7

# (axes position, drifted frame, column, baseline label, drifted label, title)
panels = [
    ((0, 0), drift_data_1, 'MedInc', 'baseline', 'drift scenario 1',
     'median income distribution'),
    ((0, 1), drift_data_2, 'HouseAge', 'baseline', 'drift scenario 2',
     'house age distribution'),
    ((1, 0), drift_data_1, 'MedHouseVal', 'baseline target', 'scenario 1 target',
     'target distribution - scenario 1'),
    ((1, 1), drift_data_2, 'MedHouseVal', 'baseline target', 'scenario 2 target',
     'target distribution - scenario 2'),
]

fig, axes = plt.subplots(2, 2, figsize=(12, 8))

for (row, col), drifted_frame, column, baseline_label, drifted_label, title in panels:
    ax = axes[row, col]
    reference = baseline_data[column]
    drifted = drifted_frame[column]

    # Use one value range for both histograms so they get identical bin edges;
    # with independently computed bins the bar widths differ and the heights
    # of the two overlays cannot be compared directly.
    value_range = (
        min(reference.min(), drifted.min()),
        max(reference.max(), drifted.max()),
    )

    ax.hist(reference, bins=N_BINS, range=value_range, alpha=HIST_ALPHA,
            label=baseline_label)
    ax.hist(drifted, bins=N_BINS, range=value_range, alpha=HIST_ALPHA,
            label=drifted_label)
    ax.set_title(title)
    ax.set_xlabel(column)
    ax.set_ylabel('count')
    ax.legend()

plt.tight_layout()
plt.show()