# PROXIMA: Data Exploration

This notebook explores the synthetic experiment data and validates the data generation process.

In [None]:
import sys
from pathlib import Path

sys.path.append('../src')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from proxima.generator.simulate import generate_synthetic_experiments

# Set style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

%matplotlib inline

## Generate Data

In [None]:
# Build the dataset used by every later cell. The seed is passed explicitly
# (presumably so that re-runs reproduce the same data -- confirm in the generator).
df = generate_synthetic_experiments(
    n_users=250_000,
    n_experiments=50,
    seed=7,
)

# Quick structural overview: shape, column names, then a header for the preview.
print(
    f"Dataset shape: {df.shape}",
    f"\nColumns: {list(df.columns)}",
    "\nFirst few rows:",
    sep="\n",
)

# Trailing expression so the notebook renders the preview as the cell output.
df.head()

## Data Summary

In [None]:
def _count_and_share(flags):
    """Render `flags` as '<sum, thousands-separated> (<mean as a one-decimal percentage>)'."""
    return f"{flags.sum():,} ({flags.mean():.1%})"


# Banner
rule = "=" * 80
print(rule, "DATA SUMMARY", rule, sep="\n")

# Overall size: row count of the frame and number of distinct experiment ids.
print(f"Total users: {len(df):,}")
print(f"Number of experiments: {df['exp_id'].nunique()}")

# Value counts for each segmentation column, printed as plain dicts.
print("\nSegment distributions:")
for label, column in (('Regions', 'region'), ('Devices', 'device'), ('Tenure', 'tenure')):
    print(f"  {label}: {df[column].value_counts().to_dict()}")

# Rows in each arm; the code treats treatment == 0 as control and == 1 as treatment.
print("\nTreatment assignment:")
print(f"  Control: {_count_and_share(df['treatment'] == 0)}")
print(f"  Treatment: {_count_and_share(df['treatment'] == 1)}")

# Sum and mean of the failure-cohort indicator column.
print("\nFailure cohort:")
print(f"  Size: {_count_and_share(df['failure_cohort'])}")

# Sum and mean of the long-term retention indicator column.
print("\nLong-term retention:")
print(f"  Retained: {_count_and_share(df['long_retained'])}")

## Visualize Distributions

In [None]:
# One histogram per early-signal column on a 2x2 grid.
# Each entry: (column, panel title, x-axis label, bar colour). `None` keeps
# matplotlib's default colour, which is what the first panel uses.
panels = [
    ('early_watch_min', 'Early Watch Minutes Distribution', 'Minutes', None),
    ('early_starts', 'Early Starts Distribution', 'Starts', 'orange'),
    ('early_ctr', 'Early CTR Distribution', 'CTR', 'green'),
    ('rebuffer_rate', 'Rebuffer Rate Distribution', 'Rate', 'red'),
]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# `axes.flat` walks the grid row by row, matching the order of `panels`.
for panel_ax, (column, title, xlabel, color) in zip(axes.flat, panels):
    panel_ax.hist(df[column], bins=50, alpha=0.7, edgecolor='black', color=color)
    panel_ax.set_title(title)
    panel_ax.set_xlabel(xlabel)
    panel_ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

## Treatment vs Control Comparison

In [None]:
# Mean and row count of `long_retained` for each value of `treatment`.
retention_by_treatment = (
    df.groupby('treatment')['long_retained']
    .agg(['mean', 'count'])
)
print("Long-term retention by treatment:")
print(retention_by_treatment)

# Bar chart of the per-group mean.
fig, ax = plt.subplots(figsize=(8, 6))
retention_by_treatment['mean'].plot.bar(ax=ax, color=['#3498db', '#e74c3c'])

ax.set_title('Long-term Retention: Control vs Treatment', fontsize=14, fontweight='bold')
ax.set(xlabel='Group', ylabel='Retention Rate')
# Labels are positional: they assume exactly two groups, ordered control then treatment.
ax.set_xticklabels(['Control', 'Treatment'], rotation=0)
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## Save Data

In [None]:
# Save for later use.
# `to_csv` does not create missing parent directories, so make sure the output
# folder exists first; otherwise a fresh checkout fails on this cell.
output_path = Path('../outputs/synthetic_data.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)

df.to_csv(output_path, index=False)

# Report the path that was actually written, so the message cannot drift from it.
print(f"Data saved to {output_path}")