In [1]:
# data_generation.ipynb
# This notebook simulates the data needed for the causal inference project.
# The data mimics a real-world scenario where a policy change affects a group of users.

# Import necessary libraries
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# --- Configuration ---
# Seed NumPy's global RNG so every run of this notebook produces the same data.
np.random.seed(42)

# Simulation window and the day the policy takes effect.
start_date = datetime(2025, 5, 1)
end_date = datetime(2025, 9, 30)
policy_change_date = datetime(2025, 7, 1)

# Group sizes. The total is derived from the two group sizes so the profile
# columns built below always have matching lengths when a size is changed
# (independent literals that drift apart make pd.DataFrame raise a ValueError).
n_treatment = 50
n_control = 50
n_djs = n_treatment + n_control

# --- Generate DJ Profile Data ---
# Static, one-row-per-DJ attributes. The first `n_treatment` DJs are the
# treatment group and the remaining `n_control` DJs are the control group.
# NOTE: the two random columns are drawn in this order (commercial flag first,
# follower count second); reordering them changes the seeded output.
dj_profile_data = {
    'dj_id': [f'DJ_{i}' for i in range(n_djs)],
    'is_treatment_group': [True] * n_treatment + [False] * n_control,
    # Each DJ is independently a commercial user with probability 0.2.
    'is_commercial_user': np.random.choice([True, False], size=n_djs, p=[0.2, 0.8]),
    # Uniform integer in [5000, 500000) -- the upper bound is exclusive.
    'initial_follower_count': np.random.randint(5000, 500000, size=n_djs)
}
dj_profiles = pd.DataFrame(dj_profile_data)

# --- Simulate Livestream Metrics Data ---
# One dict is appended per simulated stream; the DataFrame is built once at the
# end instead of being grown row by row.
livestream_records = []

# Baseline per-stream metrics shared by every DJ; the multipliers below scale them.
base_avg_duration = 30
base_total_viewers = 10000

# Materialise the DJ rows once. The profiles are not modified during the
# simulation, so rebuilding a Series per row with iterrows() on every simulated
# day is unnecessary work.
dj_rows = dj_profiles.to_dict('records')

current_date = start_date
while current_date <= end_date:  # end_date is included
    # Values that are identical for every DJ and stream on this date.
    is_post_policy = current_date >= policy_change_date
    date_label = current_date.strftime('%Y-%m-%d')

    for dj in dj_rows:
        # Number of streams for this DJ today: Poisson with mean 1.5 (can be 0).
        num_streams_today = np.random.poisson(lam=1.5)

        for _ in range(num_streams_today):
            # NOTE: the order of the np.random calls below is part of the seeded
            # output -- reordering them changes the generated data.
            if is_post_policy:
                if dj['is_treatment_group']:
                    # Treatment group (DJs using copyrighted music) sees a decline.
                    # A multiplier is the fraction of the baseline that is kept.
                    avg_duration_multiplier = np.random.uniform(0.4, 0.6)  # 40-60% drop
                    viewer_multiplier = np.random.uniform(0.6, 0.8)  # 20-40% drop

                    # Assume they switch to 'Original' music as per the policy
                    music_source = 'Original'
                else:
                    # Control group (DJs using licensed/original music) sees a slight, random fluctuation
                    avg_duration_multiplier = np.random.uniform(0.9, 1.1)
                    viewer_multiplier = np.random.uniform(0.9, 1.1)

                    # Same rule as pre-policy: the source is re-drawn for every
                    # stream with equal odds; it is not fixed per DJ.
                    music_source = 'Licensed' if np.random.rand() > 0.5 else 'Original'
            else:
                # Before the policy, both groups had similar, stable metrics
                avg_duration_multiplier = np.random.uniform(0.95, 1.05)
                viewer_multiplier = np.random.uniform(0.95, 1.05)

                # Treatment group always used copyrighted music pre-policy;
                # control streams draw Licensed/Original with equal odds.
                if dj['is_treatment_group']:
                    music_source = 'Copyrighted'
                else:
                    music_source = 'Licensed' if np.random.rand() > 0.5 else 'Original'

            # Independent +/-10% noise on top of the group-level multiplier.
            avg_duration = base_avg_duration * avg_duration_multiplier * np.random.uniform(0.9, 1.1)
            # int() truncates toward zero, so viewer counts are whole numbers.
            total_viewers = int(base_total_viewers * viewer_multiplier * np.random.uniform(0.9, 1.1))

            livestream_records.append({
                # The running record count makes the id unique across the whole run.
                'livestream_id': f'LS_{len(livestream_records)}_{dj["dj_id"]}',
                'dj_id': dj['dj_id'],
                'date': date_label,
                'total_viewers': total_viewers,
                'avg_view_duration_minutes': avg_duration,
                'music_source': music_source,
                'is_treatment_group': dj['is_treatment_group'],
                'is_commercial_user': dj['is_commercial_user'],
                # Follower counts are static in this simulation: every stream
                # carries the DJ's initial count.
                'follower_count_at_stream': dj['initial_follower_count']
            })

    current_date += timedelta(days=1)

# Create the main DataFrame
livestream_metrics_df = pd.DataFrame(livestream_records)

# --- Save to CSV ---
# Write each table to its own file in the working directory, omitting the
# pandas index column.
for frame, csv_path in (
    (livestream_metrics_df, 'dj_livestream_metrics.csv'),
    (dj_profiles, 'dj_profile_metrics.csv'),
):
    frame.to_csv(csv_path, index=False)

# Status message: one output line per argument.
print(
    "Mock data generation complete. Two files have been created:",
    "- dj_livestream_metrics.csv",
    "- dj_profile_metrics.csv",
    sep="\n",
)

Mock data generation complete. Two files have been created:
- dj_livestream_metrics.csv
- dj_profile_metrics.csv
