In [1]:
# Import necessary libraries for data generation
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# --- Rationale for data generation ---
# The goal is to create a realistic simulation of Spotify's data ecosystem for a new AI feature.
# We are generating three key datasets to represent:
# 1. 'user_behavior.csv': Standard user interaction logs (streams, skips).
# 2. 'gen_shuffle_events.csv': Logs specific to the new generative AI feature, including commentary text and user feedback.
# 3. 'qualitative_ratings.csv': A small dataset that simulates both "human-in-the-loop" and "LLM-as-a-judge" ratings of the AI's output.
# This approach allows us to demonstrate a comprehensive evaluation strategy.

# Simulation time window and scale; these constants size every ID list built below.
# NOTE(review): no random seed is set in the visible code, so each run yields different data.
START_DATE = datetime(2025, 7, 1)
NUM_DAYS = 30
NUM_USERS = 1000
NUM_TRACKS = 5000
NUM_PLAYLISTS = 200

# Zero-padded string identifiers for each simulated entity,
# e.g. 'user_0000', 'track_00000', 'playlist_000'.
users = list(map('user_{:04d}'.format, range(NUM_USERS)))
tracks = list(map('track_{:05d}'.format, range(NUM_TRACKS)))
playlists = list(map('playlist_{:03d}'.format, range(NUM_PLAYLISTS)))

# --- Generating the `user_behavior` dataset ---
# This dataset captures core user interactions with music. It's the baseline for our analysis.
# Each row is one 'stream_play' event: a random user/playlist/track, a minute-resolution
# timestamp inside the simulated window, a stream duration and a skip flag.

# Single source of truth for the row count: it sizes the random draws and bounds the loop,
# and the success message reports len() of the resulting DataFrame, so they cannot drift apart.
NUM_BEHAVIOR_EVENTS = 250_000

# Draw the NumPy-backed columns once up front instead of calling np.random on every row.
# randint's upper bound is exclusive, so durations fall in 30..239 seconds.
stream_durations = np.random.randint(30, 240, size=NUM_BEHAVIOR_EVENTS).tolist()
# 30% of events are flagged as skipped.
# NOTE(review): the flag is drawn independently of the duration, so a "skipped" row can
# still carry a long stream_duration_sec.
skip_flags = np.random.choice([0, 1], size=NUM_BEHAVIOR_EVENTS, p=[0.7, 0.3]).tolist()

user_behavior_data = []
for stream_duration, skip in zip(stream_durations, skip_flags):
    user_id = random.choice(users)
    playlist_id = random.choice(playlists)
    track_id = random.choice(tracks)

    # Assign a random timestamp within our simulated period (seconds are always zero).
    timestamp = START_DATE + timedelta(days=random.randint(0, NUM_DAYS - 1),
                                       hours=random.randint(0, 23),
                                       minutes=random.randint(0, 59))

    # Every row in this dataset is a play event; the column is kept for future analysis.
    event_type = 'stream_play'

    user_behavior_data.append([user_id, playlist_id, track_id, timestamp, stream_duration, skip, event_type])

# Convert the list of rows into a pandas DataFrame and persist it.
user_behavior_df = pd.DataFrame(
    user_behavior_data,
    columns=['user_id', 'playlist_id', 'track_id', 'event_timestamp',
             'stream_duration_sec', 'skipped_flag', 'event_type'],
)
user_behavior_df.to_csv('user_behavior.csv', index=False)
print(f"✅ Successfully generated 'user_behavior.csv' with {len(user_behavior_df):,} user interaction logs.")

# --- Generating the `gen_shuffle_events` dataset ---
# This dataset specifically logs events related to the new AI-powered feature.
# Each row is one event for a random user at a minute-resolution timestamp inside the
# simulated window; only 'commentary_start' events carry commentary text.

# Single source of truth for the row count: it bounds the loop, and the success message
# reports len() of the resulting DataFrame, so the two cannot drift apart.
NUM_GEN_SHUFFLE_EVENTS = 50_000

# The event types, drawn uniformly at random for each row.
GEN_SHUFFLE_EVENT_TYPES = ['commentary_start', 'user_feedback_positive', 'user_feedback_negative']

# Pre-defined commentary templates to simulate the AI's output.
commentary_templates = [
    "Next up, we're shifting gears from that indie rock vibe to a classic throwback. This one's a fan favorite.",
    "Following that high-energy track, I've got something to help you wind down. Perfect for a chill night in.",
    "Did you know this artist's first single was a viral sensation? Here's the track that put them on the map.",
    "Based on your recent listening, I'm predicting you'll love this song. Let's see if I'm right!",
    "This track features some unique production that you might have missed. Listen for the subtle synth line in the chorus."
]

gen_shuffle_events_data = []
for _ in range(NUM_GEN_SHUFFLE_EVENTS):
    user_id = random.choice(users)
    timestamp = START_DATE + timedelta(days=random.randint(0, NUM_DAYS - 1),
                                       hours=random.randint(0, 23),
                                       minutes=random.randint(0, 59))

    event_type = random.choice(GEN_SHUFFLE_EVENT_TYPES)

    # Feedback events get an empty string rather than a commentary.
    # NOTE(review): feedback rows are not linked to any specific commentary row, and the
    # empty strings are parsed as NaN by pandas.read_csv under its default settings.
    commentary_text = random.choice(commentary_templates) if event_type == 'commentary_start' else ""

    gen_shuffle_events_data.append([user_id, timestamp, event_type, commentary_text])

gen_shuffle_events_df = pd.DataFrame(
    gen_shuffle_events_data,
    columns=['user_id', 'event_timestamp', 'event_type', 'ai_commentary_text'],
)
gen_shuffle_events_df.to_csv('gen_shuffle_events.csv', index=False)
print(f"✅ Successfully generated 'gen_shuffle_events.csv' with {len(gen_shuffle_events_df):,} AI feature events.")

# --- Generating the `qualitative_ratings` dataset ---
# A handful of commentary strings, each paired with randomly drawn ratings, used for the
# qualitative evaluation of the AI model.

# Three templates picked without replacement, followed by two hand-written lines that
# might be flagged for quality issues.
sample_commentaries = random.sample(commentary_templates, 3)
sample_commentaries.extend([
    "This is a test of the emergency broadcast system.",
    "The artist's name is actually spelled with a 'Y', not an 'I'.",
])

# Coherence is scored on a 1-5 scale, drawn uniformly for both raters.
coherence_scale = [1, 2, 3, 4, 5]

# One row per commentary. Draw order within a row: human rating, LLM rating, safety flag.
qualitative_ratings_data = [
    [
        text,
        np.random.choice(coherence_scale),           # simulated human rater
        np.random.choice(coherence_scale),           # simulated LLM-as-a-judge
        np.random.choice([0, 1], p=[0.95, 0.05]),    # safety issue flagged 5% of the time
    ]
    for text in sample_commentaries
]

qualitative_ratings_df = pd.DataFrame(
    qualitative_ratings_data,
    columns=['commentary_text', 'human_coherence_rating', 'llm_coherence_rating', 'safety_flag'],
)
qualitative_ratings_df.to_csv('qualitative_ratings.csv', index=False)
print("✅ Successfully generated 'qualitative_ratings.csv' for AI model quality assessment.")

✅ Successfully generated 'user_behavior.csv' with 250,000 user interaction logs.
✅ Successfully generated 'gen_shuffle_events.csv' with 50,000 AI feature events.
✅ Successfully generated 'qualitative_ratings.csv' for AI model quality assessment.
