# **Data Generation**

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Set a seed for reproducibility.
# Both generators are seeded because the script draws from the stdlib
# `random` module as well as from NumPy's global random state.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# --- Define Parameters ---
# Row counts for each generated entity / table.
num_artists = 50
num_users = 10000
num_music_videos = 200
num_tour_dates = 500
num_merch_sales = 15000
# Size of the merchandise catalogue (previously a bare literal in the ID list).
num_merchandise = 200
# Bounds used when drawing random dates; random.randint is inclusive, so both
# endpoints can occur.
start_date = datetime(2024, 1, 1)
end_date = datetime(2025, 8, 7)

# --- Generate Mock Artist and User IDs ---
# Zero-padded string identifiers: ARTIST_000.., USER_00000.., MERCH_0000..
artists = [f'ARTIST_{i:03d}' for i in range(num_artists)]
users = [f'USER_{i:05d}' for i in range(num_users)]
merchandise_ids = [f'MERCH_{i:04d}' for i in range(num_merchandise)]

# --- Generate Mock Music Video Data ---
# One row per music video: owning artist, release date, two merch-placement
# flags and three engagement counters. The table is written to
# music_videos.csv and kept in `music_videos_df`.
print("Generating music_videos.csv...")
# Loop-invariant: number of days between start_date and end_date.
video_span_days = (end_date - start_date).days
video_data = {
    'video_id': [f'VIDEO_{i:04d}' for i in range(num_music_videos)],
    'artist_id': np.random.choice(artists, num_music_videos),
    'release_date': [start_date + timedelta(days=random.randint(0, video_span_days)) for _ in range(num_music_videos)],
    # Flags are True for roughly 20% / 30% of videos respectively.
    'merch_worn_by_artist': np.random.choice([True, False], num_music_videos, p=[0.2, 0.8]),
    'merch_worn_by_key_player': np.random.choice([True, False], num_music_videos, p=[0.3, 0.7]),
    'video_views': np.random.randint(10000, 50000000, num_music_videos),
    'video_likes': np.random.randint(500, 1000000, num_music_videos),
    'video_comments': np.random.randint(50, 50000, num_music_videos)
}
# Views and likes are drawn independently, so a low-view video could end up
# with more likes than views. Cap likes at the view count to keep every row
# internally consistent; the random draws themselves are left unchanged.
video_data['video_likes'] = np.minimum(video_data['video_likes'], video_data['video_views'])
music_videos_df = pd.DataFrame(video_data)
music_videos_df.to_csv('music_videos.csv', index=False)
print("music_videos.csv generated.")

# --- Generate Mock Tour Dates Data ---
# Builds one row per tour stop (id, artist, city, date, headlining flag),
# keeps the table in `tour_dates_df` and saves it as tour_dates.csv.
print("Generating tour_dates.csv...")
cities = [
    'New York', 'Los Angeles', 'London', 'Tokyo', 'Paris',
    'Berlin', 'Sydney', 'Toronto', 'Chicago', 'Mexico City',
]

# Uneven per-artist weights from a flat Dirichlet, so some artists receive
# more tour stops than others ("more popular artists tour more").
tour_artist_weights = np.random.dirichlet(np.ones(num_artists))
tour_artist_column = np.random.choice(artists, num_tour_dates, p=tour_artist_weights)
tour_city_column = np.random.choice(cities, num_tour_dates)

# Each stop gets a random day between start_date and end_date (inclusive).
tour_span_days = (end_date - start_date).days
tour_date_column = []
for _ in range(num_tour_dates):
    day_offset = random.randint(0, tour_span_days)
    tour_date_column.append(start_date + timedelta(days=day_offset))

# Roughly 60% of stops are headlining shows.
headlining_column = np.random.choice([True, False], num_tour_dates, p=[0.6, 0.4])

tour_data = {
    'tour_id': [f'TOUR_{stop:04d}' for stop in range(num_tour_dates)],
    'artist_id': tour_artist_column,
    'tour_city': tour_city_column,
    'tour_date': tour_date_column,
    'is_headlining': headlining_column,
}
tour_dates_df = pd.DataFrame(tour_data)
tour_dates_df.to_csv('tour_dates.csv', index=False)
print("tour_dates.csv generated.")

# --- Generate Mock Merchandise Sales Data ---
# One row per sale. About 60% of sales are tied to an event (half to a music
# video release, half to a tour date) so that sales cluster around events;
# the rest are spread uniformly over the date window. The table is kept in
# `merchandise_sales_df` and written to merchandise_sales.csv.
print("Generating merchandise_sales.csv...")
# Simulate higher sales after a video release or during a tour
sales_span_days = (end_date - start_date).days
# Extract (artist, date) pairs once; picking from a list is far cheaper than
# sampling a DataFrame row on every loop iteration.
video_events = list(zip(music_videos_df['artist_id'], music_videos_df['release_date']))
tour_events = list(zip(tour_dates_df['artist_id'], tour_dates_df['tour_date']))

sale_dates = []
sale_artists = []
for _ in range(num_merch_sales):
    if random.random() < 0.6: # 60% chance sale is linked to an event
        if random.random() < 0.5: # 50% chance it's video-related
            # Sales land 0-30 days after the release.
            event_artist, event_date = random.choice(video_events)
            min_offset, max_offset = 0, 30
        else: # 50% chance it's tour-related
            # Sales land within two weeks either side of the show.
            event_artist, event_date = random.choice(tour_events)
            min_offset, max_offset = -14, 14
        # Trim the offset range so the sale never falls before start_date or
        # after end_date (events near the window edges would otherwise
        # produce out-of-window sale dates).
        min_offset = max(min_offset, (start_date - event_date).days)
        max_offset = min(max_offset, (end_date - event_date).days)
        sale_date = event_date + timedelta(days=random.randint(min_offset, max_offset))
        # Attribute the sale to the artist whose event triggered it, so the
        # uplift is visible when sales are joined to events by artist_id.
        sale_artist = event_artist
    else: # 40% chance it's a random sale
        sale_artist = random.choice(artists)
        sale_date = start_date + timedelta(days=random.randint(0, sales_span_days))
    sale_dates.append(sale_date)
    sale_artists.append(sale_artist)

sales_data = {
    'sale_id': [f'SALE_{i:05d}' for i in range(num_merch_sales)],
    'user_id': np.random.choice(users, num_merch_sales),
    'artist_id': sale_artists,
    'merchandise_id': np.random.choice(merchandise_ids, num_merch_sales),
    'sale_date': sale_dates,
    # Prices are uniform in [15, 75), rounded to cents.
    'sale_price': np.random.uniform(15.0, 75.0, num_merch_sales).round(2)
}
merchandise_sales_df = pd.DataFrame(sales_data)
merchandise_sales_df.to_csv('merchandise_sales.csv', index=False)
print("merchandise_sales.csv generated.")

# --- Generate Mock Fan Engagement Data ---
# One row per engagement event (user, artist, type, date) plus the engaging
# user's location and age bracket. The table is kept in `fan_engagement_df`
# and written to fan_engagement.csv.
print("Generating fan_engagement.csv...")
num_engagements = 100000
engagement_types = ['comment', 'share', 'like', 'listen']
user_locations = ['New York', 'Los Angeles', 'London', 'Tokyo', 'Paris']
user_demographics = ['18-24', '25-34', '35-44', '45+']

# Location and age bracket are attributes of the user, not of a single
# engagement: draw them once per user and look them up per row, so the same
# user_id always carries the same location and demographic.
location_of_user = np.random.choice(user_locations, len(users))
demographic_of_user = np.random.choice(user_demographics, len(users))
# Index into `users` of the user behind each engagement row.
engaging_user_idx = np.random.randint(0, len(users), num_engagements)
# Loop-invariant: number of days between start_date and end_date.
engagement_span_days = (end_date - start_date).days

engagement_data = {
    'user_id': np.asarray(users)[engaging_user_idx],
    'artist_id': np.random.choice(artists, num_engagements),
    # Weights follow the order of engagement_types:
    # comment 10%, share 5%, like 50%, listen 35%.
    'engagement_type': np.random.choice(engagement_types, num_engagements, p=[0.1, 0.05, 0.5, 0.35]),
    'engagement_date': [start_date + timedelta(days=random.randint(0, engagement_span_days)) for _ in range(num_engagements)],
    'user_location': location_of_user[engaging_user_idx],
    'user_demographics': demographic_of_user[engaging_user_idx]
}
fan_engagement_df = pd.DataFrame(engagement_data)
fan_engagement_df.to_csv('fan_engagement.csv', index=False)
print("fan_engagement.csv generated.")

print("\nAll mock data files have been successfully generated.")

Generating music_videos.csv...
music_videos.csv generated.
Generating tour_dates.csv...
tour_dates.csv generated.
Generating merchandise_sales.csv...
merchandise_sales.csv generated.
Generating fan_engagement.csv...
fan_engagement.csv generated.

All mock data files have been successfully generated.
