In [2]:
!pip install pandas
!pip install numpy




In [4]:
!pip install datetime



In [5]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Simulated Latin American markets: two-letter country code -> cities used
# when a record needs a city within that country.
latam_cities = {
    'MX': ['Mexico City', 'Guadalajara'],
    'BR': ['Sao Paulo', 'Rio de Janeiro'],
    'AR': ['Buenos Aires', 'Cordoba'],
    'CO': ['Bogota', 'Medellin'],
    'CL': ['Santiago', 'Valparaiso'],
}
# Country codes in the same order as the mapping above (dicts keep
# insertion order).
latam_countries = list(latam_cities)

# Simulated catalogue: Artist_1 .. Artist_20, each with its own list of
# Song_1 .. Song_4.
artists = [f'Artist_{n}' for n in range(1, 21)]
songs = {
    name: [f'Song_{k}' for k in range(1, 5)]
    for name in artists
}

# === 1. Synthetic Streaming Data ===
def generate_streaming_data(num_rows=500000, output_path='streaming_data.csv', seed=None):
    """Generate synthetic streaming records and write them to a CSV file.

    Every row combines a random artist/song from the module-level ``artists``
    and ``songs`` tables with a random country from ``latam_countries``, a
    random platform, a stream count, and a date within the 365 days before
    the current time. ``track_id``, ``album_id`` and ``isrc`` are drawn
    independently for each row, so they are not stable per song.

    Args:
        num_rows: Number of rows to generate (default 500000, the original
            hard-coded size).
        output_path: Destination CSV path (default ``'streaming_data.csv'``).
        seed: Optional seed. When given, a private ``random.Random(seed)``
            is used so the random fields are reproducible without touching
            the global generator. When ``None`` the module-level ``random``
            functions are used, exactly as before. Dates remain relative to
            the time of the call.

    Returns:
        None. The data is written to ``output_path`` and a confirmation
        line is printed.
    """
    # Use the shared global generator unless the caller asked for a
    # reproducible, isolated stream.
    rng = random if seed is None else random.Random(seed)

    end_date = datetime.now()
    start_date = end_date - timedelta(days=365)
    platforms = ['Spotify', 'Apple Music', 'YouTube Music']

    data = []
    for _ in range(num_rows):
        # NOTE: the order of random draws below is kept identical to the
        # original so that globally seeded runs produce the same rows.
        artist = rng.choice(artists)
        song = rng.choice(songs[artist])
        country = rng.choice(latam_countries)
        stream_count = rng.randint(100, 50000)
        platform = rng.choice(platforms)
        date = start_date + timedelta(days=rng.randint(0, 364))
        isrc = f'US-XYZ-{rng.randint(10000, 99999)}'

        data.append([
            f'track_{rng.randint(100000, 999999)}',
            artist,
            song,
            country,
            stream_count,
            platform,
            date.strftime('%Y-%m-%d'),
            f'album_{rng.randint(1, 100)}',
            isrc,
        ])

    df = pd.DataFrame(data, columns=[
        'track_id', 'artist_name', 'song_name', 'country', 'stream_count',
        'platform', 'date', 'album_id', 'isrc',
    ])
    df.to_csv(output_path, index=False)
    print(f"Generated {output_path}")

# === 2. Synthetic Airplay Data ===
def generate_airplay_data(num_rows=50000, output_path='airplay_data.csv', seed=None):
    """Generate synthetic radio airplay records and write them to a CSV file.

    Every row combines a random artist/song from the module-level ``artists``
    and ``songs`` tables with a random country from ``latam_countries``, a
    city chosen from ``latam_cities`` for that country, an airplay count,
    and a date within the 365 days before the current time. ``track_id``
    and ``station_id`` are drawn independently for each row.

    Args:
        num_rows: Number of rows to generate (default 50000, the original
            hard-coded size).
        output_path: Destination CSV path (default ``'airplay_data.csv'``).
        seed: Optional seed. When given, a private ``random.Random(seed)``
            is used so the random fields are reproducible without touching
            the global generator. When ``None`` the module-level ``random``
            functions are used, exactly as before. Dates remain relative to
            the time of the call.

    Returns:
        None. The data is written to ``output_path`` and a confirmation
        line is printed.
    """
    # Use the shared global generator unless the caller asked for a
    # reproducible, isolated stream.
    rng = random if seed is None else random.Random(seed)

    end_date = datetime.now()
    start_date = end_date - timedelta(days=365)

    data = []
    for _ in range(num_rows):
        # NOTE: the order of random draws below is kept identical to the
        # original so that globally seeded runs produce the same rows.
        artist = rng.choice(artists)
        song = rng.choice(songs[artist])
        country = rng.choice(latam_countries)
        # The city is always picked from the already-chosen country.
        city = rng.choice(latam_cities[country])
        airplay_count = rng.randint(1, 500)
        date = start_date + timedelta(days=rng.randint(0, 364))

        data.append([
            f'track_{rng.randint(100000, 999999)}',
            artist,
            song,
            f'station_{rng.randint(1, 50)}',
            city,
            country,
            airplay_count,
            date.strftime('%Y-%m-%d'),
        ])

    df = pd.DataFrame(data, columns=[
        'track_id', 'artist_name', 'song_name', 'station_id', 'city',
        'country', 'airplay_count', 'airplay_date',
    ])
    df.to_csv(output_path, index=False)
    print(f"Generated {output_path}")

# === 3. Synthetic Social Media Data ===
def generate_social_data(num_rows=5000, output_path='social_media_data.csv', seed=None):
    """Generate synthetic social-media metrics and write them to a CSV file.

    Every row holds a random artist from the module-level ``artists`` list,
    a random platform, follower/view/comment counts, an engagement rate
    drawn uniformly from [0.01, 0.15], and a date within the 365 days
    before the current time.

    Args:
        num_rows: Number of rows to generate (default 5000, the original
            hard-coded size).
        output_path: Destination CSV path (default
            ``'social_media_data.csv'``).
        seed: Optional seed. When given, a private ``random.Random(seed)``
            is used so the random fields are reproducible without touching
            the global generator. When ``None`` the module-level ``random``
            functions are used, exactly as before. Dates remain relative to
            the time of the call.

    Returns:
        None. The data is written to ``output_path`` and a confirmation
        line is printed.
    """
    # Use the shared global generator unless the caller asked for a
    # reproducible, isolated stream.
    rng = random if seed is None else random.Random(seed)

    end_date = datetime.now()
    start_date = end_date - timedelta(days=365)
    platforms = ['YouTube', 'Instagram', 'TikTok']

    data = []
    for _ in range(num_rows):
        # NOTE: the order of random draws below is kept identical to the
        # original so that globally seeded runs produce the same rows.
        artist = rng.choice(artists)
        platform = rng.choice(platforms)
        follower_count = rng.randint(10000, 5000000)
        engagement_rate = rng.uniform(0.01, 0.15)
        video_views = rng.randint(10000, 10000000)
        comment_count = rng.randint(100, 50000)
        date = start_date + timedelta(days=rng.randint(0, 364))

        data.append([
            artist,
            platform,
            follower_count,
            engagement_rate,
            video_views,
            comment_count,
            date.strftime('%Y-%m-%d'),
        ])

    df = pd.DataFrame(data, columns=[
        'artist_name', 'platform', 'follower_count', 'engagement_rate',
        'video_views', 'comment_count', 'date',
    ])
    df.to_csv(output_path, index=False)
    print(f"Generated {output_path}")

# === 4. Synthetic Publishing/PRO Data ===
def generate_publishing_data(output_path='publishing_data.csv', seed=None):
    """Generate synthetic publishing/PRO records and write them to a CSV file.

    Exactly one row is produced for every (artist, song) pair in the
    module-level ``artists`` and ``songs`` tables. Each row gets a random
    publisher, a random PRO affiliation, a songwriting-credits string naming
    the artist plus one numbered co-writer, and a share drawn uniformly
    from [0.2, 0.8].

    Args:
        output_path: Destination CSV path (default
            ``'publishing_data.csv'``).
        seed: Optional seed. When given, a private ``random.Random(seed)``
            is used so the random fields are reproducible without touching
            the global generator. When ``None`` the module-level ``random``
            functions are used, exactly as before.

    Returns:
        None. The data is written to ``output_path`` and a confirmation
        line is printed.
    """
    # Use the shared global generator unless the caller asked for a
    # reproducible, isolated stream.
    rng = random if seed is None else random.Random(seed)

    publishing_companies = ['Kobalt', 'Universal', 'Sony']
    pro_affiliations = ['BMI', 'ASCAP', 'SACM']

    data = []
    for artist in artists:
        for song in songs[artist]:
            # NOTE: the order of random draws below is kept identical to
            # the original so that globally seeded runs produce the same
            # rows.
            publisher = rng.choice(publishing_companies)
            pro = rng.choice(pro_affiliations)
            sw_credits = f'{artist}, Co-writer_{rng.randint(1, 3)}'
            percent_share = rng.uniform(0.2, 0.8)

            data.append([
                artist,
                song,
                publisher,
                pro,
                sw_credits,
                percent_share,
            ])

    df = pd.DataFrame(data, columns=[
        'artist_name', 'song_name', 'publisher_affiliation',
        'pro_affiliation', 'songwriting_credits', 'percent_share',
    ])
    df.to_csv(output_path, index=False)
    print(f"Generated {output_path}")

if __name__ == '__main__':
    # Build every synthetic dataset, one CSV per generator, in a fixed order.
    for build_dataset in (
        generate_streaming_data,
        generate_airplay_data,
        generate_social_data,
        generate_publishing_data,
    ):
        build_dataset()

Generated streaming_data.csv
Generated airplay_data.csv
Generated social_media_data.csv
Generated publishing_data.csv
