# Spotify Streaming Data Analysis



In [None]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

# Seed NumPy's global RNG so the np.random draws in this notebook are repeatable run-to-run.
np.random.seed(42)

In [None]:
# Simulation settings
# The synthetic history covers `days` consecutive days beginning at `start_date`
# and contains `n_streams` individual plays.
start_date = datetime(year=2025, month=2, day=6)
days = 365
n_streams = 11258

# (artist_id, artist_name) pairs; ids are "A1".."A10" in roster order.
artists = [
    (f"A{position}", stage_name)
    for position, stage_name in enumerate(
        [
            "Taylor Swift",
            "Billie Eilish",
            "Bad Bunny",
            "Drake",
            "The Weeknd",
            "Kendrick Lamar",
            "SZA",
            "Post Malone",
            "Tyler, The Creator",
            "Olivia Rodrigo",
        ],
        start=1,
    )
]

# Full genre vocabulary (superset of the genres actually mapped to artists below).
genres = [
    "Pop", "Hip-Hop", "R&B", "Alternative", "Rock", "Indie", "Latin",
    "Rap", "Country", "Dance", "Jazz Rap", "Reggaeton", "Neo-Soul", "Psychedelic",
]

# Relative popularity per artist (same order as `artists`); the raw shares do
# not sum to 1, so they are rescaled into a probability vector.
artist_weights = np.array([0.12, 0.10, 0.10, 0.09, 0.09, 0.08, 0.08, 0.07, 0.07, 0.06])
artist_weights = np.divide(artist_weights, artist_weights.sum())

# Primary genre of each artist, keyed by artist name.
genre_map = {
    "Taylor Swift": "Pop",
    "Billie Eilish": "Pop",
    "Bad Bunny": "Latin",
    "Drake": "Hip-Hop",
    "The Weeknd": "R&B",
    "Kendrick Lamar": "Rap",
    "SZA": "R&B",
    "Post Malone": "Pop",
    "Tyler, The Creator": "Alternative",
    "Olivia Rodrigo": "Pop",
}

# Listening platform and its sampling probability, declared side by side.
platforms, platform_weights = map(
    list, zip(("Mobile", 0.74), ("Desktop", 0.20), ("TV", 0.06))
)

# Listener country and its sampling probability, declared side by side.
countries, country_weights = map(
    list,
    zip(("US", 0.55), ("CA", 0.10), ("GB", 0.10), ("BD", 0.08), ("MX", 0.10), ("ES", 0.07)),
)

In [None]:
# Helper functions
def seasonal_multiplier(dt):
    """Return the seasonal listening multiplier for the month of *dt*.

    *dt* is any object exposing a ``month`` attribute (1-12). December is
    boosted to 1.25, June-August to 1.12, January-February are damped to
    0.92, and every other month is neutral (1.0).
    """
    month_factor = {
        12: 1.25,
        6: 1.12, 7: 1.12, 8: 1.12,
        1: 0.92, 2: 0.92,
    }
    return month_factor.get(dt.month, 1.0)

def hour_weight(h):
    """Return the relative listening weight for hour-of-day *h*.

    Bands (inclusive): 0-5 -> 0.55, 6-8 -> 1.3, 9-16 -> 0.95,
    18-20 -> 2.3, 21-23 -> 1.6. Any value outside these bands
    (e.g. 17) gets the neutral weight 1.0.
    """
    hour_bands = (
        (0, 5, 0.55),
        (6, 8, 1.3),
        (9, 16, 0.95),
        (18, 20, 2.3),
        (21, 23, 1.6),
    )
    for first, last, band_weight in hour_bands:
        if first <= h <= last:
            return band_weight
    return 1.0

In [None]:
# Build the synthetic stream fact table.
#
# NOTE: this cell draws from the global NumPy RNG in a different order than
# earlier revisions of the notebook, so CSVs regenerated with the same seed
# will not be row-for-row identical to previously exported files.

# --- When: one candidate slot per hour of the simulated period, weighted by
# season and hour of day, then sampled with replacement.
timestamps, weights = [], []
for d in range(days):
    day = start_date + timedelta(days=d)
    for h in range(24):
        ts = day + timedelta(hours=h)
        timestamps.append(ts)
        weights.append(seasonal_multiplier(ts) * hour_weight(h))

weights = np.array(weights)
weights = weights / weights.sum()
chosen_ts = np.random.choice(len(timestamps), size=n_streams, p=weights)
# Spread plays uniformly inside the chosen hour (0..3599 seconds), drawn in one
# vectorised call instead of two scalar RNG calls per stream.
offset_seconds = np.random.randint(0, 3600, size=n_streams)
played_at = [
    timestamps[slot] + timedelta(seconds=offset)
    for slot, offset in zip(chosen_ts.tolist(), offset_seconds.tolist())
]

# --- Who: artist per stream, sampled by popularity.
artist_ids = np.random.choice([a[0] for a in artists], size=n_streams, p=artist_weights)
artist_lookup = dict(artists)
artist_names = [artist_lookup[a] for a in artist_ids]

# Resolve one genre per artist up front. A random fallback is only drawn for
# artists missing from genre_map, and only once, so an artist never ends up
# with different genres on different streams.
artist_genre = {
    name: genre_map[name] if name in genre_map else str(np.random.choice(genres))
    for _, name in artists
}
genres_assigned = [artist_genre[name] for name in artist_names]

# --- What: track per stream.
# Each artist owns a disjoint slice of the 4-digit id space T1000..T9999, so a
# track_id always maps to exactly one artist (and therefore one genre). This
# keeps track_id usable as a unique key when a track dimension is derived from
# the fact table.
track_id_low, track_id_high = 1000, 10000  # half-open range of numeric ids
ids_per_artist = (track_id_high - track_id_low) // len(artists)
artist_slot = {artist_id: k for k, (artist_id, _) in enumerate(artists)}
stream_slot = np.array([artist_slot[a] for a in artist_ids])
track_num = (
    track_id_low
    + stream_slot * ids_per_artist
    + np.random.randint(0, ids_per_artist, size=n_streams)
)
track_id = [f"T{num}" for num in track_num.tolist()]

# Duration is a property of the track, not of the play: draw one length per
# possible track and look it up for every stream of that track.
catalog_duration_ms = np.random.randint(120000, 260000, size=track_id_high - track_id_low)
track_duration_ms = catalog_duration_ms[track_num - track_id_low]

# --- How long: fraction of the track that was played (beta skews towards
# near-complete listens).
base_completion = np.random.beta(5, 1.8, size=n_streams)
ms_played = (track_duration_ms * base_completion).astype(int)

# --- Skips: base probability by genre, slightly higher late at night (00-05h),
# clipped to a plausible range.
hour = [dt.hour for dt in played_at]
genre_skip_bias = {g: 0.2 for g in genres}
genre_skip_bias.update({"Pop": 0.18, "R&B": 0.17, "Alternative": 0.23, "Country": 0.24, "Psychedelic": 0.26})
time_skip_adj = [0.03 if (0 <= h <= 5) else 0.00 for h in hour]
skip_prob = np.array([genre_skip_bias.get(g, 0.21) for g in genres_assigned]) + np.array(time_skip_adj)
skip_prob = np.clip(skip_prob, 0.05, 0.45)
skipped = (np.random.rand(n_streams) < skip_prob).astype(int)
# A skipped play only keeps 15-55% of what would otherwise have been listened to.
ms_played = np.where(
    skipped == 1,
    (ms_played * np.random.uniform(0.15, 0.55, size=n_streams)).astype(int),
    ms_played,
)

fact_streams = pd.DataFrame({
    "stream_id": [f"S{i+1}" for i in range(n_streams)],
    "played_at": played_at,
    "track_id": track_id,
    "artist_id": artist_ids,
    "artist_name": artist_names,
    "genre": genres_assigned,
    "platform": np.random.choice(platforms, size=n_streams, p=platform_weights),
    "country": np.random.choice(countries, size=n_streams, p=country_weights),
    "track_duration_ms": track_duration_ms,
    "ms_played": ms_played,
    "skipped": skipped,
})

In [None]:
# Artist dimension: one row per (artist_id, artist_name) pair from the config.
dim_artist = pd.DataFrame(artists, columns=["artist_id", "artist_name"]).drop_duplicates()

# Track dimension: distinct (track, artist, genre) combinations seen in the
# fact table, with a display name derived from the id.
dim_track = (
    fact_streams[["track_id", "artist_id", "genre"]]
    .drop_duplicates()
    .assign(track_name=lambda tracks: "Track " + tracks["track_id"])
)

# Calendar-date key on the fact table (plain date objects, time of day dropped).
fact_streams["date"] = pd.to_datetime(fact_streams["played_at"]).dt.date

# Date dimension: every day between the first and last stream, inclusive,
# with the usual calendar attributes.
first_day = fact_streams["date"].min()
last_day = fact_streams["date"].max()
calendar_days = pd.date_range(start=first_day, end=last_day, freq="D")
dim_date = pd.DataFrame({"date": calendar_days}).assign(
    year=lambda cal: cal["date"].dt.year,
    month=lambda cal: cal["date"].dt.month,
    month_name=lambda cal: cal["date"].dt.strftime("%b"),
    quarter=lambda cal: cal["date"].dt.quarter,
    day_name=lambda cal: cal["date"].dt.strftime("%a"),
)

# Write every table next to the notebook, without the pandas index column.
csv_exports = {
    "fact_streams.csv": fact_streams,
    "dim_track.csv": dim_track,
    "dim_artist.csv": dim_artist,
    "dim_date.csv": dim_date,
}
for csv_name, table in csv_exports.items():
    table.to_csv(csv_name, index=False)

print("✅ CSVs saved: fact_streams.csv, dim_track.csv, dim_artist.csv, dim_date.csv")

✅ CSVs saved: fact_streams.csv, dim_track.csv, dim_artist.csv, dim_date.csv
