In [2]:
import pandas as pd
import json
import glob
import os


In [6]:
data_path = os.path.join("..", "data", "*.json")
files = glob.glob(data_path)
files


['../data/Streaming_History_Audio_2024-2025_1.json',
 '../data/Streaming_History_Video_2023-2025.json',
 '../data/Streaming_History_Audio_2025_2.json',
 '../data/Streaming_History_Audio_2021-2024_0.json']

In [7]:
dfs = []

for file in files:
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)
        dfs.append(pd.DataFrame(data))

df = pd.concat(dfs, ignore_index=True)
df.head()


Unnamed: 0,ts,platform,ms_played,conn_country,ip_addr,master_metadata_track_name,master_metadata_album_artist_name,master_metadata_album_album_name,spotify_track_uri,episode_name,...,audiobook_uri,audiobook_chapter_uri,audiobook_chapter_title,reason_start,reason_end,shuffle,skipped,offline,offline_timestamp,incognito_mode
0,2024-07-06T13:38:48Z,ios,4899,GB,92.40.218.69,Ogo,Seyi Vibez,Ogo,spotify:track:1zM2WWrMOt5Jt2jR39cBDp,,...,,,,clickrow,endplay,True,True,False,1720271000.0,False
1,2024-07-06T13:40:28Z,ios,99822,GB,92.40.218.69,C****l Bag,Rsko,Memory,spotify:track:1iglyuVDpikOnQHULBu1ss,,...,,,,clickrow,endplay,True,True,True,1720273000.0,False
2,2024-07-06T13:40:29Z,ios,1416,GB,92.40.218.69,La vie d'un... / Ma préférée,Dadju,HÉRITAGE,spotify:track:3IedXlFglIwo754rxOID4x,,...,,,,clickrow,endplay,True,True,True,1720273000.0,False
3,2024-07-06T13:41:35Z,ios,65131,GB,92.40.218.69,Inviolable,Popcaan,Inviolable - Single,spotify:track:6hix0bbbr6iSUfPlTGwlTv,,...,,,,clickrow,endplay,True,True,True,1720273000.0,False
4,2024-07-06T13:44:30Z,ios,177000,GB,92.40.218.69,Ogo,Seyi Vibez,Ogo,spotify:track:1zM2WWrMOt5Jt2jR39cBDp,,...,,,,clickrow,trackdone,True,False,True,1720273000.0,False


In [20]:
df_clean = df.rename(columns={
    'ts': 'timestamp',
    'master_metadata_album_artist_name': 'artist_name',
    'master_metadata_track_name': 'track_name',
    'master_metadata_album_album_name': 'album_name'
})

df_clean['timestamp'] = pd.to_datetime(df_clean['timestamp'])

df_clean['month'] = df_clean['timestamp'].dt.to_period('M')

df_clean = df_clean[df_clean['artist_name'].notnull()]

monthly = (
    df_clean.groupby(['month', 'artist_name'])['ms_played']
    .sum()
    .reset_index()
)

monthly.head()


  df_clean['month'] = df_clean['timestamp'].dt.to_period('M')


Unnamed: 0,month,artist_name,ms_played
0,2021-07,Dave,51754
1,2023-02,Cruel Santino,110313
2,2023-02,DJ Kamol 2,49408
3,2023-02,Kid MARLEY,53290
4,2023-02,Poco Lee,8042


In [21]:
top_artists = (
    monthly.groupby('artist_name')['ms_played']
    .sum()
    .sort_values(ascending=False)
    .head(20)          # Top 20 biggest
    .index
)

top_artists


Index(['Burna Boy', 'Wizkid', 'Tiakola', 'Chris Brown', 'Fally Ipupa', 'Rsko',
       'AMARIA BB', 'Seyi Vibez', 'Rema', 'Aya Nakamura', 'Ya Levis', 'J Hus',
       'Azanti', 'Oxlade', 'Jacquees', 'Haile', 'Vianni', 'Tory Lanez',
       'Eric IV', 'Merveille'],
      dtype='object', name='artist_name')

In [23]:
pip install prophet --no-build-isolation


Collecting prophet
  Downloading prophet-1.2.1-py3-none-macosx_10_11_x86_64.whl.metadata (3.5 kB)
Collecting cmdstanpy>=1.0.4 (from prophet)
  Downloading cmdstanpy-1.3.0-py3-none-any.whl.metadata (4.2 kB)
Collecting holidays<1,>=0.25 (from prophet)
  Downloading holidays-0.86-py3-none-any.whl.metadata (50 kB)
Collecting tqdm>=4.36.1 (from prophet)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting importlib_resources (from prophet)
  Downloading importlib_resources-6.5.2-py3-none-any.whl.metadata (3.9 kB)
Collecting stanio<2.0.0,>=0.4.0 (from cmdstanpy>=1.0.4->prophet)
  Downloading stanio-0.5.1-py3-none-any.whl.metadata (1.6 kB)
Downloading prophet-1.2.1-py3-none-macosx_10_11_x86_64.whl (12.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.7/12.7 MB[0m [31m10.6 MB/s[0m  [33m0:00:01[0m eta [36m0:00:01[0m
[?25hDownloading holidays-0.86-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m 

In [26]:
from prophet import Prophet
import numpy as np

def forecast_artist(artist_name, monthly_df):
    # Extract data for the artist
    artist_df = monthly_df[monthly_df['artist_name'] == artist_name].copy()
    
    # rename for Prophet
    artist_df = artist_df.rename(columns={'month': 'ds', 'ms_played': 'y'})
    
    # Prophet requires actual datetime, not Period
    artist_df['ds'] = artist_df['ds'].astype(str)
    
    # Train Prophet model
    m = Prophet()
    m.fit(artist_df)
    
    # Predict for next 12 months
    future = m.make_future_dataframe(periods=12, freq='M')
    forecast = m.predict(future)
    
    # isolate the future months
    last = artist_df['ds'].max()
    future_pred = forecast[forecast['ds'] > last]
    
    # return summed prediction
    total_pred_ms = future_pred['yhat'].sum()
    return total_pred_ms


In [28]:
predictions = []

for artist in top_artists:
    try:
        total_future_ms = forecast_artist(artist, monthly)
        predictions.append((artist, total_future_ms))
    except Exception as e:
        print(f"Error for {artist}: {e}")

pred_df = pd.DataFrame(predictions, columns=['artist_name', 'predicted_ms'])
pred_df = pred_df.sort_values(by='predicted_ms', ascending=False)
pred_df


16:48:31 - cmdstanpy - INFO - Chain [1] start processing
16:48:31 - cmdstanpy - INFO - Chain [1] done processing
16:48:31 - cmdstanpy - INFO - Chain [1] start processing
16:48:31 - cmdstanpy - INFO - Chain [1] done processing
16:48:31 - cmdstanpy - INFO - Chain [1] start processing
16:48:31 - cmdstanpy - INFO - Chain [1] done processing
16:48:32 - cmdstanpy - INFO - Chain [1] start processing
16:48:32 - cmdstanpy - INFO - Chain [1] done processing
16:48:32 - cmdstanpy - INFO - Chain [1] start processing
16:48:32 - cmdstanpy - INFO - Chain [1] done processing
16:48:32 - cmdstanpy - INFO - Chain [1] start processing
16:48:32 - cmdstanpy - INFO - Chain [1] done processing
16:48:32 - cmdstanpy - INFO - Chain [1] start processing
16:48:32 - cmdstanpy - INFO - Chain [1] done processing
16:48:33 - cmdstanpy - INFO - Chain [1] start processing
16:48:33 - cmdstanpy - INFO - Chain [1] done processing
16:48:33 - cmdstanpy - INFO - Chain [1] start processing
16:48:33 - cmdstanpy - INFO - Chain [1]

Unnamed: 0,artist_name,predicted_ms
11,J Hus,298821700.0
2,Tiakola,93089820.0
6,AMARIA BB,72744970.0
1,Wizkid,62055530.0
5,Rsko,34302510.0
9,Aya Nakamura,33692590.0
10,Ya Levis,30535260.0
8,Rema,21588180.0
4,Fally Ipupa,19158000.0
15,Haile,17826820.0


In [29]:
predicted_total_ms_2026 = pred_df['predicted_ms'].sum()

predicted_minutes_2026 = predicted_total_ms_2026 / (1000 * 60)
predicted_minutes_2026


11135.936611116187

In [30]:
monthly_tracks = (
    df_clean.groupby(['month', 'track_name'])['ms_played']
    .sum()
    .reset_index()
)

top_tracks = (
    monthly_tracks.groupby('track_name')['ms_played']
    .sum()
    .sort_values(ascending=False)
    .head(30)
    .index
)


In [31]:
def forecast_track(track_name, monthly_tracks_df):
    track_df = monthly_tracks_df[monthly_tracks_df['track_name'] == track_name].copy()
    track_df = track_df.rename(columns={'month': 'ds', 'ms_played': 'y'})
    track_df['ds'] = track_df['ds'].astype(str)
    
    m = Prophet()
    m.fit(track_df)
    
    future = m.make_future_dataframe(periods=12, freq='M')
    forecast = m.predict(future)
    
    last = track_df['ds'].max()
    future_pred = forecast[forecast['ds'] > last]
    
    return future_pred['yhat'].clip(lower=0).sum()

track_predictions = []

for track in top_tracks:
    total_future_ms = forecast_track(track, monthly_tracks)
    track_predictions.append((track, total_future_ms))

track_df = pd.DataFrame(track_predictions, columns=['track_name','predicted_ms'])
track_df = track_df.sort_values(by='predicted_ms', ascending=False)
track_df.head(10)


17:00:31 - cmdstanpy - INFO - Chain [1] start processing
17:00:31 - cmdstanpy - INFO - Chain [1] done processing
17:00:32 - cmdstanpy - INFO - Chain [1] start processing
17:00:32 - cmdstanpy - INFO - Chain [1] done processing
17:00:32 - cmdstanpy - INFO - Chain [1] start processing
17:00:32 - cmdstanpy - INFO - Chain [1] done processing
17:00:32 - cmdstanpy - INFO - Chain [1] start processing
17:00:32 - cmdstanpy - INFO - Chain [1] done processing
17:00:32 - cmdstanpy - INFO - Chain [1] start processing
17:00:36 - cmdstanpy - INFO - Chain [1] done processing
17:00:36 - cmdstanpy - INFO - Chain [1] start processing
17:00:36 - cmdstanpy - INFO - Chain [1] done processing
17:00:36 - cmdstanpy - INFO - Chain [1] start processing
17:00:37 - cmdstanpy - INFO - Chain [1] done processing
17:00:37 - cmdstanpy - INFO - Chain [1] start processing
17:00:37 - cmdstanpy - INFO - Chain [1] done processing
17:00:37 - cmdstanpy - INFO - Chain [1] start processing
17:00:37 - cmdstanpy - INFO - Chain [1]

Unnamed: 0,track_name,predicted_ms
9,Peckham,326870900.0
17,Lady of Neptune,147156900.0
23,Like to Party,44952910.0
0,Tous les jours,29227540.0
3,Recognise,18039050.0
25,Karma,17484880.0
4,Masculine (feat. Burna Boy),16435810.0
20,Woman's Worth,15442720.0
12,G.A.N.G,11942940.0
26,Si j'savais,7705696.0


In [36]:
monthly_albums = (
    df_clean.groupby(['month', 'album_name'])['ms_played']
    .sum()
    .reset_index()
)

total_albums = (
    monthly_albums.groupby('album_name')['ms_played']
    .sum()
    .sort_values(ascending=False)
)

top_albums = total_albums.head(10)
top_albums


album_name
BDLM VOL.1        69796636
I Told Them...    51164460
Morayo            45597714
Memory            44154615
Outside           37668862
DNK               35630902
Formule 7         32813052
Droit Chemin      32346887
9PM IN PARIS      27373865
Made In Lagos     26345678
Name: ms_played, dtype: int64

In [38]:
df_clean.to_csv("../data/listening_history.csv", index=False)
