In [11]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from dotenv import load_dotenv # Loads .env file.
import os
import time
import pandas as pd

In [12]:
# Load the variables defined in auth.env into the process environment, then
# read the two Spotify credentials from it (each is None when not set).
load_dotenv('auth.env')
SPOTIFY_CLIENT_ID = os.environ.get('SPOTIFY_CLIENT_ID')
SPOTIFY_CLIENT_SECRET = os.environ.get('SPOTIFY_CLIENT_SECRET')

In [13]:
# Build the credentials manager from the values read from the environment and
# hand it to the spotipy client that the rest of the notebook uses as `sp`.
client_credentials_manager = SpotifyClientCredentials(
    client_id=SPOTIFY_CLIENT_ID,
    client_secret=SPOTIFY_CLIENT_SECRET,
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [14]:
def get_track_information(playlist_ids, client=None):
    """Collect artist names, track names and track ids for every track of the given playlists.

    Each playlist is read page by page (the original code only read the first
    page, so playlists longer than one page were silently truncated).

    Args:
        playlist_ids: Iterable of Spotify playlist ids.
        client: Optional object exposing ``playlist_tracks`` with the same
            keyword arguments used below. Defaults to the notebook-level
            ``sp`` client.

    Returns:
        A tuple ``(artists, first_artists, track_names, track_ids)`` of four
        parallel lists with one entry per collected track:
        ``artists`` holds the list of all artist names of the track,
        ``first_artists`` the first of those names (``None`` when the track
        lists no artist), ``track_names`` the track name and ``track_ids``
        the track id.

    Notes:
        Playlist entries without a track object or without a track id are
        skipped, so the returned ids can be passed on to other API calls
        without containing ``None``.
    """
    if client is None:
        client = sp

    page_size = 100  # same page size the original single request asked for

    artists = []
    first_artists = []
    track_names = []
    track_ids = []

    for playlist_id in playlist_ids:
        offset = 0
        while True:
            resp = client.playlist_tracks(
                playlist_id,
                fields='items.track.artists.name, items.track.name, items.track.id',
                limit=page_size,
                offset=offset,
                market=None,
                # ('track') without the trailing comma is just the string
                # 'track', not a one-element tuple.
                additional_types=('track',),
            )
            items = (resp or {}).get('items') or []

            for item in items:
                track = item.get('track') if item else None
                # Skip entries that carry no usable track (no object / no id).
                if not track or not track.get('id'):
                    continue

                artist_names = [artist['name'] for artist in track.get('artists') or []]

                artists.append(artist_names)
                first_artists.append(artist_names[0] if artist_names else None)
                track_names.append(track['name'])
                track_ids.append(track['id'])

            # A short (or empty) page means there is nothing left to fetch.
            if len(items) < page_size:
                break
            offset += len(items)

    return artists, first_artists, track_names, track_ids

In [15]:
def get_track_audio_features(track_ids, client=None, batch_size=100, max_retries=3):
    """Fetch audio features for the given track ids in batches.

    Args:
        track_ids: Sequence of track ids (must support ``len`` and slicing).
        client: Optional object exposing ``audio_features(tracks=...)``.
            Defaults to the notebook-level ``sp`` client.
        batch_size: Number of ids sent per request (default 100, the value
            the original code hard-coded).
        max_retries: How many times a single batch is retried after an
            HTTP 429 response before the error is re-raised.

    Returns:
        One flat list with the results of all batches, concatenated in the
        order of ``track_ids``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        spotipy.SpotifyException: For any non-429 API error, or for a 429
            that persists after ``max_retries`` retries.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if client is None:
        client = sp

    audio_features_all = []

    for start_idx in range(0, len(track_ids), batch_size):
        track_ids_batch = track_ids[start_idx:start_idx + batch_size]

        retries = 0
        while True:
            try:
                audio_features_batch = client.audio_features(tracks=track_ids_batch)
                break
            except spotipy.SpotifyException as e:
                # Only rate limiting is retried; everything else (and a rate
                # limit that outlasts max_retries) propagates unchanged.
                if e.http_status != 429 or retries >= max_retries:
                    raise
                retries += 1
                # headers can be missing on the exception, so guard the lookup.
                retry_after = int((e.headers or {}).get('Retry-After', 10))
                print(f"Encountered rate limit. Waiting for {retry_after} seconds before retrying...")
                time.sleep(retry_after + 1)

        audio_features_all += audio_features_batch

    return audio_features_all

In [16]:
def make_df(data, audio_features, mood):
    """Combine track information and audio features into one mood-labelled DataFrame.

    Args:
        data: Mapping with the keys ``'track_ids'``, ``'track_names'``,
            ``'artists'`` and ``'first_artists'``, each holding a list with
            one entry per track.
        audio_features: List with one audio-feature dict per track, in the
            same order as ``data``. Entries that are ``None`` (or empty) are
            kept as rows whose feature columns are all NaN, so rows stay
            aligned with ``data``.
        mood: Label written to the ``'mood'`` column of every row.

    Returns:
        DataFrame with the four track-information columns, followed by the
        audio-feature columns (named after the keys of the first usable
        feature dict), followed by ``'mood'``.
    """
    info_df = pd.DataFrame(data, columns=['track_ids', 'track_names', 'artists', 'first_artists'])

    # The original code read the column names from audio_features[0], which
    # fails for an empty list or when the first entry is None.
    template = next((features for features in audio_features if features), None)
    if template is None:
        # No usable feature dict at all: contribute no feature columns.
        features_df = pd.DataFrame(index=info_df.index)
    else:
        rows = [features if features else {} for features in audio_features]
        features_df = pd.DataFrame(rows, columns=list(template.keys()))

    df = pd.concat([info_df, features_df], axis=1)
    df.reset_index(drop=True, inplace=True)

    # One label per track id, as in the original, so a length mismatch
    # between the rows and data['track_ids'] still surfaces as an error.
    df['mood'] = [mood] * len(data['track_ids'])

    return df

In [17]:
# Playlists whose tracks get the "Happy" label further down.
happy_playlist_ids = [
    '37i9dQZF1DX9XIFQuFvzM4',
    '37i9dQZF1DX889U0CL85jj',
    '37i9dQZF1DX8Dc28snyWrn',
    '37i9dQZF1DWYBO1MoTDhZI',
    '37i9dQZF1DX4fpCWaHOned',
    '37i9dQZF1DWSf2RDTDayIx',
    '37i9dQZF1DXa19sXUAHiO1',
    '37i9dQZF1DX7KNKjOK0o75',
    '37i9dQZF1DX2sUQwD7tbmL',
    '37i9dQZF1DWYzpSJHStHHx',
    '37i9dQZF1DX1BzILRveYHb',
    '37i9dQZF1DX6fhMYWIyuww',
]
(
    happy_artists,
    happy_first_artists,
    happy_track_names,
    happy_track_ids,
) = get_track_information(playlist_ids=happy_playlist_ids)

In [18]:
# Audio features for every track collected from the happy playlists.
audio_features_happy = get_track_audio_features(track_ids=happy_track_ids)

In [19]:
# Bundle the per-track lists under the keys make_df reads, build the labelled
# frame and write it to disk.
happy_data = dict(
    track_ids=happy_track_ids,
    track_names=happy_track_names,
    first_artists=happy_first_artists,
    artists=happy_artists,
)

happy_df = make_df(data=happy_data, audio_features=audio_features_happy, mood="Happy")
happy_df.to_csv('happy_data.csv', index=False)

In [20]:
# Playlists whose tracks get the "Sad" label further down.
sad_playlist_ids = [
    '37i9dQZF1DWSqBruwoIXkA',
    '37i9dQZF1DWW2hj3ZtMbuO',
    '37i9dQZF1DX7gIoKXt0gmx',
    '37i9dQZF1DWZrBs4FjpxlE',
    '37i9dQZF1DX59NCqCqJtoH',
    '37i9dQZF1DWVV27DiNWxkR',
    '37i9dQZF1DWVrtsSlLKzro',
    '37i9dQZF1DWZUAeYvs88zc',
    '37i9dQZF1DWU4lunzhQdRx',
    '37i9dQZF1DWV1bxlagjEmb',
    '37i9dQZF1DX9AnYEthXLyU',
    '37i9dQZF1DX15JKV0q7shD',
]
(
    sad_artists,
    sad_first_artists,
    sad_track_names,
    sad_track_ids,
) = get_track_information(playlist_ids=sad_playlist_ids)

In [21]:
# Audio features for every track collected from the sad playlists.
audio_features_sad = get_track_audio_features(track_ids=sad_track_ids)

In [22]:
# Bundle the per-track lists under the keys make_df reads, build the labelled
# frame and write it to disk.
sad_data = dict(
    track_ids=sad_track_ids,
    track_names=sad_track_names,
    first_artists=sad_first_artists,
    artists=sad_artists,
)

sad_df = make_df(data=sad_data, audio_features=audio_features_sad, mood="Sad")
sad_df.to_csv('sad_data.csv', index=False)

In [23]:
# Stack the happy and sad frames on top of each other with a fresh 0..n-1
# index, and save the merged (not yet de-duplicated) data.
combined_df = pd.concat(
    objs=[happy_df, sad_df],
    axis=0,
    ignore_index=True,
)
combined_df.to_csv(path_or_buf='combined_data.csv', index=False)

In [24]:
# A track can appear in more than one playlist: keep only its first row
# (combined_df is modified in place) and renumber the index afterwards.
combined_df.drop_duplicates(subset='id', keep='first', inplace=True)
combined_df.reset_index(drop=True, inplace=True)

# Columns left out of the cleaned export.
columns_to_drop = [
    'analysis_url',
    'duration_ms',
    'id',
    'key',
    'liveness',
    'speechiness',
    'time_signature',
    'track_href',
    'type',
    'uri',
]
cleaned_df = combined_df.drop(columns_to_drop, axis='columns')
cleaned_df.to_csv('cleaned_data.csv', index=False)