![spotify_logo](images/spotify_logo3.png)

# Spotify API Scrape

## Helpful Links:
- [Spotify Web API - Authorization Guide](https://developer.spotify.com/documentation/general/guides/authorization-guide/)
- [Spotify API References](https://developer.spotify.com/documentation/web-api/reference/)

In [85]:
import config
import os
import requests
import json
from json import JSONEncoder
import pandas as pd
from datetime import datetime,timezone, timedelta
from tqdm import tqdm
import numpy as np
import spotipy
from spotipy.oauth2 import SpotifyOAuth
import base64
from urllib.parse import urlencode

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy.util as util
import music21

music21: Certain music21 functions might need these optional packages: matplotlib, scipy;
                   if you run into errors, install them by following the instructions at
                   http://mit.edu/music21/doc/installing/installAdditional.html


## Step 1: Get Access Token

In [2]:
client_id = config.client_id
client_secret = config.client_secret

In [3]:
username = config.username
client_id = config.client_id
client_secret = config.client_secret
redirect_uri = 'http://localhost:7777/callback'
scope = 'user-read-recently-played'

auth_token = util.prompt_for_user_token(username=username, 
                                   scope=scope, 
                                   client_id=client_id,   
                                   client_secret=client_secret,     
                                   redirect_uri=redirect_uri)

## Step 2: Pull Recently Played

In [4]:
base_url = 'https://api.spotify.com/v1/me/player/recently-played?'
#track_id = '6y0igZArWVi6Iz0rj35c1Y'

#2. Authentication
#3. Parameters -- would be stored with authentication
headers = {
    "Authorization": f"Bearer {auth_token}"
}

#4. Create an empty list
personal_data = [] #would be good explore how to capture data at different points in time
r = requests.get(base_url+"&limit=50", headers=headers)
personal_data.append(json.loads(r.text))

In [26]:
personal_data[0]['items'][1]

{'track': {'album': {'album_type': 'single',
   'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/4YZ5ECfbM2xSTSQTJGBbO5'},
     'href': 'https://api.spotify.com/v1/artists/4YZ5ECfbM2xSTSQTJGBbO5',
     'id': '4YZ5ECfbM2xSTSQTJGBbO5',
     'name': 'Gerard Way',
     'type': 'artist',
     'uri': 'spotify:artist:4YZ5ECfbM2xSTSQTJGBbO5'}],
   'available_markets': ['AD',
    'AE',
    'AL',
    'AR',
    'AT',
    'AU',
    'BA',
    'BE',
    'BG',
    'BH',
    'BO',
    'BR',
    'BY',
    'CA',
    'CH',
    'CL',
    'CO',
    'CR',
    'CY',
    'CZ',
    'DE',
    'DK',
    'DO',
    'DZ',
    'EC',
    'EE',
    'EG',
    'ES',
    'FI',
    'FR',
    'GB',
    'GR',
    'GT',
    'HK',
    'HN',
    'HR',
    'HU',
    'ID',
    'IE',
    'IL',
    'IN',
    'IS',
    'IT',
    'JO',
    'JP',
    'KW',
    'KZ',
    'LB',
    'LI',
    'LT',
    'LU',
    'LV',
    'MA',
    'MC',
    'MD',
    'ME',
    'MK',
    'MT',
    'MX',
    'MY',
    'NI',
    

In [6]:
track_ids = []
album_names = []
artist_names = []
track_names = []

for i in range(len(personal_data[0]['items'])):
    track_ids.append(personal_data[0]['items'][i]['track']['id']) # Track ID
    album_names.append(personal_data[0]['items'][i]['track']['album']['name']) # Album Name
    artist_names.append(personal_data[0]['items'][i]['track']['artists'][0]['name']) # Artist Name
    track_names.append(personal_data[0]['items'][i]['track']['name']) # Track Name

In [7]:
list_dic={'track_id':track_ids,
          'album_name':album_names,
          'artist_name':artist_names,
          'track_name':track_names,
    }

In [8]:
df1=pd.DataFrame(list_dic)
df1

Unnamed: 0,track_id,album_name,artist_name,track_name
0,0klnCzWGjoRQBjoPYe44Gl,Here Comes the End (feat. Judith Hill),Gerard Way,Here Comes the End (feat. Judith Hill)
1,0klnCzWGjoRQBjoPYe44Gl,Here Comes the End (feat. Judith Hill),Gerard Way,Here Comes the End (feat. Judith Hill)
2,0klnCzWGjoRQBjoPYe44Gl,Here Comes the End (feat. Judith Hill),Gerard Way,Here Comes the End (feat. Judith Hill)
3,64lsIF5pw0sJY0gV5kz0RN,Fuzzybrain,Dayglow,Can I Call You Tonight?
4,64lsIF5pw0sJY0gV5kz0RN,Fuzzybrain,Dayglow,Can I Call You Tonight?
5,64lsIF5pw0sJY0gV5kz0RN,Fuzzybrain,Dayglow,Can I Call You Tonight?
6,4GJvolzpuvX8Fp38V1UQCT,Bad Things,Social Animals,Bad Things
7,5vNd5a48igf00CALQgTso1,Comme Ça,South of France,Comme Ça
8,3iRiQJsQmH0yPIOJn3Y4WQ,Beacon,Matt Duncan,Heart & Arrow
9,0VjIjW4GlUZAMYd2vXMi3b,After Hours,The Weeknd,Blinding Lights


## Step 3: Pull User's Top Artists & Tracks

In [9]:
username = config.username
client_id = config.client_id
client_secret = config.client_secret
redirect_uri = 'http://localhost:7777/callback'
scope = 'user-top-read'

auth_token = util.prompt_for_user_token(username=username, 
                                   scope=scope, 
                                   client_id=client_id,   
                                   client_secret=client_secret,     
                                   redirect_uri=redirect_uri)

In [10]:
base_url = 'https://api.spotify.com/v1/me/top/tracks?'
#track_id = '6y0igZArWVi6Iz0rj35c1Y'

#2. Authentication
#3. Parameters -- would be stored with authentication
headers = {
    "Authorization": f"Bearer {auth_token}"
}

#4. Create an empty list
top_track_data = [] #would be good explore how to capture data at different points in time
r = requests.get(base_url+"time_range=medium_term"+"&&limit=50", headers=headers)
top_track_data.append(json.loads(r.text))

In [11]:
top_track_data[0]

{'items': [{'album': {'album_type': 'ALBUM',
    'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/06HL4z0CvFAxyc27GXpf02'},
      'href': 'https://api.spotify.com/v1/artists/06HL4z0CvFAxyc27GXpf02',
      'id': '06HL4z0CvFAxyc27GXpf02',
      'name': 'Taylor Swift',
      'type': 'artist',
      'uri': 'spotify:artist:06HL4z0CvFAxyc27GXpf02'}],
    'available_markets': ['AD',
     'AE',
     'AR',
     'AT',
     'AU',
     'BE',
     'BG',
     'BH',
     'BO',
     'BR',
     'CA',
     'CH',
     'CL',
     'CO',
     'CR',
     'CY',
     'CZ',
     'DE',
     'DK',
     'DO',
     'DZ',
     'EC',
     'EE',
     'EG',
     'ES',
     'FI',
     'FR',
     'GB',
     'GR',
     'GT',
     'HK',
     'HN',
     'HU',
     'ID',
     'IE',
     'IL',
     'IN',
     'IS',
     'IT',
     'JO',
     'JP',
     'KW',
     'LB',
     'LI',
     'LT',
     'LU',
     'LV',
     'MA',
     'MC',
     'MT',
     'MX',
     'MY',
     'NI',
     'NL',
     'NO',
 

In [16]:
track_idss = []
album_namess = []
album_relase_datess = []
artist_namess = []
track_namess = []

for i in range(len(top_track_data[0]['items'])):
    track_idss.append(top_track_data[0]['items'][i]['id']) # Track ID
    album_namess.append(top_track_data[0]['items'][i]['album']['name']) # Album Name
    album_relase_datess.append(top_track_data[0]['items'][i]['album']['release_date'])
    artist_namess.append(top_track_data[0]['items'][i]['album']['artists'][0]['name']) # Artist Name
    track_namess.append(top_track_data[0]['items'][i]['name']) # Track Name

In [17]:
list_dic2={'track_id':track_idss,
           'track_name':track_namess,
           'album_name':album_namess,
           'artist_name':artist_namess,
           'album_relase_date':album_relase_datess,
    }

In [18]:
df2=pd.DataFrame(list_dic2)
df2

Unnamed: 0,track_id,track_name,album_name,artist_name,album_relase_date
0,4pvb0WLRcMtbPGmtejJJ6y,exile (feat. Bon Iver),folklore,Taylor Swift,2020-07-24
1,6xZ4Q2k2ompmDppyeESIY8,Level of Concern,Level of Concern,Twenty One Pilots,2020-04-09
2,4w2tfK0JA8KrVegKnxukf4,The Kids Are Alright,404,Barns Courtney,2019-09-06
3,1ci0BoqpvH73L2TJzHhw9y,Modern Chemistry,Tell All Your Friend,Okey Dokey,2019-03-22
4,0VjIjW4GlUZAMYd2vXMi3b,Blinding Lights,After Hours,The Weeknd,2020-03-20
5,6xQpOC55lufcqXvSzp7GTb,Hard To Be Alone,Hard To Be Alone,Barns Courtney,2020-07-10
6,1uddOsj7TyRA13hnS2yDyk,Can't Take My Eyes Off You,Can't Take My Eyes Off You / 3BadSoSad - Edit,Private Island,2019-11-15
7,2nUV1fiD45RN6cQZ85GDc1,Salt (Nlmg),The Light Left Over,Ben Hon,2019-04-05
8,3flgdcFBWI84DPi4s1jhhd,Savannah,Something You Needed,Flipturn,2020-02-07
9,3FRJFImdfX5NSY3QH3jI4u,Glistening,Something You Needed,Flipturn,2020-02-07


## Step 4: Pull Track Info
- API Doc: https://developer.spotify.com/documentation/web-api/reference/tracks/get-audio-analysis/

In [59]:
track_id_ls = df2['track_id'].tolist()

In [60]:
track_id_ls

['4pvb0WLRcMtbPGmtejJJ6y',
 '6xZ4Q2k2ompmDppyeESIY8',
 '4w2tfK0JA8KrVegKnxukf4',
 '1ci0BoqpvH73L2TJzHhw9y',
 '0VjIjW4GlUZAMYd2vXMi3b',
 '6xQpOC55lufcqXvSzp7GTb',
 '1uddOsj7TyRA13hnS2yDyk',
 '2nUV1fiD45RN6cQZ85GDc1',
 '3flgdcFBWI84DPi4s1jhhd',
 '3FRJFImdfX5NSY3QH3jI4u',
 '3iRiQJsQmH0yPIOJn3Y4WQ',
 '5PNAsZO4JyT8fUzPyKwZ7W',
 '4ZdmTNaBTErD8n9AQE0YaX',
 '0elCmyfISzkP5tAYTVuYjS',
 '00c805RUf4tZyrlycaA2AQ',
 '5vNd5a48igf00CALQgTso1',
 '1MgV7FIyNxIG7WzMRJV5HC',
 '1RgmiiAY2FVLuGcG04ah4F',
 '2n33zaHpprIqOiSyoC8I5F',
 '4kiOoB3NVLKrs61jTNieAr',
 '7dz48pntblPzJ9mTPiUH81',
 '0ZNU020wNYvgW84iljPkPP',
 '3hUxzQpSfdDqwM3ZTFQY0K',
 '7kt9e9LFSpN1zQtYEl19o1',
 '7Cy7wa746ywgkwIOFJtSor',
 '6YjFI4i4mUHL54T67ucGj6',
 '2t0TmblCJctqK13OyRdSmD',
 '72z1OAURj2XwHbZdBg3zpV',
 '2BIfG6wL1t5wk1KixoK2BV',
 '7atYf4ccJ8L5QA4dzXcQN1',
 '0XZV3zE3j10RPG6Zvxsb6n',
 '2Eeur20xVqfUoM3Q7EFPFt',
 '5wxjGTx4Q8esdYMd7SWHZI',
 '0FDtfURDoNGv0sjNnvscPR',
 '5G1sTBGbZT5o4PNRc75RKI',
 '2NmsngXHeC1GQ9wWrzhOMf',
 '38orSZZSkX6gxDC2enAM4k',
 

In [70]:
track_data = []
for id in tqdm(track_id_ls):
    base_url = f'https://api.spotify.com/v1/audio-analysis/{id}?'

    #2. Authentication
    #3. Parameters -- would be stored with authentication
    headers = {
        "Authorization": f"Bearer {auth_token}"
    }

    r = requests.get(base_url, headers=headers)
    track_data.append(json.loads(r.text))

100%|██████████| 50/50 [00:40<00:00,  1.22it/s]


In [74]:
len(track_data)

50

In [113]:
duration_ls = []
loudness_ls = []
tempo_ls = []
key_ls = []
mode_ls = []

for i in tqdm(range(len(track_data))):
    duration_ls.append(track_data[i]['track']['duration'])
    loudness_ls.append(track_data[i]['track']['loudness'])
    tempo_ls.append(track_data[i]['track']['tempo'])
    key_ls.append(track_data[i]['track']['key'])
    mode_ls.append(track_data[i]['track']['mode'])

100%|██████████| 50/50 [00:00<00:00, 1667.72it/s]


In [114]:
list_dic3={'track_id':track_id_ls,
           'duration':duration_ls,
           'loudness':loudness_ls,
           'tempo':tempo_ls,
           'key':key_ls,
           'mode':mode_ls
    }

In [127]:
df3=pd.DataFrame(list_dic3)
df3

Unnamed: 0,track_id,duration,loudness,tempo,key,mode
0,4pvb0WLRcMtbPGmtejJJ6y,285.63416,-8.426,75.602,6,1
1,6xZ4Q2k2ompmDppyeESIY8,220.05138,-7.34,122.012,4,0
2,4w2tfK0JA8KrVegKnxukf4,235.77333,-4.722,122.036,3,1
3,1ci0BoqpvH73L2TJzHhw9y,230.21333,-7.506,89.997,5,1
4,0VjIjW4GlUZAMYd2vXMi3b,200.04,-5.934,171.005,1,1
5,6xQpOC55lufcqXvSzp7GTb,182.05333,-6.52,94.087,3,1
6,1uddOsj7TyRA13hnS2yDyk,209.0,-5.837,123.996,5,0
7,2nUV1fiD45RN6cQZ85GDc1,202.84,-7.833,136.588,6,1
8,3flgdcFBWI84DPi4s1jhhd,69.07184,-20.142,104.443,4,1
9,3FRJFImdfX5NSY3QH3jI4u,247.55905,-12.066,127.011,3,1


## WIP

In [134]:

for i in df3[['mode']]:
    if i == 1:
        df3['mode1'] = 'Major'
    else:
        df3['mode1'] = 'Minor'

In [135]:
df3

Unnamed: 0,track_id,duration,loudness,tempo,key,mode,mode1
0,4pvb0WLRcMtbPGmtejJJ6y,285.63416,-8.426,75.602,6,1,Minor
1,6xZ4Q2k2ompmDppyeESIY8,220.05138,-7.34,122.012,4,0,Minor
2,4w2tfK0JA8KrVegKnxukf4,235.77333,-4.722,122.036,3,1,Minor
3,1ci0BoqpvH73L2TJzHhw9y,230.21333,-7.506,89.997,5,1,Minor
4,0VjIjW4GlUZAMYd2vXMi3b,200.04,-5.934,171.005,1,1,Minor
5,6xQpOC55lufcqXvSzp7GTb,182.05333,-6.52,94.087,3,1,Minor
6,1uddOsj7TyRA13hnS2yDyk,209.0,-5.837,123.996,5,0,Minor
7,2nUV1fiD45RN6cQZ85GDc1,202.84,-7.833,136.588,6,1,Minor
8,3flgdcFBWI84DPi4s1jhhd,69.07184,-20.142,104.443,4,1,Minor
9,3FRJFImdfX5NSY3QH3jI4u,247.55905,-12.066,127.011,3,1,Minor


In [104]:
type(test['key'])

pandas.core.series.Series

In [105]:
music_dic = {
    0: 'C',
    1: 'C#/Db',
    2: 'D',
    3: 'D#/Eb',
    4: 'E',
    5: 'F',
    6: 'F#/Gb',
    7: 'G',
    8: 'G#/Ab',
    9: 'A',
    10: 'A#/Bb',
    11: 'B'
}

In [110]:
test_dict = {v : k for k, v in music_dic.items()}

In [111]:
test['key'] = test['key'].map(test_dict)

In [112]:
test

Unnamed: 0,track_id,duration,loudness,tempo,key
0,4pvb0WLRcMtbPGmtejJJ6y,285.63416,-8.426,75.602,
1,6xZ4Q2k2ompmDppyeESIY8,220.05138,-7.34,122.012,
2,4w2tfK0JA8KrVegKnxukf4,235.77333,-4.722,122.036,
3,1ci0BoqpvH73L2TJzHhw9y,230.21333,-7.506,89.997,
4,0VjIjW4GlUZAMYd2vXMi3b,200.04,-5.934,171.005,
5,6xQpOC55lufcqXvSzp7GTb,182.05333,-6.52,94.087,
6,1uddOsj7TyRA13hnS2yDyk,209.0,-5.837,123.996,
7,2nUV1fiD45RN6cQZ85GDc1,202.84,-7.833,136.588,
8,3flgdcFBWI84DPi4s1jhhd,69.07184,-20.142,104.443,
9,3FRJFImdfX5NSY3QH3jI4u,247.55905,-12.066,127.011,


In [63]:
print(test[0]['track']['duration'] / 60)
print(test[0]['track']['loudness'])
print(test[0]['track']['tempo'])
print(test[0]['track']['key'])

4.760569333333334
-8.426
75.602
6


In [64]:
time = test[0]['track']['duration']
minutes = time // 60
time %= 60
seconds = time

In [58]:
print("%d:%d" % (minutes, seconds))

4:45
