In [4]:
import os
import time
import pandas as pd
import requests
import json

**Paths**

In [9]:
DATA = '..\\..\\data'
TOKEN = f'{DATA}\\token'
TO_SCRAP = f'{DATA}\\to_scrap'
SCRAPED = f'{DATA}\\scraped'
ERRORS = f'{DATA}\\errors'

In [10]:
print(os.path.abspath(TOKEN))

C:\Users\juanm\OneDrive\Bureau\ESGI - Projets\4IABD\Projet Annuel\data\token


**Load Dataset**

In [227]:
df = pd.read_csv('data/dataset_to_erase.csv', usecols=['artist_name', 'track_name', 'release_date', 'genre'])

In [228]:
df.shape

(28372, 4)

In [229]:
df['release_date'] = [decade if str(decade)[-1] == 0 else f'{str(decade)[:3]}0' for decade in df.release_date]
df['release_date'] = df.release_date.astype(int)
df = df.sort_values(by='release_date')

In [None]:
df

In [231]:
df.release_date.value_counts()

2010    5631
2000    4781
1980    4675
1990    4457
1970    3951
1960    3409
1950    1468
Name: release_date, dtype: int64

**Setup For Spotify API Use**

In [18]:
spotify_api = 'https://api.spotify.com/v1'
playlists_ids = {1950: "3sivPPUaXu3xvFghYCafV6",
                 1960: "6BHjXb3Q1E3VIORY95nDoO",
                 1970: "7LXey9CwaP0JCyHVn1xk6Y",
                 1980: "2jqDPgE9kV1vS7jvMieUsZ",
                 1990: "09VspjofcWG61Man3lwnrH",
                 2000: "0QATGfhmGTGVbbDMHiIkb2",
                 2010: "6lLFDVFKzidhfJLJ1HsE4t"}

**Generate Token For Spotify API Requests**

In [13]:
# Generate token
def generate_token() -> str:
    f = open('data/token/auth_spotify_api.json')
    auth_spotify_api = json.load(f)
    response = requests.post('https://accounts.spotify.com/api/token',
                             data={
                                 'Content_type': 'application/x-www-form-urlencoded',
                                 'grant_type': 'client_credentials',
                                 'client_id': auth_spotify_api.get('client_id'),
                                 'client_secret': auth_spotify_api.get('client_secret')
                             }).json()['access_token']
    return response

In [14]:
def regen_token(token: str, timestamp: float) -> tuple[str, float]:
    t = time.time()
    if t - timestamp >= 3600:
        return generate_token(), t
    return token, timestamp

**Generate Dataset From Scratch / Reset Existing Dataset**

In [235]:
pd.DataFrame({
    'artist_name': pd.Series(dtype='str'),
    'track_name': pd.Series(dtype='str'),
    'release_date': pd.Series(dtype='str'),
    'genre': pd.Series(dtype='str'),
    'spotify_id': pd.Series(dtype='str'),
    'playlist_id': pd.Series(dtype='str')
}).to_csv('data/dataset.csv', index=False)

In [236]:
pd.DataFrame({
    'artist_name': pd.Series(dtype='str'),
    'track_name': pd.Series(dtype='str'),
    'release_date': pd.Series(dtype='str'),
    'genre': pd.Series(dtype='str'),
    'error_type': pd.Series(dtype='str')
}).to_csv('data/songs_on_error.csv', index=False)

**API Calls To Spotify**

In [237]:
timestamp = time.time()
token = generate_token()
for rows in df.values:
    token, timestamp = regen_token(token, timestamp)
    response = requests.get(f'{spotify_api}/search?q=track:{rows[1]}%20artist:{rows[0]}&type=track',
                            headers={
                                'Authorization': f'Bearer {token}'
                            })
    if response.status_code == 429:
        break
    if response.status_code == 200 and response.json()['tracks']['total'] > 0:
        pd.DataFrame({'artist_name': rows[0],
                      'track_name': rows[1],
                      'release_date': rows[2],
                      'genre': rows[3],
                      'spotify_id': response.json()['tracks']['items'][0]['id'],
                      'playlist_id': playlists_ids[rows[2]]}, index=[0]) \
            .to_csv('data/dataset.csv', index=False, header=False, mode='a')
    else:
        pd.DataFrame({'artist_name': rows[0],
                      'track_name': rows[1],
                      'release_date': rows[2],
                      'genre': rows[3],
                      'error_type': response.status_code}, index=[0]) \
            .to_csv('data/songs_on_error.csv', index=False, header=False, mode='a')

**Delete Rows Already Existing In Dataset**

In [9]:
dataset = pd.read_csv('data/dataset.csv')
dataset.release_date.value_counts()

2000    4533
1980    4414
1990    4257
1970    3714
1960    3170
2010    2157
1950    1344
Name: release_date, dtype: int64

In [241]:
to_remove = pd.read_csv('data/dataset_to_erase.csv', usecols=['artist_name', 'track_name', 'release_date', 'genre'])

In [247]:
concat = pd.concat([dataset, to_remove])
concat[concat.duplicated()].to_csv('data/dataset_to_erase.csv', index=False)

**Create Playlists From Dataset**

In [11]:
dataset = dataset.groupby(by='release_date')
dataset

AttributeError: 'DataFrameGroupBy' object has no attribute 'groupby'

In [31]:
timestamp = time.time()
token = generate_token()
for decade in playlists_ids.keys():
    token, timestamp = regen_token(token, timestamp)
    spotify_id = [f"spotify:track:{data}" for data in dataset['spotify_id'].get_group(decade)]
    print(f'{decade} : {len(spotify_id)}')
    print(playlists_ids[decade])
    for i in range(0, len(spotify_id), 100):
        response = requests.post(f'{spotify_api}/playlists/{playlists_ids[decade]}/tracks',
                                 data={
                                     'uris': spotify_id[i:i + 100]
                                 },
                                 headers={
                                     'Content-Type': 'application/x-www-form-urlencoded',
                                     'Authorization': f'Bearer {token}'
                                 })
        print(response)
        if response.status_code != 200:
            break
    break

1950 : 1344
3sivPPUaXu3xvFghYCafV6
<Response [403]>
