In [67]:
#installation for spotipy-API
!pip install spotipy

#imports
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import os
import zipfile
import pandas as pd
import requests
import io
import subprocess
import time
from io import BytesIO

import warnings
warnings.filterwarnings("ignore")



In [68]:
# Function to read Spotify credentials from a URL
def read_spotify_credentials_from_url(url):
    """Download a ``KEY=VALUE`` text file and parse it into a dictionary.

    Parameters
    ----------
    url : str
        Address of a plain-text file holding one ``KEY=VALUE`` pair per line.

    Returns
    -------
    dict
        Mapping of key to value (both stripped of surrounding whitespace).
        Empty when the server does not answer with HTTP 200; in that case a
        message with the status code is printed.

    Notes
    -----
    Only the first ``=`` of a line separates key from value, so values that
    themselves contain ``=`` (for example base64 padding) are kept intact.
    Blank lines, lines starting with ``#`` and lines without ``=`` are skipped.
    """
    credentials = {}
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        print("Failed to fetch Spotify credentials. Status code:", response.status_code)
        return credentials

    for raw_line in response.text.splitlines():
        line = raw_line.strip()
        if not line or line.startswith('#'):
            continue
        # partition() splits on the first '=' only and never raises.
        key, separator, value = line.partition('=')
        key = key.strip()
        if separator and key:
            credentials[key] = value.strip()
    return credentials

In [69]:
def unzip_csv(zip_file_path, csv_file_name):
    """Extract one member of a local zip archive and load it as a DataFrame.

    The member named ``csv_file_name`` is written below the current working
    directory, then read back with ``pandas.read_csv`` using that same
    relative name.

    Parameters
    ----------
    zip_file_path : str
        Path of the zip archive on disk.
    csv_file_name : str
        Name of the archive member to extract and read.

    Returns
    -------
    pandas.DataFrame
        Contents of the extracted CSV file.
    """
    archive = zipfile.ZipFile(zip_file_path, 'r')
    try:
        # Unpack only the requested member, next to the notebook.
        archive.extract(csv_file_name, path='.')
    finally:
        archive.close()

    # Load the file that was just written to disk.
    return pd.read_csv(csv_file_name)

In [70]:
def load_csv_from_zip(zip_file_url, csv_file_name):
    """Download a zip archive and load one CSV member into a DataFrame.

    The archive is kept in memory; nothing is written to disk.

    Parameters
    ----------
    zip_file_url : str
        URL of the zip archive.
    csv_file_name : str
        Name of the CSV member to read. A member whose full path equals this
        name is preferred, then one whose base name equals it. If neither
        exists, the first ``.csv`` member of the archive is read instead.

    Returns
    -------
    pandas.DataFrame
        Contents of the selected CSV member.

    Raises
    ------
    Exception
        If the download does not return HTTP 200, or the archive contains no
        ``.csv`` member.
    """
    # Download the zip file; the timeout prevents hanging on a stalled connection.
    response = requests.get(zip_file_url, timeout=60)
    if response.status_code != 200:
        raise Exception("Failed to download the zip file")

    # Open the archive straight from the downloaded bytes.
    with zipfile.ZipFile(BytesIO(response.content), 'r') as zip_ref:
        csv_files = [f for f in zip_ref.namelist() if f.endswith('.csv')]
        if not csv_files:
            raise Exception("No CSV file found in the zip")

        # Honour the requested name; fall back to the first CSV when it is absent.
        exact = [f for f in csv_files if f == csv_file_name]
        by_basename = [f for f in csv_files if f.rsplit('/', 1)[-1] == csv_file_name]
        member = (exact or by_basename or csv_files)[0]

        with zip_ref.open(member) as csv_file:
            return pd.read_csv(csv_file)

In [71]:
# Download the zipped playlist dataset from GitHub and load the CSV it contains
# into a DataFrame (the archive is read in memory by load_csv_from_zip).
zip_file_url = 'https://raw.githubusercontent.com/ariel-hedvat/AdvancedMLDLCourseAssignments/main/Experiments/Data/spotify_dataset_recreated.zip'
csv_file_name = 'spotify_dataset_recreated.csv'
data_files = load_csv_from_zip(zip_file_url, csv_file_name)

In [72]:
# (rows, columns) of the raw dataset.
# NOTE(review): the recorded 1,048,575 rows equals Excel's per-sheet row limit
# (1,048,576) minus one header row - the CSV may have been truncated by a
# spreadsheet export; confirm against the original data.
data_files.shape

(1048575, 4)

In [73]:
# # zip_file_path = 'https://raw.githubusercontent.com/ariel-hedvat/AdvancedMLDLCourseAssignments/main/Experiments/Data/spotify_dataset_recreated.zip'
# zip_file_path = "C:/Users/eitan/OneDrive/Desktop/TAU/Third Year/ML n DL/Final Project/spotify_playlists/Data/spotify_dataset_recreated.zip"
# csv_file_name = "spotify_dataset_recreated.csv" 
# output_folder = "C:/Users/eitan/OneDrive/Desktop/TAU/Third Year/ML n DL/Final Project/spotify_playlists/Data"
# data_files = unzip_csv(zip_file_path, csv_file_name)

In [74]:
# Unique (track, artist) pairs taken from the raw rows, ordered by track name.
tracks_metadata = (
    data_files[['trackname', 'artistname']]
    .drop_duplicates()
    .sort_values(by='trackname')
)

In [75]:
# (rows, columns) after de-duplicating the (trackname, artistname) pairs.
tracks_metadata.shape

(483397, 2)

In [76]:
# Per-column flag: does the column contain at least one missing value?
# NOTE(review): `missing_values` is not used by any cell visible in this notebook.
missing_values = tracks_metadata.isna().any()

# Per-column number of missing values; shown as this cell's output.
missing_values_count = tracks_metadata.isna().sum()
missing_values_count

trackname        6
artistname    2032
dtype: int64

In [77]:
# Remove rows with missing values: both the track name and the artist name are
# needed to build the Spotify search query later on.
cleaned_tracks_metadata = tracks_metadata.dropna()

In [78]:
# (rows, columns) left after dropping rows with a missing track or artist name.
cleaned_tracks_metadata.shape

(481360, 2)

In [79]:
def get_track_metadata_from_api(tracks_metadata_df, sp, verbose=True):
    """Enrich (track, artist) pairs with metadata fetched from the Spotify Web API.

    For every row, the first search hit for ``track:<name> artist:<name>`` is
    combined with its audio features and with the genres of its first listed
    artist.

    Parameters
    ----------
    tracks_metadata_df : pandas.DataFrame
        Must provide ``trackname`` and ``artistname`` columns.
    sp : object
        Client exposing ``search``, ``audio_features`` and ``artist`` the way
        they are called below (a ``spotipy.Spotify`` instance in this notebook).
    verbose : bool, default True
        Print one progress line per processed row. Pass ``False`` for large
        inputs to avoid flooding the cell output. Errors and the final summary
        are always printed.

    Returns
    -------
    pandas.DataFrame
        One row per track that was found and has audio features, sorted by
        ``trackname`` then ``artistname``. ``trackname`` and ``artistname``
        hold the input values unchanged; ``genres`` holds a list.

    Notes
    -----
    * The first exception raised while processing a row (for example an
      HTTP 429 rate-limit error) is printed and stops the loop; everything
      collected up to that point is still returned.
    * Genres are cached per artist URI, and the artist is only requested once
      audio features are known to exist, so no API call is spent on repeated
      artists or on tracks that are discarded anyway.
    * Names are stripped of surrounding whitespace for the search query only.
    * The musical ``key`` is not collected.
    """
    columns = ['track_id', 'trackname', 'artistname', 'duration_ms', 'explicit', 'acousticness', 'danceability', 'energy', 'instrumentalness', 'speechiness', 'liveness', 'loudness', 'tempo', 'valence', 'popularity', 'release_date', 'genres']
    extended_tracks_metadata = []
    genres_by_artist_uri = {}  # artist URI -> list of genres already fetched
    start_time = time.time()  # Record start time
    tracks_added = 0  # Tracks successfully enriched
    tracks_not_found = 0  # Tracks without a search hit or without audio features

    try:
        for index, row in tracks_metadata_df.iterrows():
            track_name = row['trackname']
            artist_name = row['artistname']

            try:
                # Surrounding whitespace in the names would end up inside the
                # field filters of the query, so strip it for the search only.
                query = f"track:{str(track_name).strip()} artist:{str(artist_name).strip()}"
                results = sp.search(q=query, type='track', limit=1)
                items = results['tracks']['items']
                if not items:
                    if verbose:
                        print(f"{index}. No results found for track '{track_name} by {artist_name}'")
                    tracks_not_found += 1
                    continue

                track_info = items[0]
                track_id = track_info['id']
                duration_ms = track_info['duration_ms']
                explicit = track_info['explicit']

                # An empty/None feature list is treated like a missing entry.
                features_list = sp.audio_features(track_info['uri'])
                track_features = features_list[0] if features_list else None
                if track_features is None:
                    if verbose:
                        print(f"{index}. No audio features found for track '{track_name}'")
                    tracks_not_found += 1
                    continue  # Skip this track if no audio features are found

                # Look the artist up only now, and only once per artist.
                artist_uri = track_info["artists"][0]["uri"]
                if artist_uri not in genres_by_artist_uri:
                    genres_by_artist_uri[artist_uri] = sp.artist(artist_uri)["genres"]
                # Copy so rows of the same artist do not share one list object.
                genres = list(genres_by_artist_uri[artist_uri])

                # Extract release date
                release_date = track_info.get('album', {}).get('release_date', '')

                # Values in the same order as `columns`.
                metadata_list = [
                    track_id,
                    track_name,
                    artist_name,
                    duration_ms,
                    explicit,
                    track_features['acousticness'],
                    track_features['danceability'],
                    track_features['energy'],
                    track_features['instrumentalness'],
                    track_features['speechiness'],
                    track_features['liveness'],
                    track_features['loudness'],
                    track_features['tempo'],
                    track_features['valence'],
                    track_info['popularity'],
                    release_date,
                    genres,
                ]

                extended_tracks_metadata.append(metadata_list)
                if verbose:
                    print(f"{index}. Extended metadata for track '{track_name} by {artist_name}': {metadata_list}")
                tracks_added += 1
            except Exception as e:  # Any failure ends the run, keeping partial results
                print(f"Error while fetching track metadata: {e}")
                break

    except Exception as outer_e:
        # Failures outside the per-row handling (e.g. while iterating the input).
        print(f"Outer Exception: {outer_e}")

    # Convert the list of lists to a DataFrame
    extended_tracks_df = pd.DataFrame(extended_tracks_metadata, columns=columns)

    # Sort the DataFrame by trackname and artistname (index labels are kept)
    extended_tracks_df = extended_tracks_df.sort_values(by=['trackname', 'artistname'])

    # Print total elapsed time and the counters
    elapsed_time = time.time() - start_time
    print("\n---------------------------------------------------------------")
    print(f"Total elapsed time: {elapsed_time:.2f} seconds")
    print(f"Songs found: {tracks_added}")
    print(f"Songs not found: {tracks_not_found}")
    return extended_tracks_df


In [80]:
# Obtain the Spotify API credentials.
# Preferred source: the SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET environment
# variables, so the secret does not have to live in a repository.
# SECURITY NOTE(review): the fallback below reads the credentials from a text
# file in a public GitHub repository - anyone can read that secret. Rotate it
# and remove the file once the environment variables are in place.
spotify_credentials_url = 'https://raw.githubusercontent.com/ariel-hedvat/AdvancedMLDLCourseAssignments/main/Experiments/Data/eitans_spotify_credentials.txt'
env_client_id = os.environ.get('SPOTIPY_CLIENT_ID')
env_client_secret = os.environ.get('SPOTIPY_CLIENT_SECRET')
if env_client_id and env_client_secret:
    spotify_credentials = {'CLIENT_ID': env_client_id, 'CLIENT_SECRET': env_client_secret}
else:
    spotify_credentials = read_spotify_credentials_from_url(spotify_credentials_url)

# Fail with a clear message instead of a bare KeyError when the download
# failed or the file lacks one of the expected keys.
missing_credential_keys = [k for k in ('CLIENT_ID', 'CLIENT_SECRET') if not spotify_credentials.get(k)]
if missing_credential_keys:
    raise RuntimeError(f"Spotify credentials are missing: {', '.join(missing_credential_keys)}")

# Initialize Spotipy client with the obtained credentials
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=spotify_credentials['CLIENT_ID'],
                                                           client_secret=spotify_credentials['CLIENT_SECRET']))

In [81]:
# Renumber the cleaned rows from 1 (fresh positional index shifted off zero)
# and preview the first few rows.
cleaned_tracks_metadata = cleaned_tracks_metadata.reset_index(drop=True)
cleaned_tracks_metadata.index = cleaned_tracks_metadata.index + 1
cleaned_tracks_metadata.head()

Unnamed: 0,trackname,artistname
1,"Jesús ""Triana""","Nacho Ruiz, Manuel Pérez Lolo"""
2,"Jr./Raphael Saadiq/Stevie Wonder""","Charles CJ"" Hilton"
3,nº 2,Banda de la División de Infantería Mecanizada ...
4,!!!!!!!,The Roots
5,!H.a.p.p.y!,Dawid Podsiadlo


In [82]:
# First 50 rows of the cleaned tracks, as a small sample.
# NOTE(review): `partitioned_df` is not used by any cell visible in this
# notebook - the API call is given the full frame instead.
partitioned_df = cleaned_tracks_metadata[:50]
# TODO: features to extract from the Spotify API: id, duration_ms, explicit, key
# (`key` is not collected yet by get_track_metadata_from_api).

In [83]:
# Query the Spotify API for every cleaned (track, artist) pair.
# NOTE(review): this passes the full `cleaned_tracks_metadata` frame (481,360
# rows according to the shape displayed earlier), not the 50-row
# `partitioned_df`; the recorded run was stopped by an HTTP 429 (rate limit)
# within seconds - confirm which input is intended.
extended_tracks_metadata = get_track_metadata_from_api(cleaned_tracks_metadata, sp)

1. No results found for track ' Jesús "Triana" by Nacho Ruiz, Manuel Pérez Lolo"'
2. No results found for track ' Jr./Raphael Saadiq/Stevie Wonder" by Charles CJ" Hilton'
3. No results found for track ' nº 2 by Banda de la División de Infantería Mecanizada Guzmán el Bueno"'


Max Retries reached


Error while fetching track metadata: http status: 429, code:-1 - /v1/audio-features/?ids=5hxu7IKCA8m9RkktHsnJtA:
 Max Retries, reason: too many 429 error responses

---------------------------------------------------------------
Total elapsed time: 2.68 seconds
Songs found: 0
Songs not found: 3


In [30]:
# Shift the index labels of the result up by one.
# NOTE(review): not idempotent - each re-run of this cell shifts the labels
# again. The execution count (In [30]) is also out of sequence with the cells
# before it, so this output comes from a different run order.
extended_tracks_metadata.index+=1

In [31]:
# Display the collected metadata (the recorded output shows only the header
# row, i.e. the frame was empty in that run).
extended_tracks_metadata

Unnamed: 0,track_id,trackname,artistname,duration_ms,explicit,acousticness,danceability,energy,instrumentalness,speechiness,liveness,loudness,tempo,valence,popularity,release_date,genres
