<a href="https://colab.research.google.com/github/ariel-hedvat/AdvancedMLDLCourseAssignments/blob/main/Experiments/data_extractuon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#installation for spotipy-API
!pip install spotipy



In [2]:
#imports
import spotipy
import pandas as pd
from spotipy.oauth2 import SpotifyClientCredentials
import os
import zipfile
import pandas as pd
import requests
import io
import subprocess
import time

In [3]:
# Function to read Spotify credentials from a URL
def read_spotify_credentials_from_url(url):
    credentials = {}
    response = requests.get(url)
    if response.status_code == 200:
        lines = response.text.split('\n')
        for line in lines:
            if '=' in line:
                key, value = line.strip().split('=')
                credentials[key] = value
    else:
        print("Failed to fetch Spotify credentials. Status code:", response.status_code)
    return credentials

In [4]:
# Function to unzip the dataset
def unzip_dataset(zip_file_url):
    response = requests.get(zip_file_url)
    with zipfile.ZipFile(io.BytesIO(response.content), 'r') as zip_ref:
        extracted_files = {}
        for file in zip_ref.namelist():
          if file.endswith('.csv'):
                extracted_files[file] = pd.read_csv(io.BytesIO(zip_ref.read(file)), encoding='utf-8')
    return extracted_files

In [5]:
# Unzip the dataset
#zip_file_url = 'https://raw.githubusercontent.com/ariel-hedvat/AdvancedMLDLCourseAssignments/main/Experiments/Data/full_spotify_dataset_recreated.zip'
zip_file_url = 'https://raw.githubusercontent.com/ariel-hedvat/AdvancedMLDLCourseAssignments/main/Experiments/Data/50.zip'
data_files = unzip_dataset(zip_file_url)

In [6]:
data_files

{'50.csv':                              user_id                        artistname  \
 0   9cc0cfd4d7d7885102480dd99e7a90d6                    Elvis Costello   
 1   9cc0cfd4d7d7885102480dd99e7a90d6  Elvis Costello & The Attractions   
 2   9cc0cfd4d7d7885102480dd99e7a90d6                      Tiffany Page   
 3   9cc0cfd4d7d7885102480dd99e7a90d6  Elvis Costello & The Attractions   
 4   9cc0cfd4d7d7885102480dd99e7a90d6                    Elvis Costello   
 5   9cc0cfd4d7d7885102480dd99e7a90d6                            Lissie   
 6   9cc0cfd4d7d7885102480dd99e7a90d6                    Paul McCartney   
 7   9cc0cfd4d7d7885102480dd99e7a90d6                          Joe Echo   
 8   9cc0cfd4d7d7885102480dd99e7a90d6                    Paul McCartney   
 9   9cc0cfd4d7d7885102480dd99e7a90d6                            Lissie   
 10  9cc0cfd4d7d7885102480dd99e7a90d6                      The Breakers   
 11  9cc0cfd4d7d7885102480dd99e7a90d6                    Paul McCartney   
 12  9cc0cfd4d7

In [7]:
# Function to extract track names and artist names into a dictionary
def extract_tracks_metadata(data_files):
    tracks_metadata = {}
    for df_name, df in data_files.items():
        print(f"Processing DataFrame: {df_name}")
        # Strip leading and trailing spaces from column names
        df.columns = df.columns.str.strip()
        print(f"Columns in DataFrame: {df.columns}")
        if 'trackname' in df.columns and 'artistname' in df.columns:
            print("Both 'trackname' and 'artistname' columns found.")
            for index, row in df.iterrows():
                track_name = row['trackname']
                artist_name = row['artistname']
                print(f"Extracting track: {track_name}, Artist: {artist_name}")
                if track_name not in tracks_metadata:
                    tracks_metadata[track_name] = artist_name
                else:
                    print(f"Track '{track_name}' already exists in metadata.")
        else:
            print("Either 'trackname' or 'artistname' columns missing in DataFrame.")
    return tracks_metadata

    #{trackname: artistname}


In [8]:
def get_track_metadata_from_api(tracks_metadata, sp):
    extended_tracks_metadata = {}
    for track_name, artist_name in tracks_metadata.items():
        # Search for the track
        try:
            results = sp.search(q=f"track:{track_name} artist:{artist_name}", type='track', limit=1)
            if results['tracks']['items']:
                track_info = results['tracks']['items'][0]
                artist_uri = track_info["artists"][0]["uri"]
                artist_info = sp.artist(artist_uri)
                track_features = sp.audio_features(track_info['uri'])[0]

                # Create a list with all the metadata
                metadata_list = [
                    artist_name,
                    track_features['acousticness'],
                    track_features['danceability'],
                    track_features['energy'],
                    track_features['instrumentalness'],
                    track_features['speechiness'],
                    track_features['liveness'],
                    track_features['loudness'],
                    track_features['tempo'],
                    track_features['valence'],
                    track_info['popularity'],
                    artist_info["genres"]
                ]

                # Add the metadata list to the extended_tracks_metadata dictionary
                extended_tracks_metadata[track_name] = metadata_list
            else:
                print(f"No results found for track: {track_name}")
        except sp.SpotifyException as e:
            if e.http_status == 429:  # Too Many Requests
                print("Rate limit exceeded. Waiting before retrying...")
                time.sleep(10)  # Wait for 10 seconds before retrying
            else:
                print(f"Error while fetching track metadata: {e}")
    return extended_tracks_metadata


 #{trackname: [artistname, acousticness, danceability, energy, instrumentalness, speechiness, liveness, loudness, tempo, valence, popularity, genres]}

In [9]:
from google.colab import files

# Function to save tracks metadata as a CSV file
def save_tracks_metadata_to_csv(extended_tracks_metadata, output_file):
    # Create a DataFrame from tracks_metadata dictionary
    df = pd.DataFrame.from_dict(extended_tracks_metadata, orient='index')

    # Reset index and rename columns
    df.reset_index(inplace=True)
    df.columns = ['Track Name', 'Artist Name', 'Acousticness', 'Danceability', 'Energy', 'Instrumentalness',
                  'Speechiness', 'Liveness', 'Loudness', 'Tempo', 'Valence', 'Popularity', 'Genres']

    # Save DataFrame to CSV file
    df.to_csv(output_file, index=False)

    # Download the CSV file
    files.download(output_file)

In [10]:
# Extract track metadata
tracks_metadata = extract_tracks_metadata(data_files)

Processing DataFrame: 50.csv
Columns in DataFrame: Index(['user_id', 'artistname', 'trackname', 'playlistname'], dtype='object')
Both 'trackname' and 'artistname' columns found.
Extracting track: (The Angels Wanna Wear My) Red Shoes, Artist: Elvis Costello
Extracting track: (What's So Funny 'Bout) Peace, Love And Understanding, Artist: Elvis Costello & The Attractions
Extracting track: 7 Years Too Late, Artist: Tiffany Page
Extracting track: Accidents Will Happen, Artist: Elvis Costello & The Attractions
Extracting track: Alison, Artist: Elvis Costello
Extracting track: All Be Okay, Artist: Lissie
Extracting track: Band On The Run, Artist: Paul McCartney
Extracting track: Beautiful, Artist: Joe Echo
Extracting track: Blackbird - Live at CitiField, NYC - Digital Audio, Artist: Paul McCartney
Extracting track: Bright Side, Artist: Lissie
Extracting track: Dance The Go-Go, Artist: The Breakers
Extracting track: Dance Tonight, Artist: Paul McCartney
Extracting track: Decision Time, Artist:

In [11]:
tracks_metadata

{'(The Angels Wanna Wear My) Red Shoes': 'Elvis Costello',
 "(What's So Funny 'Bout) Peace, Love And Understanding": 'Elvis Costello & The Attractions',
 '7 Years Too Late': 'Tiffany Page',
 'Accidents Will Happen': 'Elvis Costello & The Attractions',
 'Alison': 'Elvis Costello',
 'All Be Okay': 'Lissie',
 'Band On The Run': 'Paul McCartney',
 'Beautiful': 'Joe Echo',
 'Blackbird - Live at CitiField, NYC - Digital Audio': 'Paul McCartney',
 'Bright Side': 'Lissie',
 'Dance The Go-Go': 'The Breakers',
 'Dance Tonight': 'Paul McCartney',
 'Decision Time': 'The Coronas',
 "Don't Dream It's Over": 'Crowded House',
 'Everywhere I Go': 'Lissie',
 'Fall At Your Feet': 'Crowded House',
 'Four Seasons In One Day': 'Crowded House',
 'Friend Like You': 'Joshua Radin',
 'Get Lost Get High Get Sick': 'The Breakers',
 'Goodtimes': 'Cocktail Slippers',
 'Helplessly Hoping': 'Crosby, Stills & Nash',
 'Here Before': 'Lissie',
 'Here We Go': 'Cocktail Slippers',
 'I Am The Blaze': 'Tiffany Page',
 "I Do

In [12]:
# Read Spotify credentials from URL
spotify_credentials_url = 'https://raw.githubusercontent.com/ariel-hedvat/AdvancedMLDLCourseAssignments/main/Experiments/Data/eitans_spotify_credentials.txt'
spotify_credentials = read_spotify_credentials_from_url(spotify_credentials_url)

# Initialize Spotipy client with the obtained credentials
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=spotify_credentials['CLIENT_ID'],
                                                           client_secret=spotify_credentials['CLIENT_SECRET']))

In [13]:
# Assuming tracks_metadata is already populated with track names and artist names
# Call the function to retrieve metadata for each track
extended_tracks_metadata = get_track_metadata_from_api(tracks_metadata, sp)

No results found for track: 7 Years Too Late
No results found for track: Blackbird - Live at CitiField, NYC - Digital Audio
No results found for track: Bright Side
No results found for track: I'd Rather Be With You [Radio Edit]
No results found for track: Oliver's Army


In [14]:
extended_tracks_metadata

{'(The Angels Wanna Wear My) Red Shoes': ['Elvis Costello',
  0.0556,
  0.577,
  0.636,
  0.00016,
  0.0491,
  0.226,
  -9.825,
  134.768,
  0.901,
  41,
  ['new wave',
   'new wave pop',
   'permanent wave',
   'power pop',
   'singer-songwriter']],
 "(What's So Funny 'Bout) Peace, Love And Understanding": ['Elvis Costello & The Attractions',
  0.000466,
  0.381,
  0.918,
  0.0511,
  0.0803,
  0.321,
  -9.451,
  142.34,
  0.368,
  44,
  ['art rock',
   'heartland rock',
   'new romantic',
   'new wave',
   'new wave pop',
   'power pop']],
 'Accidents Will Happen': ['Elvis Costello & The Attractions',
  0.0441,
  0.608,
  0.597,
  0.00342,
  0.0276,
  0.285,
  -11.115,
  120.077,
  0.736,
  38,
  ['art rock',
   'heartland rock',
   'new romantic',
   'new wave',
   'new wave pop',
   'power pop']],
 'Alison': ['Elvis Costello',
  0.739,
  0.557,
  0.32,
  3.99e-05,
  0.0438,
  0.112,
  -10.792,
  176.647,
  0.377,
  56,
  ['new wave',
   'new wave pop',
   'permanent wave',
   'power

In [16]:
# Save tracks metadata to CSV
output_file = 'tracks_metadata.csv'
save_tracks_metadata_to_csv(extended_tracks_metadata, output_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>