<a href="https://colab.research.google.com/github/ariel-hedvat/AdvancedMLDLCourseAssignments/blob/main/Experiments/data_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

TO-DO: Explain the purpose of this notebook

In [1]:
#installation for spotipy-API
!pip install spotipy



In [2]:
#imports
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import os
import zipfile
import pandas as pd
import requests
import io
import subprocess
import time

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Function to read Spotify credentials from a URL
def read_spotify_credentials_from_url(url):
    credentials = {}
    response = requests.get(url)
    if response.status_code == 200:
        lines = response.text.split('\n')
        for line in lines:
            if '=' in line:
                key, value = line.strip().split('=')
                credentials[key] = value
    else:
        print("Failed to fetch Spotify credentials. Status code:", response.status_code)
    return credentials

In [6]:
# Function to unzip the dataset
def unzip_dataset(zip_file_url):
    response = requests.get(zip_file_url)
    with zipfile.ZipFile(io.BytesIO(response.content), 'r') as zip_ref:
        extracted_files = {}
        for file in zip_ref.namelist():
          if file.endswith('.csv'):
                extracted_files[file] = pd.read_csv(io.BytesIO(zip_ref.read(file)), encoding='utf-8')
    return extracted_files

#The cols here: user_id, artistname, trackname, playlistname

In [12]:
# Get 3 users
def get_3_user_ids(extracted_files):
    user_ids = set()
    for df in extracted_files.values():
        if not df.empty:
            user_ids.update(df['user_id'].unique())
            if len(user_ids) >= 3:
                break
    return list(user_ids)[:3]

# Data for 3 users
def filter_data_for_users(extracted_files):
    # Get 3 user IDs
    user_ids = get_3_user_ids(extracted_files)

    # Filter data for the 3 user IDs
    filtered_data = {}
    for filename, df in extracted_files.items():
        filtered_df = df[df['user_id'].isin(user_ids)]
        filtered_data[filename] = filtered_df
    return filtered_data

In [28]:
from google.colab import files

def save_tracks_metadata_to_csv(three_users_data, output_file):
    # Concatenate DataFrames from three_users_data
    combined_df = pd.concat(three_users_data.values(), ignore_index=True)

    # Save DataFrame to CSV file
    combined_df.to_csv(output_file, index=False)

    # Download the CSV file
    files.download(output_file)

In [13]:
# Unzip the dataset
zip_file_url = 'https://raw.githubusercontent.com/ariel-hedvat/AdvancedMLDLCourseAssignments/main/Experiments/Data/full_spotify_dataset_recreated.zip'
#zip_file_url = 'https://raw.githubusercontent.com/ariel-hedvat/AdvancedMLDLCourseAssignments/main/Experiments/Data/50.zip'
data_files = unzip_dataset(zip_file_url)

In [14]:
three_users_data = filter_data_for_users(data_files)

In [15]:
data_files

{'spotify_dataset_recreated.csv':                                   user_id                        artistname  \
 0        9cc0cfd4d7d7885102480dd99e7a90d6                    Elvis Costello   
 1        9cc0cfd4d7d7885102480dd99e7a90d6  Elvis Costello & The Attractions   
 2        9cc0cfd4d7d7885102480dd99e7a90d6                      Tiffany Page   
 3        9cc0cfd4d7d7885102480dd99e7a90d6  Elvis Costello & The Attractions   
 4        9cc0cfd4d7d7885102480dd99e7a90d6                    Elvis Costello   
 ...                                   ...                               ...   
 1048570  576820c2da503406e16ae9782b3c0e4c                          Dr. John   
 1048571  576820c2da503406e16ae9782b3c0e4c                    Punch Brothers   
 1048572  576820c2da503406e16ae9782b3c0e4c                    Chicago Farmer   
 1048573  576820c2da503406e16ae9782b3c0e4c               Henrik Freischlader   
 1048574  576820c2da503406e16ae9782b3c0e4c                    Curtis Harding   
 
     

In [16]:
three_users_data

{'spotify_dataset_recreated.csv':                                   user_id               artistname  \
 561622   435faeede7f140d0ab8b4f7765558ecf              Terry Malts   
 561623   435faeede7f140d0ab8b4f7765558ecf            The Soft Moon   
 561624   435faeede7f140d0ab8b4f7765558ecf  Queens Of The Stone Age   
 561625   435faeede7f140d0ab8b4f7765558ecf             Julia Holter   
 561626   435faeede7f140d0ab8b4f7765558ecf          Crystal Castles   
 ...                                   ...                      ...   
 1011821  eecb23cd4578482fdd624fd2360256a1             Foo Fighters   
 1011822  eecb23cd4578482fdd624fd2360256a1             Foo Fighters   
 1011823  eecb23cd4578482fdd624fd2360256a1             Foo Fighters   
 1011824  eecb23cd4578482fdd624fd2360256a1             Foo Fighters   
 1011825  eecb23cd4578482fdd624fd2360256a1             Foo Fighters   
 
                  trackname  playlistname  
 561622   Can't Tell No One     AMPpod 12  
 561623            Die Li

In [29]:
# save the three users data to csv
output_file = 'three_users_data.csv'
save_tracks_metadata_to_csv(three_users_data, output_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [17]:
# Function to extract track names and artist names into a dictionary
def extract_tracks_metadata(data_files):
    tracks_metadata = {}
    for df_name, df in data_files.items():
        print(f"Processing DataFrame: {df_name}")
        # Strip leading and trailing spaces from column names
        df.columns = df.columns.str.strip()
        print(f"Columns in DataFrame: {df.columns}")
        if 'trackname' in df.columns and 'artistname' in df.columns:
            print("Both 'trackname' and 'artistname' columns found.")
            for index, row in df.iterrows():
                track_name = row['trackname']
                artist_name = row['artistname']
                print(f"Extracting track: {track_name}, Artist: {artist_name}")
                if track_name not in tracks_metadata:
                    tracks_metadata[track_name] = artist_name
                else:
                    print(f"Track '{track_name}' already exists in metadata.")
        else:
            print("Either 'trackname' or 'artistname' columns missing in DataFrame.")
    return tracks_metadata

    #{trackname: artistname}


In [18]:
def get_track_metadata_from_api(tracks_metadata, sp):
    extended_tracks_metadata = {}
    start_time = time.time()  # Record start time
    tracks_added = 0  # Initialize tracks counter
    tracks_not_found = 0  # Initialize counter for tracks not found
    for index, (track_name, artist_name) in enumerate(tracks_metadata.items(), start=1):
        # Search for the track
        try:
            results = sp.search(q=f"track:{track_name} artist:{artist_name}", type='track', limit=1)
            if results['tracks']['items']:
                track_info = results['tracks']['items'][0]
                artist_uri = track_info["artists"][0]["uri"]
                artist_info = sp.artist(artist_uri)
                track_features = sp.audio_features(track_info['uri'])[0]

                # Extract release date
                release_date = track_info.get('album', {}).get('release_date', '')

                # Create a list with all the metadata
                metadata_list = [
                    artist_name,
                    track_features['acousticness'],
                    track_features['danceability'],
                    track_features['energy'],
                    track_features['instrumentalness'],
                    track_features['speechiness'],
                    track_features['liveness'],
                    track_features['loudness'],
                    track_features['tempo'],
                    track_features['valence'],
                    track_info['popularity'],
                    release_date,
                    artist_info["genres"]
                ]

                # Add the metadata list to the extended_tracks_metadata dictionary
                extended_tracks_metadata[track_name] = metadata_list
                print(f"{index}. Extended metadata for track '{track_name}': {metadata_list}")
                tracks_added += 1
            else:
                print(f"{index}. No results found for track '{track_name}'")
                tracks_not_found += 1
        except sp.SpotifyException as e:
            if e.http_status == 429:  # Too Many Requests
                print("Rate limit exceeded (429). Waiting before retrying...")
                time.sleep(10)  # Wait for 10 seconds before retrying
            else:
                print(f"Error while fetching track metadata: {e}")

    # Print total elapsed time and final index
    elapsed_time = time.time() - start_time
    print(f"\n---------------------------------------------------------------")
    print(f"Total elapsed time: {elapsed_time:.2f} seconds")
    print(f"Final index: {index}")
    print(f"Songs found: {tracks_added}")
    print(f"Songs not found: {tracks_not_found}")
    return extended_tracks_metadata


 #{trackname: [artistname, acousticness, danceability, energy, instrumentalness, speechiness, liveness, loudness, tempo, valence, popularity, Release Date, genres]}

In [19]:
from google.colab import files

# Function to save tracks metadata as a CSV file
def save_tracks_metadata_to_csv(extended_tracks_metadata, output_file):
    # Create a DataFrame from tracks_metadata dictionary
    df = pd.DataFrame.from_dict(extended_tracks_metadata, orient='index')

    # Reset index and rename columns
    df.reset_index(inplace=True)
    df.columns = ['Track Name', 'Artist Name', 'Acousticness', 'Danceability', 'Energy', 'Instrumentalness',
                  'Speechiness', 'Liveness', 'Loudness', 'Tempo', 'Valence', 'Popularity', 'Release Date', 'Genres']

    # Save DataFrame to CSV file
    df.to_csv(output_file, index=False)

    # Download the CSV file
    files.download(output_file)

In [20]:
# Extract track metadata
tracks_metadata = extract_tracks_metadata(three_users_data)

Processing DataFrame: spotify_dataset_recreated.csv
Columns in DataFrame: Index(['user_id', 'artistname', 'trackname', 'playlistname'], dtype='object')
Both 'trackname' and 'artistname' columns found.
Extracting track: Can't Tell No One, Artist: Terry Malts
Extracting track: Die Life, Artist: The Soft Moon
Extracting track: First It Giveth, Artist: Queens Of The Stone Age
Extracting track: In the Same Room, Artist: Julia Holter
Extracting track: Kerosene, Artist: Crystal Castles
Extracting track: Lacker Boo, Artist: Portico Quartet
Extracting track: Look At These Hoes, Artist: Santigold
Extracting track: Scavenger, Artist: School Of Seven Bells
Extracting track: Solar Flare, Artist: Stumbleine
Extracting track: The Power - 7 Version", Artist: SNAP!
Extracting track: Yr Love, Artist: Holy Other
Extracting track: Drops, Artist: Iamamiwhoami
Extracting track: Goatlord, Artist: Goat
Extracting track: Roulette Dares (The Haunt Of), Artist: The Mars Volta
Extracting track: Rumble, Artist: Li

In [21]:
tracks_metadata

{"Can't Tell No One": 'Terry Malts',
 'Die Life': 'The Soft Moon',
 'First It Giveth': 'Queens Of The Stone Age',
 'In the Same Room': 'Julia Holter',
 'Kerosene': 'Crystal Castles',
 'Lacker Boo': 'Portico Quartet',
 'Look At These Hoes': 'Santigold',
 'Scavenger': 'School Of Seven Bells',
 'Solar Flare': 'Stumbleine',
 'The Power - 7 Version"': 'SNAP!',
 'Yr Love': 'Holy Other',
 'Drops': 'Iamamiwhoami',
 'Goatlord': 'Goat',
 'Roulette Dares (The Haunt Of)': 'The Mars Volta',
 'Rumble': 'Link Wray & His Ray Men',
 'Star of the Age': 'Shearwater',
 'Synthetica': 'Metric',
 'The Perfect Dancer': 'Gallon Drunk',
 'Tightrope - feat. Big Boi': 'Janelle Monáe',
 'Time Without Question - Original Mix': 'Ghosting Season',
 'Unfinished Sympathy - 2006 Digital Remaster': 'Massive Attack',
 'WIXIW': 'Liars',
 'As': 'Stevie Wonder',
 'Cocoon': 'Björk',
 'Digital Love': 'Daft Punk',
 'Everlong': 'Foo Fighters',
 'Heartbeats': 'The Knife',
 'Into My Arms - 2011 - Remaster': 'Nick Cave & The Bad Se

In [22]:
# Read Spotify credentials from URL
spotify_credentials_url = 'https://raw.githubusercontent.com/ariel-hedvat/AdvancedMLDLCourseAssignments/main/Experiments/Data/eitans_spotify_credentials.txt'
spotify_credentials = read_spotify_credentials_from_url(spotify_credentials_url)

# Initialize Spotipy client with the obtained credentials
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=spotify_credentials['CLIENT_ID'],
                                                           client_secret=spotify_credentials['CLIENT_SECRET']))

In [23]:
# Assuming tracks_metadata is already populated with track names and artist names
# Call the function to retrieve metadata for each track
extended_tracks_metadata = get_track_metadata_from_api(tracks_metadata, sp)

1. Extended metadata for track 'Can't Tell No One': ['Terry Malts', 0.000723, 0.436, 0.939, 0.531, 0.0417, 0.0613, -3.372, 105.15, 0.518, 5, '2012-02-21', ['garage pop', 'la indie']]
2. Extended metadata for track 'Die Life': ['The Soft Moon', 0.000303, 0.577, 0.987, 0.948, 0.0789, 0.327, -5.144, 155.0, 0.326, 21, '2012-10-30', ['dark post-punk', 'grave wave', 'nu gaze']]
3. Extended metadata for track 'First It Giveth': ['Queens Of The Stone Age', 0.0211, 0.448, 0.966, 0.0595, 0.101, 0.135, -4.029, 107.293, 0.38, 56, '2002', ['alternative metal', 'alternative rock', 'grunge', 'modern rock', 'palm desert scene', 'rock', 'stoner metal', 'stoner rock']]
4. Extended metadata for track 'In the Same Room': ['Julia Holter', 0.425, 0.46, 0.327, 0.0612, 0.0287, 0.0886, -10.309, 109.253, 0.108, 25, '2012-10-26', ['art pop', 'chamber pop', 'electra', 'experimental pop', 'la indie', 'spectra']]
5. Extended metadata for track 'Kerosene': ['Crystal Castles', 0.0827, 0.519, 0.546, 0, 0.0763, 0.133, 

In [24]:
extended_tracks_metadata

{"Can't Tell No One": ['Terry Malts',
  0.000723,
  0.436,
  0.939,
  0.531,
  0.0417,
  0.0613,
  -3.372,
  105.15,
  0.518,
  5,
  '2012-02-21',
  ['garage pop', 'la indie']],
 'Die Life': ['The Soft Moon',
  0.000303,
  0.577,
  0.987,
  0.948,
  0.0789,
  0.327,
  -5.144,
  155.0,
  0.326,
  21,
  '2012-10-30',
  ['dark post-punk', 'grave wave', 'nu gaze']],
 'First It Giveth': ['Queens Of The Stone Age',
  0.0211,
  0.448,
  0.966,
  0.0595,
  0.101,
  0.135,
  -4.029,
  107.293,
  0.38,
  56,
  '2002',
  ['alternative metal',
   'alternative rock',
   'grunge',
   'modern rock',
   'palm desert scene',
   'rock',
   'stoner metal',
   'stoner rock']],
 'In the Same Room': ['Julia Holter',
  0.425,
  0.46,
  0.327,
  0.0612,
  0.0287,
  0.0886,
  -10.309,
  109.253,
  0.108,
  25,
  '2012-10-26',
  ['art pop',
   'chamber pop',
   'electra',
   'experimental pop',
   'la indie',
   'spectra']],
 'Kerosene': ['Crystal Castles',
  0.0827,
  0.519,
  0.546,
  0,
  0.0763,
  0.133,
  

In [25]:
# Save tracks metadata to CSV
output_file = 'extended_tracks_metadata.csv'
save_tracks_metadata_to_csv(extended_tracks_metadata, output_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>