In [10]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spotipy
import os
from spotipy.oauth2 import SpotifyClientCredentials
from fuzzywuzzy import fuzz
import dotenv

# Let python-dotenv populate the process environment before anything reads it
dotenv.load_dotenv()

# Relative path of the directory holding the raw data files
data_path = '../../raw_data/'

# Spotify API credentials are read from the environment (None when a variable is unset)
client_id = os.environ.get('SPOTIFY_CLIENT_ID')
client_secret = os.environ.get('SPOTIFY_CLIENT_SECRET')

In [11]:
# Read only the first 10,000 rows of the attributes + lyrics dataset
songs_with_attributes_and_lyrics = pd.read_csv(
    f"{data_path}camille/songs_with_attributes_and_lyrics.csv",
    nrows=10000,
)
# songs_with_lyrics_and_timestamps = pd.read_csv(data_path + 'camille/songs_with_lyrics_and_timestamps.csv')
songs_with_attributes_and_lyrics

Unnamed: 0,id,name,album_name,artists,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,lyrics
0,0Prct5TDjAnEgIqbxcldY9,!,UNDEN!ABLE,['HELLYEAH'],0.415,0.6050,7,-11.157,1,0.0575,0.00116,0.838000,0.4710,0.193,100.059,79500.0,"He said he came from Jamaica,\n he owned a cou..."
1,2ASl4wirkeYm3OWZxXKYuq,!!,,Yxngxr1,0.788,0.6480,7,-9.135,0,0.3150,0.90000,0.000000,0.1760,0.287,79.998,114000.0,"Fucked a bitch, now she running with my kids\n..."
2,69lcggVPmOr9cvPx9kLiiN,!!! - Interlude,Where I Belong EP,['Glowie'],0.000,0.0354,7,-20.151,0,0.0000,0.90800,0.000000,0.4790,0.000,0.000,11413.0,"Oh, my God, I'm going crazy\n"
3,4U7dlZjg1s9pjdppqZy0fm,!!De Repente!!,Un Palo Al Agua (20 Grandes Canciones),['Rosendo'],0.657,0.8820,5,-6.340,1,0.0385,0.00740,0.000013,0.0474,0.939,123.588,198173.0,Continuamente se extraña la gente si no puede ...
4,4v1IBp3Y3rpkWmWzIlkYju,!!De Repente!!,Fuera De Lugar,['Rosendo'],0.659,0.8930,5,-8.531,1,0.0411,0.09220,0.000019,0.0534,0.951,123.600,199827.0,Continuamente se extraña la gente si no puede ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1sULdzB6vSo8XtJKNJG67w,7.62,HIStory,['YFN Lucci'],0.769,0.6670,2,-6.562,1,0.5210,0.12800,0.000000,0.1100,0.329,158.059,172449.0,Pipe that shit up TnT\n Dmac on the fuckin' tr...
9996,4xTx0HsUFdGoZzjziLfuG5,7.62,Cycle of Zero,['Divercia'],0.250,0.8890,4,-7.235,0,0.0585,0.00434,0.848000,0.1270,0.159,115.976,292107.0,"(Seven... Sixty-two...)\n ""Escaping reality as..."
9997,62Yj1WX2JuGDKhslyN490y,7.62,"HIStory, Lost Pages",['YFN Lucci'],0.769,0.6670,2,-6.562,1,0.5210,0.12800,0.000000,0.1100,0.329,158.059,172449.0,Pipe that shit up TnT\n Dmac on the fuckin' tr...
9998,7w77k7yCaDOMe25fYxnzit,7.62 God,,Pooh Shiesty,0.819,0.6510,10,-5.859,0,0.2910,0.00861,0.000000,0.3780,0.522,77.013,177110.0,Once again I'm locked in with TP\n We finna ma...


In [12]:
def find_song_album(song_name, artist_name, threshold=70, sp=None):
    """
    Finds the best matching song and album for a given song name and artist name using Spotify's API.

    Up to 50 search hits are scored with fuzzy string similarity: 60% of the
    score comes from the track title and 40% from the track's first listed
    artist. The highest-scoring hit is returned if its score exceeds
    ``threshold``; on ties the earliest hit in the search results wins.

    Parameters:
    - song_name (str): The name of the song to search for.
    - artist_name (str): The name of the artist to search for.
    - threshold (float): Combined score a hit must strictly exceed to count as
      a match. Defaults to 70, the value this function has always used.
    - sp (spotipy.Spotify or None): An existing Spotify client to reuse across
      calls. When omitted, a new client is created from the module-level
      ``client_id`` and ``client_secret`` variables (they are globals, not
      parameters of this function).

    Returns:
    - dict: A dictionary with the keys 'song', 'artist', 'album',
      'release_date' and 'match_score' describing the best match.
    - None: If no match is found with a score above the threshold.
    """

    # Only build (and authenticate) a client when the caller did not supply one,
    # so bulk callers can share a single client instead of creating one per row.
    if sp is None:
        client_credentials_manager = SpotifyClientCredentials(
            client_id=client_id,
            client_secret=client_secret
        )
        sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

    # Construct the search query for Spotify API
    query = f"track:{song_name} artist:{artist_name}"
    # Perform the search with a limit of 50 results
    results = sp.search(q=query, type='track', limit=50)

    best_match = None
    highest_score = 0

    # Iterate over each track in the search results
    for track in results['tracks']['items']:
        # Similarity of the requested title to this track's title
        song_score = fuzz.ratio(song_name.lower(), track['name'].lower())
        # Similarity of the requested artist to this track's first listed artist
        artist_score = fuzz.ratio(artist_name.lower(), track['artists'][0]['name'].lower())

        # Weighted blend: the title counts for more than the artist
        combined_score = (song_score * 0.6) + (artist_score * 0.4)

        # Strict '>' keeps the earliest track when several share the top score
        if combined_score > highest_score:
            highest_score = combined_score
            best_match = track

    # No hits at all, or the best hit is not convincing enough
    if best_match is None or not highest_score > threshold:
        return None

    return {
        'song': best_match['name'],
        'artist': best_match['artists'][0]['name'],
        'album': best_match['album']['name'],
        'release_date': best_match['album']['release_date'],
        'match_score': highest_score
    }

In [20]:
import warnings
from tqdm import tqdm

# List to store indices of rows that were skipped because the lookup raised an error
skipped_indices = []


def _primary_artist(artists):
    """Return the first artist name held in a raw ``artists`` cell.

    The column comes straight from a CSV, so a cell is a string: either the
    text of a Python list (e.g. "['HELLYEAH']") or a bare name (e.g.
    "Yxngxr1"). Already-parsed lists/tuples are accepted as well.

    Raises:
    - ValueError: If the cell is missing (not a string/list) or holds no name.
    """
    import ast  # local import keeps this cell self-contained

    if isinstance(artists, (list, tuple)):
        names = list(artists)
    elif isinstance(artists, str):
        text = artists.strip()
        names = [text]
        if text.startswith('[') and text.endswith(']'):
            try:
                parsed = ast.literal_eval(text)
            except (ValueError, SyntaxError):
                # Not a valid literal (e.g. "[unknown]"): keep it as a plain name
                parsed = None
            if isinstance(parsed, (list, tuple)):
                names = list(parsed)
    else:
        raise ValueError(f"no artist available: {artists!r}")

    names = [str(name).strip() for name in names if str(name).strip()]
    if not names:
        raise ValueError(f"no artist available: {artists!r}")
    return names[0]


# Function to apply to each row
def fill_album_name(row, lookup=None):
    """Return the album name for ``row``, looking it up when it is missing.

    Rows that already have an ``album_name`` are returned untouched. For rows
    without one, the song name and the first artist are passed to the lookup
    function and the album of the returned match is used.

    Parameters:
    - row (pd.Series): A DataFrame row with 'name', 'artists' and 'album_name'.
    - lookup (callable or None): Function called as ``lookup(song, artist)``
      that returns a dict with an 'album' key, or a falsy value when nothing
      matched. Defaults to ``find_song_album``.

    Returns:
    - The existing or looked-up album name; the original (missing) value when
      nothing matched or the lookup failed.

    Side effects:
    - On any error the row's index label is appended to ``skipped_indices``
      and a warning is emitted; the error is not re-raised.
    """
    if not pd.isna(row['album_name']):
        return row['album_name']

    try:
        if lookup is None:
            lookup = find_song_album
        # Pass the whole artist name: indexing the raw cell with [0] would
        # yield only its first character ('[' or a single letter).
        result = lookup(row['name'], _primary_artist(row['artists']))
        if result:
            return result['album']
    except Exception as e:
        warnings.warn(f"Skipping row at index {row.name} due to error: {e}")
        skipped_indices.append(row.name)
    return row['album_name']

# Enable the tqdm-backed `progress_apply` used below
tqdm.pandas()

# Run the album lookup row by row (with a progress bar) and write the column back
songs_with_attributes_and_lyrics['album_name'] = (
    songs_with_attributes_and_lyrics
    .progress_apply(fill_album_name, axis=1)
)

songs_with_attributes_and_lyrics.head()