In [None]:
import pandas as pd
import time
import json
import numpy as np
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

#### Spotify Authentication

In [None]:
# Read the Spotify API credentials from the JSON file at '../keys.json'.
key_file = '../keys.json'
with open(key_file) as f:
    keys = json.load(f)

# Build a client-credentials manager from the stored id/secret pair and hand
# it to the Spotify client used by the lookup helpers in this notebook.
client_credentials_manager = SpotifyClientCredentials(
    client_id=keys['spotify_client_id'],
    client_secret=keys['spotify_client_secret'],
)
spotify = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [None]:
# Two-letter market codes. fix_ids only passes a row's own market to the
# Spotify search when that market appears in this list.
available_markets = (
    "AD AR AT AU BE BG BO BR CA CH CL CO CR CY CZ "
    "DE DK DO EC EE ES FI FR GB GR GT HK HN HU ID IE IL IS "
    "IT JP LI LT LU LV MC MT MX MY NI NL NO NZ PA PE PH PL "
    "PT PY RO SE SG SK SV TH TR TW US UY VN ZA"
).split()


def fix_ids(row):
    """Recover the full Spotify track ID for a row via the search endpoint.

    Searches Spotify for ``artist + ' ' + title`` and returns the ID of the
    first result whose ID, ignoring its leading character, equals
    ``row['song_id']``. The market given by the last two characters of
    ``row['location']`` is tried first when it is listed in
    ``available_markets``; the 'US' market is the fallback (and is searched
    only once when it is also the row's own market).

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing 'artist',
            'title', 'location' and 'song_id'.

    Returns:
        The matching Spotify track ID, or ``np.nan`` when no searched market
        yields a match or every search fails.
    """
    time.sleep(.01)  # brief pause between rows before calling the API

    markets = []
    location = row['location']
    # A missing location (NaN after a CSV load) cannot be sliced; go straight to US.
    if isinstance(location, str) and location[-2:] in available_markets:
        markets.append(location[-2:])
    if 'US' not in markets:
        markets.append('US')

    for market in markets:
        try:
            query = spotify.search(q=row['artist'] + ' ' + row['title'], market=market, type='track')
            for item in query['tracks']['items']:
                # The comparison skips the first character of the Spotify ID.
                if item['id'][1:] == row['song_id']:
                    return item['id']
        except Exception:
            # Best-effort lookup: a failed request for one row must not abort
            # the whole apply() run, so move on to the next market.
            continue

    return np.nan


def fix_ids_brute(row):
    """Recover a full Spotify track ID by trying every candidate first character.

    Prepends each candidate character (digits, then lowercase, then uppercase
    letters) to ``row['song_id']`` and requests that track from Spotify; the
    first request that succeeds determines the result.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing 'song_id'.

    Returns:
        The 'id' field of the first track Spotify returns, or ``np.nan`` when
        no candidate resolves.
    """
    time.sleep(.5)  # up to 62 requests follow per row, so pause longer than fix_ids does
    for c in '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
        try:
            result = spotify.track(c + row['song_id'])
            return result['id']
        except Exception:
            # A failed lookup is treated as "wrong first character"; try the next.
            # Catching Exception rather than using a bare except keeps
            # KeyboardInterrupt / kernel interrupts working during this long loop.
            continue
    return np.nan

### Generate IDs that need fixing

In [None]:
# Songs export to repair: the input is read from the bad_ids/ folder and the
# repaired file is written one directory above it under the same name.
file_name = 'songs_2019-3-1_11-31-2.txt'
file_path = f'../data/songs/bad_ids/{file_name}'
output_file_path = f'../data/songs/{file_name}'

In [None]:
# Load the songs export and the current ID map, then attach the map's columns
# to every song. The left join keeps all songs; id_fix is null where the map
# has no entry for a song_id.
songs = pd.read_csv(file_path)
id_map = pd.read_csv('../data/id_map.txt')
songs_fix = pd.merge(songs, id_map, how='left', on='song_id')
songs_fix.head()

In [None]:
print(len(songs_fix))

# Keep only the songs whose id_fix is still null after the merge.
id_needed = songs_fix.loc[songs_fix['id_fix'].isna()].copy()
print(len(id_needed))
# One lookup per distinct song_id is enough.
id_needed.drop_duplicates(subset=['song_id'], inplace=True)
print(len(id_needed))

### Use one of two methods to regenerate the Spotify IDs

In [None]:
# Search-based repair. To use the slower brute-force method instead, swap in
# the commented line below.
id_needed['id_fix'] = id_needed.progress_apply(fix_ids, axis=1)
# id_needed['id_fix'] = id_needed.progress_apply(fix_ids_brute, axis=1)

In [None]:
print(len(id_needed))
# Save the lookup results before any unresolved rows are filtered out.
id_needed.to_csv('../data/id_needed.txt', index=False, encoding='utf-8')
# Rows for which the lookup returned no ID.
still_missing = id_needed.loc[id_needed['id_fix'].isna()].copy()
print(len(still_missing))

In [None]:
# From here on keep only the rows that received a repaired ID.
id_needed = id_needed.dropna(subset=['id_fix'])

In [None]:
id_map = pd.read_csv('data/id_map.txt')
print(len(id_map))
id_map = id_map.append(id_needed[['song_id', 'id_fix']], ignore_index=True)
print(len(id_map))

In [None]:
id_map.drop_duplicates(subset=['id_fix'], inplace=True)
id_map.dropna(subset=['id_fix'], inplace=True)
print(len(id_map))


In [None]:
id_map.to_csv(path_or_buf='./data/id_map.txt', index=False, encoding='utf-8')

### Merge fixed IDs from new ID Map into the songs file

In [None]:
# Start again from the songs export on disk and re-read the ID map file.
songs = pd.read_csv(file_path)
id_map = pd.read_csv('../data/id_map.txt')

In [None]:
songs = pd.merge(songs, id_map, how='left', on='song_id')
# Every song must have a non-null id_fix after the merge.
assert songs['id_fix'].notnull().all()
songs.head()

In [None]:
# Replace the original song_id column with the id_fix values.
songs = songs.drop(columns=['song_id']).rename(columns={'id_fix': 'song_id'})
songs.head()

In [None]:
# Write the repaired songs table, without the index column.
songs.to_csv(path_or_buf=output_file_path, index=False)

### Replace Bad IDs

In [None]:
# Second file to repair: the lyrics table, read from bad_ids/ and written one
# level up. Note that output_file_path is reassigned here, replacing the value
# set for the songs file.
input_file_path = '../data/bad_ids/lyrics_with_lang.txt'
output_file_path = '../data/lyrics_with_lang.txt'

bad_ids = pd.read_csv(input_file_path)
id_map = pd.read_csv('../data/id_map.txt')
id_map.shape[0]

In [None]:
# Left join keeps every lyrics row; id_fix is null where the map has no entry.
fixed = pd.merge(bad_ids, id_map, how='left', on='song_id')

In [None]:
# Replace the original song_id column with the id_fix values.
fixed = fixed.drop(columns=['song_id']).rename(columns={'id_fix': 'song_id'})
fixed.head()

In [None]:
print(len(fixed))
# Discard rows that ended up without a song_id after the swap.
fixed = fixed.dropna(subset=['song_id'])
print(len(fixed))

In [None]:
# Write the repaired lyrics table, without the index column.
fixed.to_csv(output_file_path, index=False, encoding='utf-8')

In [None]:
# Another songs export, reduced to one row per song_id, plus the current ID map.
bad_ids = (
    pd.read_csv('../data/songs/bad_ids/songs_2019-3-17_12-31-9.txt')
    .drop_duplicates(subset=['song_id'])
)
id_map = pd.read_csv('../data/id_map.txt')

In [None]:
# Songs from this export for which the ID map has no id_fix.
missing = (
    pd.merge(bad_ids, id_map, how='left', on='song_id')
    .loc[lambda merged: merged['id_fix'].isna()]
    .copy()
)
len(missing)

### Use the brute-force method to find IDs that are still missing

In [None]:
# Work on a copy of the rows that lack id_fix and fill the column with the
# result of the brute-force lookup for each row.
still_missing = missing.loc[missing['id_fix'].isna()].copy()
still_missing['id_fix'] = still_missing.progress_apply(fix_ids_brute, axis=1)
still_missing

In [None]:
# Save the brute-force results (index included) in the working directory.
still_missing.to_csv(path_or_buf='still_missing.csv')

In [None]:
# DataFrame.append was removed in pandas 2.0, so combine the frames with pd.concat.
# Rows the brute-force pass could not resolve (id_fix still NaN) are left out,
# in line with the earlier map update, which drops null id_fix before saving;
# otherwise the saved map would accumulate song_ids that map to nothing.
new_id_map = pd.concat(
    [id_map, still_missing.dropna(subset=['id_fix'])[['song_id', 'id_fix']]],
    ignore_index=True,
)

In [None]:
# Overwrite the ID map file with the extended map, without the index column.
new_id_map.to_csv('../data/id_map.txt', index=False, encoding='utf-8')