In [None]:
import spotipy
from credentials import cid, secret, username, password
import pandas as pd
import pymongo
import time
import tqdm

In [None]:
# Build the Spotify API client, authenticating with the application's
# client id / secret imported from the local `credentials` module.
client_credentials_manager = spotipy.oauth2.SpotifyClientCredentials(
    client_id=cid,
    client_secret=secret,
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)


##### Setting up the database connection with MongoDB

In [None]:
# Connect to the MongoDB server on this machine.
client = pymongo.MongoClient(host="mongodb://localhost/")

In [None]:
# Handles to the `song_data` database and the collections used by this notebook.
db_mongo = client["song_data"]

songs_info = db_mongo["techno_info"]          # track metadata
songs_features = db_mongo["techno_features"]  # audio features
songs_analysis = db_mongo["techno_analysis"]  # audio analysis summaries
analysis_bad_ids = db_mongo["bad_ids"]        # track ids whose analysis download failed

In [None]:
# Get artists by genre
# Each search response is one page of results; the pages are collected in a
# list and turned into a DataFrame once, because DataFrame.append was removed
# in pandas 2.0 and growing a frame row by row is quadratic.
# The resulting frame has one row per page, with the raw page under the
# "artists" column (same layout as before).
search_string = 'genre:"techno"'
# NOTE(review): Spotify limits how deep search offsets may go — confirm that
# offsets up to 1950 are still accepted by the API.
artist_pages = [
    sp.search(search_string, limit=50, offset=offset, type='artist', market="DE")
    for offset in tqdm.tqdm(range(0, 2000, 50))
]
artists = pd.DataFrame(artist_pages)

In [None]:
# Get albums from Artists
# One request per artist found by the genre search. Responses are gathered in
# a list and converted to a DataFrame once (DataFrame.append was removed in
# pandas 2.0). Each row of `albums` is one response page, with the album
# objects under the "items" column, exactly as before.
album_pages = []
# `.get` keeps the cell working when the artist search returned nothing and
# the frame therefore has no "artists" column.
for artist_page in tqdm.tqdm(artists.get("artists", [])):
    for artist in artist_page["items"]:
        album_pages.append(sp.artist_albums(artist["id"], limit=50))
albums = pd.DataFrame(album_pages)

In [None]:
# Get tracks from each album
# One request per album. Responses are gathered in a list and converted to a
# DataFrame once (DataFrame.append was removed in pandas 2.0). Each row of
# `songs` is one response page with the track objects under "items".
track_pages = []
# `.get` keeps the cell working when no albums were collected and the frame
# therefore has no "items" column.
for album_page in tqdm.tqdm(albums.get("items", [])):
    for album in album_page:
        # NOTE(review): the Spotify reference documents a maximum page size of
        # 50 for album tracks — confirm limit=100 is accepted or clamped.
        track_pages.append(sp.album_tracks(album["id"], limit=100))
songs = pd.DataFrame(track_pages)
songs

In [None]:
#Get Song Ids from database
def fetch_stored_ids(collection):
    """Return the ``id`` field of every document in ``collection``.

    Only the ``id`` field is requested from the server; documents that do not
    carry an ``id`` contribute nothing to the result.
    """
    cursor = collection.find({}, {"_id": 0, "id": 1})
    return [document["id"] for document in cursor if "id" in document]


info_ids = fetch_stored_ids(songs_info)
print(len(info_ids))

feature_ids = fetch_stored_ids(songs_features)
print(len(feature_ids))

analysis_ids = fetch_stored_ids(songs_analysis)
print(len(analysis_ids))

bad_ids = fetch_stored_ids(analysis_bad_ids)
print(len(bad_ids))

# Tracks that have metadata stored but no audio features yet.
final_ids_features = list(set(info_ids) - set(feature_ids))
print(len(final_ids_features))
# Tracks that have features but no analysis yet, excluding ids whose analysis
# download already failed once.
final_ids_analysis = list(set(feature_ids) - set(analysis_ids) - set(bad_ids))
print(len(final_ids_analysis))

In [None]:

#Get song infos and write it in a mongoDB collection

# `song_ids` was never defined anywhere in the notebook, so this cell failed
# on a fresh kernel. The ids are taken from the album-track pages collected in
# `songs` (one page per row, track objects under "items").
# NOTE(review): assumes every track object carries an "id" key, as the album
# objects do — confirm against the Spotify response schema.
song_ids = [track["id"] for page in songs.get("items", []) for track in page]

# De-duplicate while keeping the original order. Batching must slice the
# de-duplicated list: slicing the raw list while looping up to
# len(set(song_ids)) silently dropped the tail whenever duplicates existed.
unique_song_ids = list(dict.fromkeys(song_ids))
print(len(unique_song_ids))

track_records = []
# Ids are sent in batches of 50 per request.
for start in tqdm.tqdm(range(0, len(unique_song_ids), 50)):
    batch_ids = unique_song_ids[start:start + 50]
    tracks = sp.tracks(batch_ids, market="DE")["tracks"]
    # Skip missing entries so they do not end up as empty rows/documents.
    track_records.extend(track for track in tracks if track is not None)

info_df = pd.DataFrame(track_records)

# insert_many raises on an empty document list, so only write when needed.
if track_records:
    songs_info.insert_many(info_df.to_dict('records'))



In [None]:
#Get song features and write it in a mongoDB collection

feature_records = []

# Ids are sent in batches of 100 per request.
for start in tqdm.tqdm(range(0, len(final_ids_features), 100)):
    batch_ids = final_ids_features[start:start + 100]
    batch_features = sp.audio_features(batch_ids) or []
    # Drop the None placeholders in the response. Filtering while building the
    # list replaces the old `list.remove` inside a loop over the same list,
    # which skipped every second of two consecutive None entries.
    feature_records.extend(
        features for features in batch_features if features is not None
    )

print("All features data collected")

features_df = pd.DataFrame(feature_records)

# insert_many raises on an empty document list, so only write when needed.
if feature_records:
    songs_features.insert_many(features_df.to_dict('records'))

print("All features data written in database")

In [None]:
# Remove duplicates from Database
# The previous version re-inserted the de-duplicated documents with their
# existing `_id` values without deleting anything first, which fails with
# duplicate-key errors and removes nothing. Instead, the redundant documents
# are deleted by `_id`, keeping the first stored document for each track id.
feature_df = pd.DataFrame(list(songs_features.find()))

# An empty collection yields a frame without an "id" column; nothing to do then.
if "id" in feature_df.columns:
    # Rows repeating an earlier row's track id; rows without an id are left alone.
    repeated = feature_df["id"].notna() & feature_df.duplicated("id")
    duplicate_object_ids = feature_df.loc[repeated, "_id"].tolist()
    if duplicate_object_ids:
        songs_features.delete_many({"_id": {"$in": duplicate_object_ids}})
    feature_df = feature_df.loc[~repeated].reset_index(drop=True)
    print(len(duplicate_object_ids), "duplicate feature documents removed")

In [None]:
#Get song analysis and write it in a mongoDB collection

# Number of analysis documents buffered before each bulk write.
ANALYSIS_BATCH_SIZE = 1000

# Fields of the analysis "track" summary that are not stored.
UNWANTED_ANALYSIS_FIELDS = {
    'analysis_channels', 'analysis_sample_rate', 'code_version',
    'codestring', 'echoprint_version', 'echoprintstring', 'rhythm_version',
    'rhythmstring', 'sample_md5', 'synch_version',
    'synchstring',
}

pending_records = []
for track_id in tqdm.tqdm(final_ids_analysis):
    try:
        track_summary = sp.audio_analysis(track_id)["track"]
        record = {
            key: value
            for key, value in track_summary.items()
            if key not in UNWANTED_ANALYSIS_FIELDS
        }
        record["id"] = track_id
    except Exception as exc:
        # Only failures of the download itself mark an id as bad; Ctrl-C and
        # database errors are no longer swallowed. The error text is stored so
        # temporary failures can be told apart from permanent ones later.
        analysis_bad_ids.insert_one(
            {"id": track_id, "time": time.time(), "error": repr(exc)}
        )
        continue

    pending_records.append(record)
    if len(pending_records) >= ANALYSIS_BATCH_SIZE:
        songs_analysis.insert_many(pending_records)
        pending_records = []

# Write the final partial batch; previously these rows were never stored.
if pending_records:
    songs_analysis.insert_many(pending_records)
    pending_records = []

print("All analysis data collected and written in database")

17908it [2:44:39,  2.65it/s]Max Retries reached
17943it [2:44:59,  2.97it/s]Max Retries reached
17977it [2:45:17,  3.14it/s]Max Retries reached
18085it [2:46:08,  2.80it/s]Max Retries reached
18138it [2:46:36,  2.84it/s]Max Retries reached
18173it [2:46:58,  2.72it/s]Max Retries reached
18212it [2:47:26,  2.31it/s]Max Retries reached
18216it [2:47:33,  1.02it/s]Max Retries reached
18397it [2:48:50,  2.79it/s]Max Retries reached
18554it [2:50:01,  3.10it/s]Max Retries reached
18577it [2:50:16,  2.06it/s]Max Retries reached
18578it [2:50:22,  2.01s/it]Max Retries reached
18647it [2:51:01,  2.60it/s]Max Retries reached
18802it [2:52:05,  2.15it/s]Max Retries reached
18818it [2:52:18,  2.58it/s]Max Retries reached
18873it [2:52:44,  2.65it/s]Max Retries reached
18884it [2:52:53,  2.93it/s]Max Retries reached
18905it [2:53:09,  3.23it/s]Max Retries reached
18928it [2:53:25,  2.04it/s]Max Retries reached
18941it [2:53:40,  1.96it/s]Max Retries reached
18996it [2:54:07,  2.30it/s]Max Retries 