# Data 

## 1. Data Gathering

We connect to the Spotify Web API (through the `spotipy` client library) to retrieve the dataset for our work.
Let's go through this process step by step.

In [13]:
import os
import subprocess
import sys

# No longer used for installation (see below); kept so that any cell that
# still references `pip` keeps working.
import pip

# spotipy is the only third-party dependency that may be missing from a stock
# scientific-Python install, so install it on demand.
try:
    import spotipy
except ImportError:
    # `pip.main` was removed from pip's public API in pip 10. The supported way
    # to install from code is to run pip as a subprocess of the *current*
    # interpreter, which also guarantees the package lands in the kernel's
    # environment.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'spotipy'])
    import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

import numpy as np
import pandas as pd

In [None]:
# When True (and grab_from_api is True), each genre's raw frame fetched from
# the API is written to a CSV file. Leave it False if the CSVs already exist.
save_csv = False

# When True the data is downloaded from the Spotify API. When False the raw
# frames are loaded from csv_dir_name instead, looking for one file named
# "<genre>.csv" per genre listed in the `genres` cell below.
grab_from_api = False
csv_dir_name = 'csvs' 

# If True, after cleaning, all per-genre frames are concatenated into a single
# frame which is saved to all_clean_path.
save_all_clean = True
all_clean_path = 'dataframeV1.csv'

# If True, each cleaned genre frame is also saved to its own
# "<genre>_cleaned.csv" file (useful for the first analysis part).
save_separate = True

### Genres selection

We decided to focus on classic, jazz, metal and rap, as we believe these genres differ greatly from each other while still sharing some common gray zones.

In [None]:
# Genres to process. Each name selects the playlist list 'urls/<genre>_url.txt'
# (API mode) or the cached file '<genre>.csv' (CSV mode), and is later used as
# the value of the label column.
genres = ['classic', 'jazz', 'metal', 'rap']

Load the Spotify client ID (public key) and client secret (private key), then authenticate.

In [14]:
# Credentials are read from the environment so that secrets never live in the
# notebook (and therefore never end up in version control or in shared copies).
# SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET are the variable names spotipy
# itself looks for.
# NOTE(review): the key pair that used to be embedded here has been exposed and
# should be rotated in the Spotify developer dashboard.
credentials = {
    'public': os.environ.get('SPOTIPY_CLIENT_ID'),
    'private': os.environ.get('SPOTIPY_CLIENT_SECRET'),
}

# Fail early with an actionable message instead of a late authentication error.
missing_keys = [name for name, value in credentials.items() if not value]
if missing_keys:
    raise EnvironmentError(
        "Spotify API credentials are not configured: set the SPOTIPY_CLIENT_ID and "
        "SPOTIPY_CLIENT_SECRET environment variables before running this cell "
        f"(missing: {', '.join(missing_keys)})."
    )

client_credentials_manager = SpotifyClientCredentials(
    client_id=credentials['public'],
    client_secret=credentials['private'],
)
spotyCarlo = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

Interaction with the API

In [None]:
def createFrameFromUrl(url, carlo=None, page_size=100):
    """Download the audio features of every track in a Spotify playlist.

    Parameters
    ----------
    url : str
        Playlist URL (or bare playlist id). Only the last path component,
        without its query string, is used as the playlist id.
    carlo : object, optional
        Client exposing ``playlist_tracks`` and ``audio_features`` (a
        ``spotipy.Spotify`` instance). Defaults to the notebook-level
        ``spotyCarlo`` client, looked up at call time so that re-running the
        authentication cell is picked up without redefining this function.
    page_size : int, optional
        Number of playlist items requested per call. The original code relied
        on the implicit value of 100, which is kept as the default.

    Returns
    -------
    pandas.DataFrame
        One row per track for which audio features were returned; empty if the
        playlist yields no usable track.
    """
    if carlo is None:
        carlo = spotyCarlo

    playlist_id = url.split("/")[-1].split("?")[0]
    offset = 0
    feats = []

    # The API returns playlist items in pages, so keep requesting with a
    # growing offset until a page comes back empty.
    while True:
        # NOTE: playlist_tracks only works for playlists made by users, not for genres.
        items = carlo.playlist_tracks(playlist_id, offset=offset, limit=page_size)["items"]
        if not items:
            break

        # Some playlist entries carry no track object (e.g. removed tracks);
        # skip them instead of crashing on item["track"]["uri"].
        track_uris = [
            item["track"]["uri"]
            for item in items
            if item.get("track") and item["track"].get("uri")
        ]
        if track_uris:
            page_feats = carlo.audio_features(track_uris) or []
            # Tracks without audio features come back as None; drop them so the
            # DataFrame constructor only ever sees feature dictionaries.
            feats.extend(feat for feat in page_feats if feat is not None)

        # Advance by what was actually received so a short page cannot cause
        # items to be skipped.
        offset += len(items)
    return pd.DataFrame(feats)


In [None]:
# Build one raw DataFrame per genre and keep them in a dictionary keyed by
# genre name, so they are easy to clean and concatenate later.
if grab_from_api:
    dF_dict = {}
    for genre in genres:
        frames = []
        with open(os.path.join('urls', genre + '_url.txt'), 'r') as f:
            for line in f:
                uri = line.strip()
                # Blank lines (e.g. a trailing newline) are not playlist URLs.
                if not uri:
                    continue
                try:
                    frames.append(createFrameFromUrl(uri))
                except Exception as err:
                    # Best effort: one bad playlist must not abort the whole
                    # download, but report which one failed and why.
                    print(f"failed on {genre}: {uri} ({err!r})")

        # pd.concat raises an opaque error on an empty list; fail with a
        # message that names the genre instead.
        if not frames:
            raise ValueError(f"no playlist could be retrieved for genre '{genre}'")
        dF_dict[genre] = pd.concat(frames)

        if save_csv:
            # Save where the CSV branch below reads from. The original call
            # passed a tuple to os.path.join, which raises TypeError.
            os.makedirs(csv_dir_name, exist_ok=True)
            dF_dict[genre].to_csv(os.path.join(csv_dir_name, genre + '.csv'))
# The CSVs already exist: load them instead of querying the API.
else:
    dF_dict = {
        genre: pd.read_csv(os.path.join(csv_dir_name, genre + '.csv'), index_col=0)
        for genre in genres
    }
    

## 2. Cleaning

In [None]:
# Clean a raw frame: drop unused columns, duplicated tracks and incomplete
# rows, then optionally add the genre as a label column.
def cleanFrame(dF, col_to_drop = ['track_href', 'analysis_url', 'type'], genre = '') :
    """Return a cleaned copy of ``dF`` together with a small cleaning report.

    Parameters
    ----------
    dF : pandas.DataFrame
        Raw frame; must contain an ``id`` column, used to detect duplicates.
        It is not modified.
    col_to_drop : list of str, optional
        Columns to remove. Columns that are not present are ignored, so the
        function can safely be re-applied to an already cleaned frame.
    genre : str, optional
        If non-empty, a ``label`` column holding this value is added.

    Returns
    -------
    (pandas.DataFrame, dict)
        The cleaned frame and a report with the keys ``duplicate_found``,
        ``NaN_found``, ``number_of_instances`` and ``labeled``.
    """
    n_initial = len(dF)

    # errors='ignore' keeps the function idempotent: re-running the cleaning
    # cell on frames whose columns were already dropped no longer raises.
    # list(...) guards against a tuple being interpreted as a single label.
    fin = dF.drop(columns=list(col_to_drop), errors='ignore')

    fin = fin.drop_duplicates(subset='id', keep='first')
    n_unique = len(fin)

    fin = fin.dropna(how='any')
    n_complete = len(fin)

    labeled = genre != ''
    if labeled:
        # assign returns a new frame, so no view of the input is written to.
        fin = fin.assign(label=genre)

    report = {
        'duplicate_found': n_initial - n_unique,
        'NaN_found': n_unique - n_complete,
        'number_of_instances': n_complete,
        'labeled': labeled,
    }
    return fin, report


In [None]:
# Clean every genre frame, collecting one cleaning report per genre.
report = dict()

for gen in genres:
    cleaned, summary = cleanFrame(dF_dict[gen], genre=gen)
    dF_dict[gen] = cleaned
    report[gen] = summary
    # Optionally keep one cleaned CSV per genre.
    if save_separate:
        cleaned.to_csv(gen + '_cleaned.csv')

# Optionally stack all cleaned genres into a single dataset and save it.
if save_all_clean:
    merged = pd.concat([dF_dict[name] for name in genres])
    merged.to_csv(all_clean_path)

    

In [None]:
# Print the cleaning report of each genre, separated by a ruler line.
for name, summary in report.items():
    print(f"Report of {name}:\n\t {summary}\n" + "-"*100)

In [None]:
# Add each track's Spotify popularity to the cleaned Top-50 dataset.
# os.path.join replaces the Windows-only '.\csvs\...' literal, whose
# backslashes were also invalid string escape sequences.
top50_path = os.path.join(csv_dir_name, 'Top50_clean.csv')
dF = pd.read_csv(top50_path, index_col=0)

def addPop(id):
    """Return the ``popularity`` field of the Spotify track identified by ``id``."""
    return spotyCarlo.track(id)['popularity']

# The file is overwritten in place, so guard the insert: on a second run the
# column already exists and DataFrame.insert would raise.
if 'popularity' not in dF.columns:
    # Column 11 is passed to spotyCarlo.track, so it presumably holds the
    # track ids -- TODO confirm against the CSV layout.
    dF.insert(15, 'popularity', [addPop(track_id) for track_id in dF.iloc[:, 11]])
    dF.to_csv(top50_path)


In [None]:
# Load the V2 dataset and preview its first rows.
# os.path.join replaces the Windows-only '.\csvs\...' literal, whose
# backslashes were also invalid string escape sequences.
dF = pd.read_csv(os.path.join(csv_dir_name, 'dataframeV2.csv'), index_col=0)
dF.head()

In [25]:
# Add the follower count of each track's first listed artist to the V2 dataset.
# os.path.join replaces the Windows-only '.\csvs\...' literal, whose
# backslashes were also invalid string escape sequences.
dF = pd.read_csv(os.path.join(csv_dir_name, 'dataframeV2.csv'), index_col=0)

def addFollwers(id):
    """Return the total follower count of the first artist of track ``id``."""
    ArtId = spotyCarlo.track(id)['artists'][0]['id']
    return spotyCarlo.artist(ArtId)['followers']['total']

# Guard the insert so the cell can be re-run once the column has been saved:
# DataFrame.insert raises if the column already exists.
if 'artist_followers' not in dF.columns:
    # Column 11 is passed to spotyCarlo.track, so it presumably holds the
    # track ids -- TODO confirm against the CSV layout.
    dF.insert(17, 'artist_followers', [addFollwers(track_id) for track_id in dF.iloc[:, 11]])

In [26]:
# Persist the enriched V2 dataset. os.path.join replaces the Windows-only
# '.\csvs\...' literal, whose backslashes were also invalid escape sequences.
dF.to_csv(os.path.join(csv_dir_name, 'dataframeV2.csv'))