# ADA 2018 -  “Happiness” Share it through music.

#### Andres Montero, Ariel Alba, Diego Iriarte




In [None]:
% matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import reverse_geocoder as rg
import os.path
import ast
import seaborn as sns

from helpers import *
from datetime import datetime, date, time
from scipy import stats

%load_ext autoreload
%autoreload 2


In [None]:
# Constants
# Every directory string ends with '/', so a file name can be appended to it
# directly.
DATA_DIR = './data/'
MUSIC_DIR = DATA_DIR + 'fma_metadata' + '/'
PKL_DIR = DATA_DIR + 'pkl' + '/'

# True if we want to execute the clean phase and to force saving the
# cleaned file
CLEAN_PHASE = True
# True to also print the per-column NaN counts in the summary cells
DEBUG = True


In [None]:
if CLEAN_PHASE:
    # Locations of the raw CSV files. MUSIC_DIR already ends with '/', so
    # the file name is simply appended to it.
    echonest_path = MUSIC_DIR + 'echonest.csv'
    features_path = MUSIC_DIR + 'features.csv'
    genres_path = MUSIC_DIR + 'genres.csv'
    tracks_path = MUSIC_DIR + 'tracks.csv'
    

In [None]:
if CLEAN_PHASE:
    # Load datasets into pandas dataframes

    # Echonest: pick 11 CSV columns by position and give them flat names.
    # Every selected column is a float except the id and the artist name.
    echonest_col_names = [
        'track_id', 'danceability', 'energy',
        'artist_latitude', 'artist_longitude',
        'artist_name', 'artist_discovery',
        'artist_family', 'artist_hotness',
        'song_currency', 'song_hotness',
    ]
    echonest_dtypes = {name: float for name in echonest_col_names}
    echonest_dtypes['track_id'] = int
    echonest_dtypes['artist_name'] = str

    echonest_df = pd.read_csv(
        echonest_path,
        header=3,
        names=echonest_col_names,
        usecols=[0, 2, 3, 11, 13, 14, 21, 22, 23, 24, 25],
        dtype=echonest_dtypes,
    )

    # Genres: the file's own header is used; the four listed columns are
    # read as integers.
    genres_dtypes = dict.fromkeys(
        ['genre_id', '#tracks', 'parent', 'top_level'], int)
    genres_df = pd.read_csv(genres_path, dtype=genres_dtypes)

    # Tracks: pick 14 CSV columns by position and give them flat names.
    track_col_names = [
        'track_id', 'album_date_created',
        'album_date_released', 'album_id',
        'album_listens', 'album_title', 'artist_id',
        'artist_name', 'track_duration', 'track_genre_top',
        'track_language', 'track_listens', 'track_tags',
        'track_title',
    ]

    # NOTE(review): tracks_dtypes is built but not passed to read_csv below,
    # so pandas infers the dtypes of the tracks columns - confirm whether
    # that is intentional.
    tracks_int_columns = ('track_id', 'album_id', 'album_listens',
                          'artist_id', 'track_duration', 'track_listens')
    tracks_dtypes = {name: (int if name in tracks_int_columns else str)
                     for name in track_col_names}

    tracks_df = pd.read_csv(
        tracks_path,
        header=2,
        names=track_col_names,
        usecols=[0, 2, 3, 6, 8, 11, 21, 26, 38, 40, 45, 47, 51, 52],
    )


In [None]:
# Echonest clean phase
if CLEAN_PHASE:
    # Get city, state, country from artist longitude and latitude.
    # Rows missing either coordinate keep NaN in all three new columns.
    has_coords = (echonest_df['artist_latitude'].notna()
                  & echonest_df['artist_longitude'].notna())
    coordinates = list(zip(echonest_df.loc[has_coords, 'artist_latitude'],
                           echonest_df.loc[has_coords, 'artist_longitude']))

    # One batched lookup instead of one rg.search call per row; the call is
    # skipped entirely when there is nothing to geocode.
    results = rg.search(coordinates, mode=1) if coordinates else []

    # (new column name, key read from each geocoder result), in insert order
    geo_fields = [('city', 'name'), ('state', 'admin1'), ('country', 'cc')]
    for offset, (column, key) in enumerate(geo_fields):
        # Start from an all-NaN column aligned on the frame's own index,
        # then fill only the rows that were geocoded.
        values = pd.Series(np.nan, index=echonest_df.index, dtype=object)
        if results:
            values.loc[has_coords] = [place[key] for place in results]
        # Columns land at positions 5, 6 and 7, as before.
        echonest_df.insert(loc=5 + offset, column=column, value=values)
    

In [None]:
# Tracks clean phase
if CLEAN_PHASE:
    # Transforms str to datetime
    for date_column in ('album_date_released', 'album_date_created'):
        tracks_df[date_column] = pd.to_datetime(tracks_df[date_column])
    # Transform str to list: each cell is parsed as a Python literal
    tracks_df['track_tags'] = tracks_df['track_tags'].apply(ast.literal_eval)


In [None]:
# if CLEAN_PHASE:
#     # Normalize dataframes to take out redundancy
#     # Select main columns to create new df
#     artists_df = tracks_df[['artist_id', 'artist_name']].copy()

#     albums_df = tracks_df[['album_id', 'album_date_created', 'album_date_released',
#                            'album_title', 'album_listens']].copy()

#     # Drop duplicates
#     artists_df = artists_df.drop_duplicates('artist_id')
#     albums_df = albums_df.drop_duplicates('album_id')
    
#     # Join echonist data to artist
#     echonest_tracks = echonest_df.merge(tracks_df, left_on='track_id', 
#                                         right_on='track_id')
    
#     print(echonest_tracks.describe())
    

In [None]:
# print('Artists size: {}'.format(len(artists_df)))
# if DEBUG: print('\nNaN count by column:\n{}' \
#                 .format(artists_df.isna().sum(axis=0)))
# artists_df.head()


In [None]:
# albums_df.head()

In [None]:
# Preview the cleaned Echonest frame. At this point echonest_df only exists
# when the clean phase ran (otherwise it is first loaded from the pickle in
# the next cell), so the lookup is guarded; head() avoids dumping every row.
echonest_df.head() if CLEAN_PHASE else None

In [None]:
# Write the cleaned frames to the pickle cache after a clean phase, or read
# them back from the cache when the clean phase was skipped.
echonest_df_path = '{dir}{file}'.format(dir=PKL_DIR,
                                        file='echonest_df.pkl')
genre_df_path = '{dir}{file}'.format(dir=PKL_DIR,
                                     file='genres_df.pkl')
tracks_df_path = '{dir}{file}'.format(dir=PKL_DIR,
                                      file='tracks_df.pkl')

if CLEAN_PHASE:
    # CLEAN_PHASE is documented as forcing a save, so the freshly cleaned
    # frames always overwrite the cache instead of being replaced by
    # whatever pickle happens to exist already.
    os.makedirs(PKL_DIR, exist_ok=True)
    echonest_df.to_pickle(echonest_df_path)
    genres_df.to_pickle(genre_df_path)
    tracks_df.to_pickle(tracks_df_path)
else:
    # Each frame is read from its own path (the genres branch previously
    # checked the Echonest path). Only load pickles written by this
    # notebook: unpickling can execute arbitrary code.
    echonest_df = pd.read_pickle(echonest_df_path)
    genres_df = pd.read_pickle(genre_df_path)
    tracks_df = pd.read_pickle(tracks_df_path)
    

In [None]:
# Row count, optional per-column NaN tally, and the first rows of the
# Echonest frame.
print('Echonest size:', len(echonest_df))
if DEBUG:
    print('\nNaN count by column:', echonest_df.isna().sum(axis=0), sep='\n')
echonest_df.head()


In [None]:
# Row count, optional per-column NaN tally, and the first rows of the
# genres frame.
print('Genres size:', len(genres_df))
if DEBUG:
    print('\nNaN count by column:', genres_df.isna().sum(axis=0), sep='\n')
genres_df.head()


In [None]:
# Row count, optional per-column NaN tally, and the last rows of the
# tracks frame.
print('Tracks size:', len(tracks_df))
if DEBUG:
    print('\nNaN count by column:', tracks_df.isna().sum(axis=0), sep='\n')
tracks_df.tail()


#### First we will see the top 10 countries where the music is produced.

In [None]:
# Merge data to have the entire data frame.
# Right join on track_id: every row of tracks_df is kept, and the Echonest
# columns are NaN where no Echonest row has the same track_id.
tracks_echonest = echonest_df.merge(tracks_df, how='right',
                                    left_on='track_id',
                                    right_on='track_id')
tracks_echonest.head()


In [None]:
# Count the tracks per country and chart the ten largest groups.
tracks_grouped = tracks_echonest.groupby('country').size()
tracks_country_top10 = tracks_grouped.sort_values(ascending=False).iloc[:10]

country_ax = tracks_country_top10.plot(
    kind='bar', title="Top 10 countries that produce tracks")
country_ax.set_ylabel('Number of Tracks')
country_ax.grid()


#### Top 10 albums

In [None]:
# Run every album_listens value through neg_to_zero.
# NOTE(review): neg_to_zero is not defined in this notebook (presumably it
# comes from `helpers` and maps negative counts to 0 - confirm).
tracks_echonest['album_listens'] = \
    tracks_echonest['album_listens'].apply(neg_to_zero)
# tacks_echonest['album_listens']


In [None]:
# Rank albums by their listen count, mirroring the "Top 10 Tracks" chart.
# Grouping on the album_listens value itself only counts how many tracks
# share each number, so the frame is first reduced to one row per album.
# NOTE(review): assumes album_listens is an album-level figure repeated on
# every track row of that album - confirm against the dataset docs.
albums = (tracks_echonest.drop_duplicates('album_id')
                         .set_index('album_title')['album_listens'])
albums_top10 = albums.sort_values(ascending=False).head(10)
albums_top10.plot(kind='bar', title="Top 10 Albums")
plt.ylabel('Album listens')
plt.grid()


#### Top 10 Tracks

In [None]:
# Ten rows with the highest track_listens, drawn as a bar chart on an
# explicitly created 15x8 figure.
tracks_by_listens = tracks_echonest.sort_values(by='track_listens',
                                                ascending=False)
tracks_top10 = tracks_by_listens.iloc[:10]

fig, ax = plt.subplots(figsize=(15, 8))
ax.set_title("Top 10 tracks listened")
ax.grid()
ax = sns.barplot(x='track_title', y='track_listens',
                 data=tracks_top10, ax=ax)


#### Relation between danceability and duration of the song

In [None]:
def cat_duration(quantity):
    """Bucket a duration into one of three categories.

    Args:
        quantity: Duration to categorise. The thresholds are 2.0 and 4.0
            (presumably minutes, to match 'track_duration_minutes'; the
            unit is not checked here).
    Returns:
        1.0 if quantity <= 2.0, 2.0 if 2.0 < quantity <= 4.0, 3.0 if
        quantity > 4.0, and NaN if quantity is NaN.
    """
    # NaN compares False against everything and would otherwise fall
    # through to the last bucket; propagate it instead.
    if quantity != quantity:
        return float('nan')
    if quantity <= 2.0:
        return 1.0
    # Values <= 2.0 are already handled, so only the upper bound is needed.
    # The previous `quantity > 2.0 & quantity <= 4.0` bound `&` tighter than
    # the comparisons and raised TypeError for any value above 2.0.
    if quantity <= 4.0:
        return 2.0
    return 3.0


In [None]:
# Add track_duration / 60 as a new column at position 23
# (presumably seconds -> minutes; the unit of track_duration is not stated
# in this notebook).
duration_minutes = tracks_echonest['track_duration'] / 60
tracks_echonest.insert(23, 'track_duration_minutes', duration_minutes)


In [None]:
# Round the duration to whole minutes, then plot it against danceability
# with a fitted regression line and a 95% confidence band.
# The numeric conversion is now kept (its result used to be discarded) and
# the rounding is applied to the whole column at once instead of per element.
tracks_echonest['track_duration_minutes'] = np.rint(
    pd.to_numeric(tracks_echonest['track_duration_minutes']))
sns.regplot(x='danceability', y='track_duration_minutes',
            data=tracks_echonest, ci=95,
            line_kws={'color': 'green'})


In [None]:
# First five rows of the merged frame, including the new minutes column.
tracks_echonest.head(n=5)


In [None]:
# Spearman rank correlation between danceability and the rounded duration.
# Rows with a missing value in either column are dropped first: the right
# join leaves danceability as NaN for tracks without an Echonest match, and
# with the default NaN policy that makes spearmanr return NaN.
duration_pairs = tracks_echonest[['danceability',
                                  'track_duration_minutes']].dropna()
spearman_coeff = stats.spearmanr(duration_pairs['danceability'],
                                 duration_pairs['track_duration_minutes'])
spearman_coeff


In [None]:
# Genre and release date per track, plus the release year extracted from the
# date as a third column.
genre_year = (
    tracks_echonest[['track_genre_top', 'album_date_released']]
    .assign(album_released_year=lambda frame:
            frame['album_date_released'].dt.year)
)
genre_year.head()


In [None]:
# Number of rows per top-level genre.
top_genre = genre_year.groupby('track_genre_top').size()

In [None]:
# Bar chart of the ten genres with the most rows.
top_genre_sorted = top_genre.sort_values(ascending=False).iloc[:10]
genre_ax = top_genre_sorted.plot(kind='bar', title="Top 10 Genres")
genre_ax.grid()
