# ADA 2018 -  “Happiness” Share it through music.

#### Andres Montero, Ariel Alba, Diego Iriarte




In [None]:
% matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import reverse_geocoder as rg
import os.path
import ast
import seaborn as sns

from helpers import *
from datetime import datetime, date, time
from scipy import stats

%load_ext autoreload
%autoreload 2


In [None]:
# Directory layout. Every directory string ends with '/' so that file
# names can be appended to it directly.
DATA_DIR = './data/'
MUSIC_DIR = DATA_DIR + 'fma_metadata' + '/'
PKL_DIR = DATA_DIR + 'pkl' + '/'

# Set CLEAN_PHASE to True to execute the cleaning cells and to force the
# cleaned frames to be saved again.
CLEAN_PHASE = True
# DEBUG switches on the extra diagnostic printouts used in later cells.
DEBUG = True


In [None]:
if CLEAN_PHASE:
    # Locations of the raw metadata CSV files. MUSIC_DIR already ends with
    # a slash, so the file name is simply appended.
    echonest_path = MUSIC_DIR + 'echonest.csv'
    features_path = MUSIC_DIR + 'features.csv'
    genres_path = MUSIC_DIR + 'genres.csv'
    tracks_path = MUSIC_DIR + 'tracks.csv'
    

In [None]:
if CLEAN_PHASE:
    # Load datasets into pandas dataframes

    # --- Echonest ---------------------------------------------------------
    # (column name, dtype) pairs, listed in the same order as the CSV
    # positions selected through `usecols`.
    echonest_schema = [
        ('track_id', int), ('danceability', float), ('energy', float),
        ('valence', float), ('artist_latitude', float),
        ('artist_longitude', float), ('artist_name', str),
        ('artist_discovery', float), ('artist_family', float),
        ('artist_hotness', float), ('song_currency', float),
        ('song_hotness', float),
    ]
    echonest_col_names = [column for column, _ in echonest_schema]
    echonest_dtypes = dict(echonest_schema)
    echonest_usecols = [0, 2, 3, 8, 11, 13, 14, 21, 22, 23, 24, 25]

    echonest_df = pd.read_csv(echonest_path,
                              usecols=echonest_usecols,
                              names=echonest_col_names,
                              header=3,
                              dtype=echonest_dtypes)

    # --- Genres -----------------------------------------------------------
    # The genres file is read with its own header row.
    genres_dtypes = dict([('genre_id', int), ('#tracks', int),
                          ('parent', int), ('top_level', int)])
    genres_df = pd.read_csv(genres_path, dtype=genres_dtypes)

    # --- Tracks -----------------------------------------------------------
    tracks_schema = [
        ('track_id', int), ('album_date_created', str),
        ('album_date_released', str), ('album_id', int),
        ('album_listens', int), ('album_title', str),
        ('artist_id', int), ('artist_latitude', float),
        ('artist_longitude', float), ('artist_name', str),
        ('track_duration', int), ('track_genre_top', str),
        ('track_genres_all', str), ('track_language', str),
        ('track_listens', int), ('track_tags', str),
        ('track_title', str),
    ]
    track_col_names = [column for column, _ in tracks_schema]
    # NOTE(review): tracks_dtypes is built but not passed to read_csv below,
    # so pandas infers the column types of the tracks file on its own.
    tracks_dtypes = dict(tracks_schema)
    tracks_usecols = [0, 2, 3, 6, 8, 11, 21, 22, 24, 26,
                      38, 40, 41, 45, 47, 51, 52]

    tracks_df = pd.read_csv(tracks_path,
                            usecols=tracks_usecols,
                            names=track_col_names,
                            header=2)


In [None]:
# Tracks clean phase
if CLEAN_PHASE:
    # Parse the two album date columns from strings into datetimes
    for date_col in ('album_date_released', 'album_date_created'):
        tracks_df[date_col] = pd.to_datetime(tracks_df[date_col])

    # Evaluate the string-encoded Python literals (lists, per the original
    # notebook comment) stored in the tag and genre columns
    for literal_col in ('track_tags', 'track_genres_all'):
        tracks_df[literal_col] = tracks_df[literal_col].apply(ast.literal_eval)


In [None]:
if CLEAN_PHASE:
    # Split the joined table into one frame per entity (artist, album,
    # track) to take out redundancy.

    # Outer join of the echonest data with the tracks on track_id; columns
    # present in both sources get the _x (echonest) / _y (tracks) suffixes.
    echo_tracks = pd.merge(echonest_df, tracks_df,
                           on='track_id', how='outer')

    print(len(echo_tracks))

    artist_cols = ['artist_id', 'artist_name_x',
                   'artist_latitude_x', 'artist_longitude_x',
                   'artist_discovery', 'artist_family',
                   'artist_hotness', 'artist_latitude_y',
                   'artist_longitude_y', 'artist_name_y']
    album_cols = ['album_id', 'album_date_created',
                  'album_date_released', 'album_title',
                  'album_listens']
    track_cols = ['track_id', 'track_title',
                  'track_duration', 'artist_id',
                  'album_id', 'track_genre_top',
                  'track_genres_all', 'track_language',
                  'track_listens', 'track_tags',
                  'danceability', 'energy',
                  'valence', 'song_currency',
                  'song_hotness']

    # Keep the first row seen for every artist / album and rebuild the index
    artists_df = (echo_tracks[artist_cols].copy()
                  .drop_duplicates('artist_id')
                  .reset_index(drop=True))
    albums_df = (echo_tracks[album_cols].copy()
                 .drop_duplicates('album_id')
                 .reset_index(drop=True))

    # tracks_df is rebound here to the track-level columns of the join
    tracks_df = echo_tracks[track_cols].copy()

    # Clean listen counts through the neg_to_zero helper
    albums_df['album_listens'] = albums_df['album_listens'].apply(
        lambda listens: neg_to_zero(listens))
    tracks_df['track_listens'] = tracks_df['track_listens'].apply(
        lambda listens: neg_to_zero(listens))

    # Artist name different on echonest and FullMusicArchive
    if DEBUG:
        name_comp = not_eq_ign_case(artists_df['artist_name_x'],
                                    artists_df['artist_name_y'])
        artist_name_diff = artists_df[name_comp]

        diff_count = len(artist_name_diff)
        print('# Different artist names: {}\n'.format(diff_count))

        name_sample = artist_name_diff[['artist_name_x',
                                        'artist_name_y']].head(3)
        print('Example:\n{}'.format(name_sample))


In [None]:
# Artist location different on echonest and FullMusicArchive
if DEBUG and CLEAN_PHASE:
    # NOTE: NaN != NaN evaluates to True, so artists whose coordinates are
    # missing in both sources are counted as different too.
    lat_comp = artists_df['artist_latitude_x'] != artists_df['artist_latitude_y']
    long_comp = artists_df['artist_longitude_x'] != artists_df['artist_longitude_y']
    latlong_comp = lat_comp | long_comp
    latlong_diff = artists_df[latlong_comp]

    print('# Different Lat Long values: {}\n' \
              .format(len(latlong_diff)))

    # The sample rows are taken from latlong_diff so that they belong to the
    # same selection as the count printed above.
    print('Example:\n{}'.format(latlong_diff[['artist_latitude_x',
                                              'artist_latitude_y',
                                              'artist_longitude_x',
                                              'artist_longitude_y']].head(3)))

In [None]:
# Artist location clean phase
# The *_y coordinates are the ones that came from the tracks side of the
# echonest/tracks merge.
if CLEAN_PHASE:
    # Get city, state, country from artist longitude and latitude
    cities = []
    states = []
    countries = []

    for latitude, longitude in zip(artists_df['artist_latitude_y'],
                                   artists_df['artist_longitude_y']):
        # A location needs both coordinates; if either one is missing the
        # artist gets NaN for city, state and country.
        if np.isnan(latitude) or np.isnan(longitude):
            city = np.nan
            state = np.nan
            country = np.nan
        else:
            # mode=1 is reverse_geocoder's single-threaded mode according to
            # its README -- TODO confirm for the installed version.
            results = rg.search((latitude, longitude), mode=1)
            nearest = results[0]
            city = nearest['name']
            state = nearest['admin1']
            country = nearest['cc']

        cities.append(city)
        states.append(state)
        countries.append(country)

    # The new Series share artists_df's index so that values are placed row
    # by row. DataFrame.insert raises if the column already exists, so this
    # cell is meant to run once on a freshly built artists_df.
    artists_df.insert(loc=5, column='city',
                      value=pd.Series(cities, index=artists_df.index))
    artists_df.insert(loc=6, column='state',
                      value=pd.Series(states, index=artists_df.index))
    artists_df.insert(loc=7, column='country',
                      value=pd.Series(countries, index=artists_df.index))
    

In [None]:
# Read and write files depending on the existence of their own path
echonest_df_path = '{dir}{file}'.format(dir=PKL_DIR,
                                        file='echonest_df.pkl')
genres_df_path = '{dir}{file}'.format(dir=PKL_DIR,
                                      file='genres_df.pkl')
albums_df_path = '{dir}{file}'.format(dir=PKL_DIR,
                                      file='albums_df.pkl')
artists_df_path = '{dir}{file}'.format(dir=PKL_DIR,
                                       file='artists_df.pkl')
tracks_df_path = '{dir}{file}'.format(dir=PKL_DIR,
                                      file='tracks_df.pkl')

# For every frame: when the clean phase ran, save the freshly built frame;
# otherwise load it from its own pickle if that file exists; otherwise
# report that the data is missing.
# NOTE: read_pickle should only be used on files produced by this notebook.

# echonest_df is used by later cells, so it is cached like the others.
if CLEAN_PHASE:
    echonest_df.to_pickle(echonest_df_path)
elif os.path.exists(echonest_df_path):
    echonest_df = pd.read_pickle(echonest_df_path)
else:
    print('There is no echonest pandas data')

if CLEAN_PHASE:
    genres_df.to_pickle(genres_df_path)
elif os.path.exists(genres_df_path):
    genres_df = pd.read_pickle(genres_df_path)
else:
    print('There is no genres pandas data')

if CLEAN_PHASE:
    albums_df.to_pickle(albums_df_path)
elif os.path.exists(albums_df_path):
    albums_df = pd.read_pickle(albums_df_path)
else:
    print('There is no albums pandas data')

if CLEAN_PHASE:
    artists_df.to_pickle(artists_df_path)
elif os.path.exists(artists_df_path):
    artists_df = pd.read_pickle(artists_df_path)
else:
    print('There is no artists pandas data')

if CLEAN_PHASE:
    tracks_df.to_pickle(tracks_df_path)
elif os.path.exists(tracks_df_path):
    tracks_df = pd.read_pickle(tracks_df_path)
else:
    print('There is no tracks pandas data')
    

In [None]:
# Data from Spotify and LastFM API with updated information
YEARS = [2017, 2018]
dfs = {}

# dfs maps each year to a dict holding that year's 'tracks', 'albums' and
# 'artists' frames, each read from its own pickle inside PKL_DIR.
for year in YEARS:
    tracks_year_path = '{dir}tracks_{year}_df.pkl'.format(dir=PKL_DIR,
                                                          year=year)
    albums_year_path = '{dir}albums_{year}_df.pkl'.format(dir=PKL_DIR,
                                                          year=year)
    artists_year_path = '{dir}artists_{year}_df.pkl'.format(dir=PKL_DIR,
                                                            year=year)

    year_frames = {}
    year_frames['tracks'] = pd.read_pickle(tracks_year_path)
    year_frames['albums'] = pd.read_pickle(albums_year_path)
    year_frames['artists'] = pd.read_pickle(artists_year_path)
    dfs[year] = year_frames


In [None]:
if CLEAN_PHASE:
    # NOTE: the frames stored in dfs are modified in place, so running this
    # cell twice rescales the columns twice.
    for year in YEARS:
        tracks = dfs[year]['tracks']
        albums = dfs[year]['albums']
        artists = dfs[year]['artists']

        # New tracks cleaning
        # Transform milliseconds duration to minutes
        tracks['track_duration'] = tracks['track_duration'] / 60000

        # Normalize song_hotness to a value between 0 and 1
        # (assumes the source scale is 0-100 -- TODO confirm)
        tracks['song_hotness'] = tracks['song_hotness'] / 100

        # TODO: look at genres on tags and add them to track_genres_all
        # for i, row in tracks.iterrows():
        #     tags = row['track_tags']
        #     genres_df['Title'].str.contains()

        # TODO: new albums cleaning
        # albums['album_date_released'] = pd.to_datetime(
        #     albums['album_date_released'])

        # TODO: new artist cleaning
        # Normalize artist_hotness to a value between 0 and 1
        # artists['artist_hotness'] = artists['artist_hotness'] / 100

    # TODO: merge the datasets of all years to have just one dataset for
    # tracks, albums and artists
    
    
    

In [None]:
# Row count, optional per-column missing-value count, then a preview
print('Artists size: {}'.format(len(artists_df)))
if DEBUG:
    artists_nan_counts = artists_df.isna().sum(axis=0)
    print('\nNaN count by column:\n{}'.format(artists_nan_counts))
artists_df.head()


In [None]:
# Row count, optional per-column missing-value count, then a preview
print('Albums size: {}'.format(len(albums_df)))
if DEBUG:
    albums_nan_counts = albums_df.isna().sum(axis=0)
    print('\nNaN count by column:\n{}'.format(albums_nan_counts))
albums_df.head()


In [None]:
# Row count, optional per-column missing-value count, then a preview
print('Echonest size: {}'.format(len(echonest_df)))
if DEBUG:
    echonest_nan_counts = echonest_df.isna().sum(axis=0)
    print('\nNaN count by column:\n{}'.format(echonest_nan_counts))
echonest_df.head()


In [None]:
# Row count, optional per-column missing-value count, then a preview
print('Genres size: {}'.format(len(genres_df)))
if DEBUG:
    genres_nan_counts = genres_df.isna().sum(axis=0)
    print('\nNaN count by column:\n{}'.format(genres_nan_counts))
genres_df.head()


In [None]:
# Row count, optional per-column missing-value count, then a preview
print('Tracks size: {}'.format(len(tracks_df)))
if DEBUG:
    tracks_nan_counts = tracks_df.isna().sum(axis=0)
    print('\nNaN count by column:\n{}'.format(tracks_nan_counts))
tracks_df.head()


#### First we will see the top 10 countries where the music is produced.

In [None]:
# Merge data to have the entire data frame: a right join on track_id keeps
# every row of tracks_df.
tracks_echonest = pd.merge(echonest_df, tracks_df,
                           on='track_id', how='right')
tracks_echonest.head()


In [None]:
# Attach the artist columns (including country) to every track
track_artist = pd.merge(tracks_df, artists_df, on='artist_id')

# Number of tracks per country, largest ten first
country_grouped = track_artist.groupby('country').size()
country_top10 = country_grouped.sort_values(ascending=False).head(10)

country_top10.plot.bar(title="Top 10 countries that produce tracks")
plt.ylabel('Number of Tracks')
plt.grid()


#### Top 10 albums

In [None]:
# First title / listen count recorded for each album id
albums_first = albums_df.groupby('album_id').first()
albums_grouped = albums_first[['album_title', 'album_listens']]

# Ten albums with the highest listen count
albums_top10 = albums_grouped.sort_values(by='album_listens',
                                          ascending=False).head(10)

plt.figure(figsize=(12, 6))
plt.title("Top 10 Albums listened")
plt.grid()
ax = sns.barplot(data=albums_top10,
                 x='album_title', y='album_listens')
plt.xticks(rotation=90)


#### Top 10 Tracks

In [None]:
# First title / listen count recorded for each track id
tracks_grouped = tracks_df.groupby(tracks_df['track_id']) \
                          .first()[['track_title', 'track_listens']]
# Ten tracks with the highest listen count
tracks_top10 = tracks_grouped.sort_values(by='track_listens',
                                          ascending=False).head(10)

# This cell draws only the tracks chart; the albums chart is produced in
# the "Top 10 albums" section.
plt.figure(figsize=(12, 6))
plt.title("Top 10 tracks listened")
plt.grid()
ax = sns.barplot(x='track_title', y= 'track_listens',
                 data=tracks_top10)
plt.xticks(rotation=45)


In [None]:

# Same top-10 tracks data, drawn with the pandas bar-plot accessor
tracks_top10.plot.bar(x='track_title', title='Top 10 tracks listened')

#### Relation between danceability and duration of the song

In [None]:
# Track duration divided by 60 (minutes, assuming track_duration is stored
# in seconds -- TODO confirm).
# Plain column assignment is used instead of DataFrame.insert so that the
# cell can be re-run: insert raises when the column already exists and also
# depends on a hard-coded position. On first creation the column is appended
# as the last column.
tracks_df['track_duration_minutes'] = pd.to_numeric(
    tracks_df['track_duration'] / 60)


In [None]:
# Disabled alternative: bin the durations into classes instead of rounding
# intervals = pd.IntervalIndex.from_arrays([0, 2, 4],
#                                          [2, 4, 1000], closed='left')
# tracks_df['duration_class'] = pd.cut(tracks_df['track_duration_minutes'],
#                                      bins=intervals)

# Round every duration to the nearest whole minute (np.rint, applied to the
# whole column at once)
tracks_df['track_duration_minutes'] = np.rint(
    tracks_df['track_duration_minutes'])

# Scatter plot with a fitted regression line and its 95% confidence band
sns.regplot(data=tracks_df,
            x='danceability', y='track_duration_minutes',
            ci=95, line_kws=dict(color='green'))


In [None]:
# Preview the first five rows, now including track_duration_minutes
tracks_df.head(5)


In [None]:
# Spearman rank correlation between danceability and track duration.
# tracks_df comes from an outer merge, so either column can hold NaN, and
# spearmanr's default NaN handling then returns nan. Only rows where both
# values are present are passed in.
spearman_pairs = tracks_df[['danceability',
                            'track_duration_minutes']].dropna()
spearman_coeff = stats.spearmanr(spearman_pairs['danceability'],
                                 spearman_pairs['track_duration_minutes'])
spearman_coeff


#### Relation between valence and other variables

Relation between danceability and valence

In [None]:
# Scatter plot with a fitted regression line and its 95% confidence band
sns.regplot(data=tracks_df,
            x='danceability', y='valence',
            ci=95, line_kws=dict(color='green'))

Relation between energy and valence

In [None]:
# Scatter plot with a fitted regression line and its 95% confidence band
sns.regplot(data=tracks_df,
            x='energy', y='valence',
            ci=95, line_kws=dict(color='green'))

Relation between track_listens and valence

In [None]:
# Scatter plot with a fitted regression line and its 95% confidence band
sns.regplot(data=tracks_df,
            x='valence', y='track_listens',
            ci=95, line_kws=dict(color='green'))

Mean value of valence per genre

In [None]:
# Mean valence of the tracks of each top-level genre, as a two-column frame
valence_by_genre = tracks_df.groupby('track_genre_top')['valence'].mean()
genre_valence = valence_by_genre.reset_index()
genre_valence.plot.bar(x='track_genre_top', title='Valence per genre')

#### *** End of valence part

In [None]:
# Attach the album columns to every track
track_album = pd.merge(tracks_df, albums_df, on='album_id')

# Genre and release date per track, plus the release year as a third column
genre_year = (
    track_album[['track_genre_top', 'album_date_released']]
    .assign(album_released_year=lambda frame:
            frame['album_date_released'].dt.year)
)
genre_year.head()


In [None]:
# Number of tracks per top-level genre
top_genre = genre_year.groupby('track_genre_top').size()

In [None]:
# Ten genres with the most tracks, largest first
top_genre_sorted = top_genre.sort_values(ascending=False).iloc[:10]
top_genre_sorted.plot.bar(title="Top 10 Genres")
plt.grid()


In [None]:
# Tracks ordered from the most recent album release date to the oldest
track_album.sort_values('album_date_released', ascending=False)

In [None]:
# Tracks ordered from the oldest album creation date to the most recent
track_album.sort_values('album_date_created', ascending=True)

# As we can see in the tweets notebook, there is not enough data in our training data set that relates tweets to Spotify (less than 0.017%). Therefore we conclude that this approach is not feasible.

# Because the previous approach was not feasible, we decided to work with the music data set and relate it to an important event of the last decade, to analyze if/how music played a role in this event.

As we can see in the music dataset, most of it comes from artists in the US, and one of the biggest events of the last decade is the 2016 United States election, in which Donald Trump was elected president.
With this in mind, the new research question is the following:
##### By analyzing the information of the music data set from 2012 to 2016, find the relation that the music may have had with this event: analyze the top genres of that period, the top tracks and, most importantly, the valence and energy of the songs, to try to find out how people in the US felt at that specific time.

##### The next approach will then be to compare this data with information we will obtain through the Spotify API, with music records since the end of 2016, and try to find out how people feel in this period. If possible, we will also provide a prediction of what kind of music (genre, energy, valence) people in the US will listen to in the following years.

The election won by Trump took place in November 2016, so the pre-Trump period covers 5 years of music data.

In [None]:
# Keep the rows whose album was created in 2012-2016 (both years included)
created_year = track_album['album_date_created'].dt.year
pre_trump = track_album[created_year.between(2012, 2016)]

In [None]:
# Album creation dates of the rows kept by the 2012-2016 filter
pre_trump['album_date_created']

In [None]:
# Show the rows kept by the 2012-2016 album-creation-year filter
pre_trump

Top 10 tracks

In [None]:
# Group on pre_trump's own track_id column. Grouping by a Series taken from
# tracks_df would match the keys through the index, and pre_trump's index
# comes from a different (merged) frame.
pre_trump_grouped = pre_trump.groupby('track_id') \
                          .first()[['track_title', 'track_listens']]
# Ten most listened tracks of the 2012-2016 selection
pre_trump_tracks_top10 = pre_trump_grouped.sort_values(by='track_listens',
                                          ascending=False).head(10)


# Left panel: overall top tracks; right panel: top tracks of the selection
fig, axs = plt.subplots(ncols=2,  constrained_layout=True, figsize=(12,5))

f1=sns.barplot(x='track_title', y= 'track_listens',data=tracks_top10, ax=axs[0])
f1.set_xticklabels(f1.get_xticklabels(), rotation=90)
f1.set_title("Top Tracks")

f2=sns.barplot(x='track_title', y= 'track_listens',data=pre_trump_tracks_top10,  ax=axs[1])
f2.set_xticklabels(f2.get_xticklabels(), rotation=90)
f2.set_title("Pre trump Top Tracks")
fig.tight_layout()
plt.show()


In [None]:
# First title / listen count recorded for each album of the selection
pre_trump_albums_first = pre_trump.groupby('album_id').first()
pre_trump_albums_grouped = pre_trump_albums_first[['album_title',
                                                   'album_listens']]
# Ten most listened albums of the 2012-2016 selection
pre_trump_albums_top10 = pre_trump_albums_grouped.sort_values(
    by='album_listens', ascending=False).head(10)

# Left panel: overall top albums; right panel: top albums of the selection
fig, axs = plt.subplots(ncols=2, constrained_layout=True, figsize=(12, 5))

album_panels = [(albums_top10, "Top Albums"),
                (pre_trump_albums_top10, "Pre trump Top Albums")]
for panel_ax, (panel_data, panel_title) in zip(axs, album_panels):
    bar_ax = sns.barplot(x='album_title', y='album_listens',
                         data=panel_data, ax=panel_ax)
    bar_ax.set_xticklabels(bar_ax.get_xticklabels(), rotation=90)
    bar_ax.set_title(panel_title)

# Names kept for the two panels, left then right
f1, f2 = axs[0], axs[1]

fig.tight_layout()
plt.show()
