In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import warnings
from tqdm import tqdm
from datetime import datetime
import json
from sklearn.preprocessing import LabelEncoder
import calendar
import spotipy
# NOTE(review): this silences every warning for the whole session, deprecation
# warnings from pandas/matplotlib/seaborn included.
warnings.filterwarnings("ignore")

# Data Importing & Engineering

In [None]:
# Billboard data "with features", 10000 samples (per the file name).
# The path is kept in one named constant so it is easy to spot and change.
BILLBOARD_FEATURES_CSV = "~/desktop/spotify_top_tracks_analysis/data/billboard_with_features_for_10000_samples.csv"
billboard_weekly_df = pd.read_csv(BILLBOARD_FEATURES_CSV)

In [None]:
# Dimensions (rows, columns) of the frame right after loading.
billboard_weekly_df.shape

In [None]:
# List the column labels. One label is wrong in the CSV: the column that should be
# called "avg_beat_conf" is named "avg_bar_conf.1".
billboard_weekly_df.columns

In [None]:
# The CSV labels the beat-confidence column "avg_bar_conf.1".
# Rebind the frame to a renamed copy so the label is correct from here on.
billboard_weekly_df = billboard_weekly_df.rename(columns={'avg_bar_conf.1': 'avg_beat_conf'})

In [None]:
# Preview the first rows of the frame after the column rename.
billboard_weekly_df.head()

In [None]:
# map lambda example

# results = map(lambda x: x + 1, [1,2,3])
# print(list(results))

In [None]:
# Derive calendar columns from the chart week and drop the leading columns that
# the analysis does not use.

# Parse "WeekID" into real timestamps so calendar parts can be read from it.
billboard_weekly_df['Date'] = pd.to_datetime(billboard_weekly_df['WeekID'])

# Year and month via the vectorised .dt accessor (no per-row Python lambda needed).
billboard_weekly_df['Year'] = billboard_weekly_df['Date'].dt.year
billboard_weekly_df['Month'] = billboard_weekly_df['Date'].dt.month

# (year, month) tuple per row, used later as a grouping key for monthly averages.
# Built from the Timestamp attributes so the tuples hold plain Python ints.
billboard_weekly_df['Year_Month'] = billboard_weekly_df['Date'].apply(lambda x: (x.year, x.month))

# Drop the first six columns (positions 0-5): they are not used in the analysis.
# The new columns above were appended at the end, so they are not affected.
# NOTE(review): the drop is positional, so this cell must run exactly once per
# freshly loaded frame.
billboard_weekly_df = billboard_weekly_df.drop(billboard_weekly_df.columns[:6], axis=1) #axis=1: columns

In [None]:
# Dimensions after adding the date columns and dropping the first six columns.
billboard_weekly_df.shape

In [None]:
# Preview the first rows of the engineered frame.
billboard_weekly_df.head()

# Exploring Variables

## Looking at the columns we are interested in

In [None]:
# Column labels that remain after the feature-engineering step.
billboard_weekly_df.columns

These are the variables we want to look at:

In [None]:
# Names of the columns of interest: 'Week Position', the label range
# 'Instance'..'Weeks on Chart' (inclusive), and everything from 'duration_ms' onwards.
# Label-based selection goes through .loc; the old .ix indexer no longer exists in pandas.
interested_columns_index = list(pd.concat([billboard_weekly_df.loc[:, 'Week Position'],
                                billboard_weekly_df.loc[:, 'Instance':'Weeks on Chart'],
                                billboard_weekly_df.loc[:, 'duration_ms':]],axis=1).columns.values)
interested_columns_index

In [None]:
# Columns that describe() summarises, i.e. the numerical variables.
# NOTE(review): recent pandas releases also summarise datetime columns by default,
# so 'Date' may show up here - confirm against the installed version.
billboard_weekly_df.describe().columns

Compared with the variables we are interested in (selected with *.concat*), the output of *.describe*, which covers only the numerical variables, is missing the following 8 non-numerical variables:
- 'key', 'mode', 'time_signature'
- 'key.1', 'mode.1', 'time_signature.1'
- 'Date', 'Year_Month'

In [None]:
# Three ways of counting the columns of interest that are intended to be equivalent.
# All three counts are displayed together so they can be compared at a glance.
# (.ix no longer exists in pandas: positions go through .iloc, labels through .loc.)

# 1) by position: column 0, columns 4-7 (end-exclusive slice) and column 9 onwards
n_by_position = len(pd.concat([billboard_weekly_df.iloc[:, 0],
                               billboard_weekly_df.iloc[:, 4:8],
                               billboard_weekly_df.iloc[:, 9:]], axis=1).columns)
# 2) by label (label slices include both end points)
n_by_label = len(pd.concat([billboard_weekly_df.loc[:, 'Week Position'],
                            billboard_weekly_df.loc[:, 'Instance':'Weeks on Chart'],
                            billboard_weekly_df.loc[:, 'duration_ms':]], axis=1).columns)
# 3) from the list of names built earlier
n_from_list = len(interested_columns_index)

(n_by_position, n_by_label, n_from_list)

In [None]:
# Number of columns that describe() summarises.
len(billboard_weekly_df.describe().columns)

## Correlation Plots and Tables

In [None]:
# Number of variables in the correlation matrix. Only numerical variables can be
# correlated, so they are selected explicitly: recent pandas releases raise on
# non-numeric columns instead of silently dropping them.
len(billboard_weekly_df.select_dtypes(include="number").corr())

In [None]:
# Names of the numerical columns; used both to compute the correlation matrices and
# to label their axes. select_dtypes keeps the list strictly numeric (describe() on
# recent pandas also reports datetime columns) and avoids computing summary
# statistics just to read the column names.
corr_columns_index = list(billboard_weekly_df.select_dtypes(include="number").columns)

#1 

In [None]:
from matplotlib.colors import ListedColormap
from matplotlib import cm as cm

# Correlation matrix of the numerical variables drawn with matshow.

# 'cool' colormap resampled to 30 discrete levels (or 'jet' or 'hot').
# plt.get_cmap is used because cm.get_cmap has been removed from recent matplotlib.
cmap = plt.get_cmap('cool', 30)

# Correlate exactly the columns that label the axes, so ticks and cells always line up.
numeric_corr = billboard_weekly_df[corr_columns_index].corr()
tick_positions = range(len(corr_columns_index))

f = plt.figure(figsize=(15, 15))
plt.matshow(numeric_corr, fignum=f.number, cmap=cmap) #fignum draws into the figure created above
plt.xticks(tick_positions, corr_columns_index, fontsize=10, rotation=90)
plt.yticks(tick_positions, corr_columns_index, fontsize=10)
#x labels shown on both the bottom and the top (tick marks on the bottom only):
plt.tick_params(axis="x", bottom=True, top=False, labelbottom=True, labeltop=True)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=12) #size of colorbar labels
plt.title('Correlation Matrix', fontsize=20, y=1.23); #y sets the title position vertically

#2

In [None]:
from matplotlib.colors import ListedColormap
from matplotlib import cm as cm

# Correlation matrix of the numerical variables drawn with imshow and four flat
# colour bands.
cmap = ListedColormap(['c', 'b', 'w', 'r'])
#other colors: 'b', 'g', 'r', 'c', 'm', 'y', 'k', 'w'
#cmap=sns.diverging_palette(220, 10, as_cmap=True)

fig = plt.figure(figsize=(15, 15))
ax1 = fig.add_subplot(111)
# Anchor the colour scale to the full correlation range [-1, 1]; the four bands then
# cover [-1,-0.5), [-0.5,0), [0,0.5) and [0.5,1] regardless of the data's own min/max.
cax = ax1.imshow(billboard_weekly_df[corr_columns_index].corr(), interpolation="nearest",
                 cmap=cmap, vmin=-1, vmax=1)
ax1.grid(True)
ax1.set_title('Correlation Matrix', fontsize=20)
labels = corr_columns_index
ax1.set_xticks(np.arange(len(labels))) #display all the x labels
ax1.set_yticks(np.arange(len(labels))) #display all the y labels
ax1.set_xticklabels(labels, fontsize=10, rotation=90)
ax1.set_yticklabels(labels, fontsize=10)
# Colour-bar ticks sit on the band edges and span the whole correlation range
# (the previous 0.75-1 ticks only labelled the very top of the bar).
fig.colorbar(cax, ticks=[-1, -.5, 0, .5, 1])
plt.show()

#3 Colored table

In [None]:
# Correlation matrix of the numerical variables; this is the table that gets styled
# further down.
# (A static alternative is sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns).)
corr = billboard_weekly_df[corr_columns_index].corr()

# Diverging palette between hues 5 and 250, returned as a matplotlib colormap for
# the background gradient of the styled table.
cmap = sns.diverging_palette(5, 250, as_cmap=True) #color: 5, 250

def magnify():
    """Return the CSS rules (selector/props dicts) for the hover-to-enlarge table.

    Headers are rendered small and cells without padding; hovering a header or
    a cell enlarges its font, and a hovered cell may also grow up to 200px wide.
    A fresh list is built on every call.
    """
    header_rule = {"selector": "th", "props": [("font-size", "7pt")]}
    cell_rule = {"selector": "td", "props": [('padding', "0em 0em")]}
    header_hover_rule = {"selector": "th:hover", "props": [("font-size", "12pt")]}
    cell_hover_rule = {
        "selector": "tr:hover td:hover",
        "props": [('max-width', '200px'), ('font-size', '12pt')],
    }
    return [header_rule, cell_rule, header_hover_rule, cell_hover_rule]

# Render the correlation matrix as a colour-graded HTML table whose cells enlarge
# on hover. Values are shown with two decimals through Styler.format, because
# Styler.set_precision is no longer available in current pandas.
(
    corr.style.background_gradient(cmap, axis=1)
    .set_properties(**{'max-width': '80px', 'font-size': '10pt'})
    .set_caption("Hover to magnify")
    .format("{:.2f}")
    .set_table_styles(magnify())
)

# Features EDA: 
'acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'valence', 'tempo'

In [None]:
#acousticness by month (test)
# Mean acousticness for every (year, month) pair, one bar per pair.
avg_acousticness_by_month = billboard_weekly_df.groupby('Year_Month')['acousticness'].mean()

# The very wide canvas is set on this figure only, instead of changing the global
# rcParams default, so it cannot leak into figures drawn by other cells.
fig, ax = plt.subplots(figsize=(150, 60))
avg_acousticness_by_month_plot = sns.barplot(x=avg_acousticness_by_month.index, y=avg_acousticness_by_month.values, ax=ax)
avg_acousticness_by_month_plot.tick_params(axis='x', labelrotation=90)  # vertical labels so they do not overlap
avg_acousticness_by_month_plot.set(xlabel='Month', ylabel='Average Acousticness', title='Average Acousticness by Month, 1958-2019')
plt.show()

In [None]:
#acousticness
# Default figure size for this and the following EDA plots (global setting).
plt.rcParams['figure.figsize'] = [20, 8]

# Mean acousticness per chart year, one bar per year.
avg_acousticness_by_year = billboard_weekly_df.groupby('Year')['acousticness'].mean()

avg_acousticness_by_year_plot = sns.barplot(
    x=avg_acousticness_by_year.index,
    y=avg_acousticness_by_year.values,
)
# Re-apply the year labels vertically so they do not overlap.
avg_acousticness_by_year_plot.set_xticklabels(
    labels=avg_acousticness_by_year.index,
    rotation=90,
)
avg_acousticness_by_year_plot.set(
    xlabel='Year',
    ylabel='Average Acousticness',
    title='Average Acousticness by Year, 1958-2019',
)
plt.show()

In [None]:
#danceability
# Mean danceability per chart year, one bar per year.
avg_danceability_by_year = billboard_weekly_df.groupby('Year')['danceability'].mean()

# Size the figure here rather than relying on a global rcParams value set by an earlier cell.
fig, ax = plt.subplots(figsize=(20, 8))
avg_danceability_by_year_plot = sns.barplot(x=avg_danceability_by_year.index, y=avg_danceability_by_year.values, ax=ax)
avg_danceability_by_year_plot.tick_params(axis='x', labelrotation=90)  # vertical year labels so they do not overlap
avg_danceability_by_year_plot.set(xlabel='Year', ylabel='Average Danceability', title='Average Danceability by Year, 1958-2019')
plt.show()

In [None]:
#energy
# Mean energy per chart year, one bar per year.
avg_energy_by_year = billboard_weekly_df.groupby('Year')['energy'].mean()

# Size the figure here rather than relying on a global rcParams value set by an earlier cell.
fig, ax = plt.subplots(figsize=(20, 8))
avg_energy_by_year_plot = sns.barplot(x=avg_energy_by_year.index, y=avg_energy_by_year.values, ax=ax)
avg_energy_by_year_plot.tick_params(axis='x', labelrotation=90)  # vertical year labels so they do not overlap
avg_energy_by_year_plot.set(xlabel='Year', ylabel='Average Energy', title='Average Energy by Year, 1958-2019')
plt.show()

In [None]:
#instrumentalness
# Mean instrumentalness per chart year, one bar per year.
avg_instrumentalness_by_year = billboard_weekly_df.groupby('Year')['instrumentalness'].mean()

# Size the figure here rather than relying on a global rcParams value set by an earlier cell.
fig, ax = plt.subplots(figsize=(20, 8))
avg_instrumentalness_by_year_plot = sns.barplot(x=avg_instrumentalness_by_year.index, y=avg_instrumentalness_by_year.values, ax=ax)
avg_instrumentalness_by_year_plot.tick_params(axis='x', labelrotation=90)  # vertical year labels so they do not overlap
avg_instrumentalness_by_year_plot.set(xlabel='Year', ylabel='Average Instrumentalness', title='Average Instrumentalness by Year, 1958-2019')
plt.show()

In [None]:
#liveness
# Mean liveness per chart year, one bar per year.
avg_liveness_by_year = billboard_weekly_df.groupby('Year')['liveness'].mean()

# Size the figure here rather than relying on a global rcParams value set by an earlier cell.
fig, ax = plt.subplots(figsize=(20, 8))
avg_liveness_by_year_plot = sns.barplot(x=avg_liveness_by_year.index, y=avg_liveness_by_year.values, ax=ax)
avg_liveness_by_year_plot.tick_params(axis='x', labelrotation=90)  # vertical year labels so they do not overlap
avg_liveness_by_year_plot.set(xlabel='Year', ylabel='Average Liveness', title='Average Liveness by Year, 1958-2019')
plt.show()

In [None]:
#loudness
# Mean loudness per chart year, one bar per year.
avg_loudness_by_year = billboard_weekly_df.groupby('Year')['loudness'].mean()

# Size the figure here rather than relying on a global rcParams value set by an earlier cell.
fig, ax = plt.subplots(figsize=(20, 8))
avg_loudness_by_year_plot = sns.barplot(x=avg_loudness_by_year.index, y=avg_loudness_by_year.values, ax=ax)
avg_loudness_by_year_plot.tick_params(axis='x', labelrotation=90)  # vertical year labels so they do not overlap
avg_loudness_by_year_plot.set(xlabel='Year', ylabel='Average Loudness', title='Average Loudness by Year, 1958-2019')
plt.show()

In [None]:
#speechiness
# Mean speechiness per chart year, one bar per year.
avg_speechiness_by_year = billboard_weekly_df.groupby('Year')['speechiness'].mean()

# Size the figure here rather than relying on a global rcParams value set by an earlier cell.
fig, ax = plt.subplots(figsize=(20, 8))
avg_speechiness_by_year_plot = sns.barplot(x=avg_speechiness_by_year.index, y=avg_speechiness_by_year.values, ax=ax)
avg_speechiness_by_year_plot.tick_params(axis='x', labelrotation=90)  # vertical year labels so they do not overlap
avg_speechiness_by_year_plot.set(xlabel='Year', ylabel='Average Speechiness', title='Average Speechiness by Year, 1958-2019')
plt.show()

In [None]:
#valence
# Mean valence per chart year, one bar per year.
avg_valence_by_year = billboard_weekly_df.groupby('Year')['valence'].mean()

# Size the figure here rather than relying on a global rcParams value set by an earlier cell.
fig, ax = plt.subplots(figsize=(20, 8))
avg_valence_by_year_plot = sns.barplot(x=avg_valence_by_year.index, y=avg_valence_by_year.values, ax=ax)
avg_valence_by_year_plot.tick_params(axis='x', labelrotation=90)  # vertical year labels so they do not overlap
avg_valence_by_year_plot.set(xlabel='Year', ylabel='Average Valence', title='Average Valence by Year, 1958-2019')
plt.show()

In [None]:
#tempo
# Mean tempo per chart year, one bar per year.
avg_tempo_by_year = billboard_weekly_df.groupby('Year')['tempo'].mean()

# Size the figure here rather than relying on a global rcParams value set by an earlier cell.
fig, ax = plt.subplots(figsize=(20, 8))
avg_tempo_by_year_plot = sns.barplot(x=avg_tempo_by_year.index, y=avg_tempo_by_year.values, ax=ax)
avg_tempo_by_year_plot.tick_params(axis='x', labelrotation=90)  # vertical year labels so they do not overlap
avg_tempo_by_year_plot.set(xlabel='Year', ylabel='Average Tempo', title='Average Tempo by Year, 1958-2019')
plt.show()

In [None]:
# Distribution of the "avg_tempo" column within each chart year, one box per year.
sns.boxplot(data=billboard_weekly_df, x="Year", y="avg_tempo")

In [None]:
#duration_ms
# Mean duration_ms per chart year, one bar per year.
avg_duration_by_year = billboard_weekly_df.groupby('Year')['duration_ms'].mean()

# Size the figure here rather than relying on a global rcParams value set by an earlier cell.
fig, ax = plt.subplots(figsize=(20, 8))
avg_duration_by_year_plot = sns.barplot(x=avg_duration_by_year.index, y=avg_duration_by_year.values, ax=ax)
avg_duration_by_year_plot.tick_params(axis='x', labelrotation=90)  # vertical year labels so they do not overlap
avg_duration_by_year_plot.set(xlabel='Year', ylabel='Average Duration', title='Average Duration by Year, 1958-2019')
plt.show()

In [None]:
# Distribution of "duration_ms" within each chart year, one box per year.
sns.boxplot(data=billboard_weekly_df, x="Year", y="duration_ms")

In [None]:
#num_sections
# Mean num_sections per chart year, one bar per year.
avg_num_sections_by_year = billboard_weekly_df.groupby('Year')['num_sections'].mean()

# Size the figure here rather than relying on a global rcParams value set by an earlier cell.
fig, ax = plt.subplots(figsize=(20, 8))
avg_num_sections_by_year_plot = sns.barplot(x=avg_num_sections_by_year.index, y=avg_num_sections_by_year.values, ax=ax)
avg_num_sections_by_year_plot.tick_params(axis='x', labelrotation=90)  # vertical year labels so they do not overlap
avg_num_sections_by_year_plot.set(xlabel='Year', ylabel='Average Number of Sections', title='Average Num of Sections by Year, 1958-2019')
plt.show()

In [None]:
# Correlation coefficient between the number of sections and the track duration
# (Series.corr with its default method).
billboard_weekly_df['num_sections'].corr(billboard_weekly_df['duration_ms'])

In [None]:
# Pairwise correlations between the structural and audio-feature columns.
feature_columns = ['num_sections', 'duration_ms', 'key', 'mode', 'time_signature', 'acousticness',
                   'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness',
                   'speechiness', 'valence', 'tempo']
# Keep only numeric/boolean columns before correlating: the notes earlier in this
# notebook list 'key', 'mode' and 'time_signature' as non-numerical, and recent
# pandas releases raise on such columns instead of silently leaving them out.
billboard_weekly_df[feature_columns].select_dtypes(include=["number", "bool"]).corr()