In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import matplotlib.ticker as ticker
from IPython.display import Audio
from sklearn import datasets, linear_model
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import TSNE

# Notebook-wide presentation settings: seaborn's default plot theme, and wider
# DataFrame previews (up to 50 columns, up to 200 characters per cell).
sns.set()
pd.options.display.max_columns = 50
pd.options.display.max_colwidth = 200

In [2]:
# Billboard Hot 100 weekly chart data, 1958 to 2021.
# Read from a CSV expected in the notebook's working directory.
bb100 = pd.read_csv('Hot Stuff.csv')

In [3]:
# Convert WeekID to datetimes so the chart can be ordered and filtered by date,
# then preview the two earliest and the two latest rows.
bb100['WeekID'] = pd.DatetimeIndex(bb100['WeekID'])
bb100.sort_values(by='WeekID').iloc[[0, 1, -2, -1]]

Unnamed: 0,url,WeekID,Week Position,Song,Performer,SongID,Instance,Previous Week Position,Peak Position,Weeks on Chart
18553,http://www.billboard.com/charts/hot-100/1958-08-02,1958-08-02,63,High School Confidential,Jerry Lee Lewis And His Pumping Piano,High School ConfidentialJerry Lee Lewis And His Pumping Piano,1,,63,1
103337,http://www.billboard.com/charts/hot-100/1958-08-02,1958-08-02,98,Little Serenade,The Ames Brothers,Little SerenadeThe Ames Brothers,1,,98,1
300806,https://www.billboard.com/charts/hot-100/2021-05-29,2021-05-29,61,Almost Maybes,Jordan Davis,Almost MaybesJordan Davis,2,64.0,61,17
152154,https://www.billboard.com/charts/hot-100/2021-05-29,2021-05-29,78,White Teeth,YoungBoy Never Broke Again,White TeethYoungBoy Never Broke Again,1,,78,1


In [4]:
# Partition the chart into one frame per decade (1960s through 2010s), keyed on the
# calendar year of each chart week. WeekID must already be datetime (converted above).
# Weeks before 1960 or after 2019 fall outside every slice.
chart_year = bb100['WeekID'].dt.year
bb60s = bb100[chart_year.between(1960, 1969)]
bb70s = bb100[chart_year.between(1970, 1979)]
bb80s = bb100[chart_year.between(1980, 1989)]
bb90s = bb100[chart_year.between(1990, 1999)]
bb2000s = bb100[chart_year.between(2000, 2009)]
bb2010s = bb100[chart_year.between(2010, 2019)]

In [5]:
#Add Decade Column to Dataframe:
def decade_to_df(df, decade_as_string):
    """Return a copy of ``df`` with a ``Decade`` column added.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to label; it is not modified.
    decade_as_string : str
        Label written to every row of the new ``Decade`` column (e.g. ``'1960s'``).

    Returns
    -------
    pandas.DataFrame
        New frame with all original columns plus ``Decade``.
    """
    decade_column = {'Decade': decade_as_string}
    return df.assign(**decade_column)

In [6]:
# Label every decade slice. decade_to_df returns a new frame, so each name is
# rebound to its labelled copy.
bb60s, bb70s, bb80s, bb90s, bb2000s, bb2010s = (
    decade_to_df(decade_frame, decade_label)
    for decade_frame, decade_label in (
        (bb60s, '1960s'),
        (bb70s, '1970s'),
        (bb80s, '1980s'),
        (bb90s, '1990s'),
        (bb2000s, '2000s'),
        (bb2010s, '2010s'),
    )
)

In [7]:
# Sanity check: the earliest and latest two rows of the 2010s slice should
# carry the new Decade column.
bb2010s.sort_values(by='WeekID').iloc[[0, 1, -2, -1]]

Unnamed: 0,url,WeekID,Week Position,Song,Performer,SongID,Instance,Previous Week Position,Peak Position,Weeks on Chart,Decade
45759,http://www.billboard.com/charts/hot-100/2010-01-02,2010-01-02,84,On Fire,Lil Wayne,On FireLil Wayne,1,62.0,62,2,2010s
242592,http://www.billboard.com/charts/hot-100/2010-01-02,2010-01-02,26,Money To Blow,Birdman Featuring Lil Wayne & Drake,Money To BlowBirdman Featuring Lil Wayne & Drake,1,26.0,26,11,2010s
169774,https://www.billboard.com/charts/hot-100/2019-12-28,2019-12-28,53,Like It's Christmas,Jonas Brothers,Like It's ChristmasJonas Brothers,1,61.0,53,3,2010s
197357,https://www.billboard.com/charts/hot-100/2019-12-28,2019-12-28,35,Woah,Lil Baby,WoahLil Baby,1,28.0,16,6,2010s


In [8]:
# Stack the six labelled decade frames row-wise into one frame (original row labels are kept).
bb_all_decade = pd.concat(
    objs=[bb60s, bb70s, bb80s, bb90s, bb2000s, bb2010s],
    axis=0,
)

In [9]:
# Keep only the chart rows where the song sat at position 1 that week.
# .copy() detaches the result from bb_all_decade.
bb1_w_decade = bb_all_decade.loc[bb_all_decade['Week Position'].eq(1)].copy()

In [10]:
# Preview the two earliest and two latest #1 rows.
bb1_w_decade.sort_values(by='WeekID').iloc[[0, 1, -2, -1]]

Unnamed: 0,url,WeekID,Week Position,Song,Performer,SongID,Instance,Previous Week Position,Peak Position,Weeks on Chart,Decade
226796,http://www.billboard.com/charts/hot-100/1960-01-02,1960-01-02,1,El Paso,Marty Robbins,El PasoMarty Robbins,1,2.0,1,9,1960s
232846,http://www.billboard.com/charts/hot-100/1960-01-09,1960-01-09,1,El Paso,Marty Robbins,El PasoMarty Robbins,1,1.0,1,10,1960s
304046,https://www.billboard.com/charts/hot-100/2019-12-21,2019-12-21,1,All I Want For Christmas Is You,Mariah Carey,All I Want For Christmas Is YouMariah Carey,9,3.0,1,34,2010s
304047,https://www.billboard.com/charts/hot-100/2019-12-28,2019-12-28,1,All I Want For Christmas Is You,Mariah Carey,All I Want For Christmas Is YouMariah Carey,9,1.0,1,35,2010s


In [11]:
# Count missing values in each column of the #1-song frame.
bb1_w_decade.isna().sum()

url                        0
WeekID                     0
Week Position              0
Song                       0
Performer                  0
SongID                     0
Instance                   0
Previous Week Position    36
Peak Position              0
Weeks on Chart             0
Decade                     0
dtype: int64

In [12]:
# Dimensions of the #1-song frame as (rows, columns).
bb1_w_decade.shape

(3131, 11)

In [13]:
# Spotify audio features for charted songs; joined to the chart data on SongID in a later cell.
# Read from an Excel workbook expected in the notebook's working directory.
features = pd.read_excel('Hot 100 Audio Features.xlsx')

In [14]:
# Attach audio features to every #1 chart row. features is de-duplicated on SongID
# first so the left join cannot multiply chart rows. Columns present in both frames
# (Song, Performer) come back with _x / _y suffixes.
decade_features = bb1_w_decade.merge(
    features.drop_duplicates(subset=['SongID']),
    how='left',
    on='SongID',
)
decade_features.head(3)

Unnamed: 0,url,WeekID,Week Position,Song_x,Performer_x,SongID,Instance,Previous Week Position,Peak Position,Weeks on Chart,Decade,Performer_y,Song_y,spotify_genre,spotify_track_id,spotify_track_preview_url,spotify_track_duration_ms,spotify_track_explicit,spotify_track_album,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,spotify_track_popularity
0,http://www.billboard.com/charts/hot-100/1962-12-22,1962-12-22,1,Telstar,The Tornadoes,TelstarThe Tornadoes,1,5.0,1,8,1960s,The Tornadoes,Telstar,['surf music'],5NIb0uP4CO3ckfyCIjjcFx,https://p.scdn.co/mp3-preview/b699d5122d2a509902ece8b17b5e03ebe9827b71?cid=b8d3901151d34489a160e3cf0ab1fa94,198960.0,0.0,Mad Men (Music from the Original TV Series,0.289,0.728,6.0,-6.495,0.0,0.0385,0.000368,0.929,0.381,0.539,143.988,4.0,8.0
1,http://www.billboard.com/charts/hot-100/1962-12-29,1962-12-29,1,Telstar,The Tornadoes,TelstarThe Tornadoes,1,1.0,1,9,1960s,The Tornadoes,Telstar,['surf music'],5NIb0uP4CO3ckfyCIjjcFx,https://p.scdn.co/mp3-preview/b699d5122d2a509902ece8b17b5e03ebe9827b71?cid=b8d3901151d34489a160e3cf0ab1fa94,198960.0,0.0,Mad Men (Music from the Original TV Series,0.289,0.728,6.0,-6.495,0.0,0.0385,0.000368,0.929,0.381,0.539,143.988,4.0,8.0
2,http://www.billboard.com/charts/hot-100/1963-01-05,1963-01-05,1,Telstar,The Tornadoes,TelstarThe Tornadoes,1,1.0,1,10,1960s,The Tornadoes,Telstar,['surf music'],5NIb0uP4CO3ckfyCIjjcFx,https://p.scdn.co/mp3-preview/b699d5122d2a509902ece8b17b5e03ebe9827b71?cid=b8d3901151d34489a160e3cf0ab1fa94,198960.0,0.0,Mad Men (Music from the Original TV Series,0.289,0.728,6.0,-6.495,0.0,0.0385,0.000368,0.929,0.381,0.539,143.988,4.0,8.0


In [15]:
# Dimensions of the merged frame; the row count should equal bb1_w_decade's because the merge is a left join.
decade_features.shape

(3131, 32)

In [16]:
# Missing values per column after the merge (#1 rows with no Spotify data show up here).
decade_features.isna().sum()

url                             0
WeekID                          0
Week Position                   0
Song_x                          0
Performer_x                     0
SongID                          0
Instance                        0
Previous Week Position         36
Peak Position                   0
Weeks on Chart                  0
Decade                          0
Performer_y                     0
Song_y                          0
spotify_genre                 160
spotify_track_id              337
spotify_track_preview_url    1563
spotify_track_duration_ms     337
spotify_track_explicit        337
spotify_track_album           337
danceability                  337
energy                        337
key                           337
loudness                      337
mode                          337
speechiness                   337
acousticness                  337
instrumentalness              337
liveness                      337
valence                       337
tempo         

In [17]:
# Preprocessing: discard every row that is missing any of the nine audio features below,
# then report the remaining row count and the per-column null counts.
no_null_list = [
    "energy", "liveness", "tempo", "valence", "loudness",
    "speechiness", "acousticness", "danceability", "instrumentalness",
]
decade_features = decade_features.dropna(axis=0, how='any', subset=no_null_list)
print(decade_features.shape[0])
decade_features.isna().sum()

2794


url                             0
WeekID                          0
Week Position                   0
Song_x                          0
Performer_x                     0
SongID                          0
Instance                        0
Previous Week Position         33
Peak Position                   0
Weeks on Chart                  0
Decade                          0
Performer_y                     0
Song_y                          0
spotify_genre                  14
spotify_track_id                0
spotify_track_preview_url    1226
spotify_track_duration_ms       0
spotify_track_explicit          0
spotify_track_album             0
danceability                    0
energy                          0
key                             0
loudness                        0
mode                            0
speechiness                     0
acousticness                    0
instrumentalness                0
liveness                        0
valence                         0
tempo         

In [21]:

# Project the nine audio features of every #1 chart row onto three principal
# components and show them as an interactive 3-D scatter, coloured by decade and
# sized by Spotify popularity.

# Audio-feature columns used as PCA input.
keep = ["energy", "liveness", "tempo", "valence", "loudness",
        "speechiness", "acousticness", "danceability", "instrumentalness"]

X_raw = decade_features[keep].values
decade = decade_features.Decade
preview = decade_features.spotify_track_preview_url
popularity = decade_features.spotify_track_popularity

# Rescale every feature to [0, 1]; otherwise columns on a much larger scale
# (e.g. tempo, loudness) would dominate the variance that PCA maximises.
min_max_scaler = MinMaxScaler()
X_scaled = min_max_scaler.fit_transform(X_raw)

# Fixed random_state: depending on the scikit-learn version, svd_solver='auto' may
# select the randomized solver for this shape, which makes the projection vary
# between runs unless the seed is pinned.
pca = PCA(n_components=3, random_state=0)
pca.fit(X_scaled)

# X holds the final 3-component coordinates, one row per row of decade_features.
X = pca.transform(X_scaled)

# decade / popularity / preview are Series that share decade_features' index, so the
# resulting frame keeps that index; the coordinate arrays are matched positionally.
three_D_df = pd.DataFrame({'Decade': decade, 'col_x': X[:, 0],
                           'col_y': X[:, 1], 'col_z': X[:, 2],
                           'popularity': popularity,
                           'link': preview})

three_D = px.scatter_3d(three_D_df, x='col_x', y='col_y', z='col_z',
                        color='Decade', size='popularity',
                        labels={'col_x': 'PC1', 'col_y': 'PC2', 'col_z': 'PC3'},
                        title='#1 Songs by Decade: PCA of Spotify Audio Features',
                        template='plotly_dark')

three_D.show()


In [19]:
#1. What does all this clustering actually mean? 