# The Show - Genre

## Data to Gather
1. Get top tracks from the Billboard Charts
2. Get genrea for each top tracks using Spptofy's API

## Visulizations
- Number of songs in top 100 by Genre [Stack bar/line graph] per year/decade



## Billdboard Charts
link: https://www.kaggle.com/datasets/dhruvildave/billboard-the-hot-100-songs

In [1]:
import pandas as pd
import numpy as np
import os
import kaggle

### Get data from Kaggle
To download csv file using Python, you will need a [Kaggle API Key](https://www.kaggle.com/docs/api). 

Make sure to save the API key in 'C:\Users\USER_NAME.kaggle' directory.

In [2]:
kaggle.api.authenticate()

In [3]:
kaggle.api.dataset_download_files('dhruvildave/billboard-the-hot-100-songs', 'billboard_data', unzip=True)

### Set up directory

In [49]:
ROOT = os.getcwd()
print(f"Root Directory: {ROOT}")

BILLBOARD_DIR = os.path.join(ROOT, 'billboard_data')
print(f"Bilboard Directory: {BILLBOARD_DIR}")

TRACKS_DIR = os.path.join(ROOT, 'spotify_data')
print(f"Spotify Tracks Directory: {TRACKS_DIR}")

Root Directory: C:\Users\Nicholas\Desktop\Masters - Classes\MSDS455\MSDS455-The_Show
Bilboard Directory: C:\Users\Nicholas\Desktop\Masters - Classes\MSDS455\MSDS455-The_Show\billboard_data
Spotify Tracks Directory: C:\Users\Nicholas\Desktop\Masters - Classes\MSDS455\MSDS455-The_Show\spotify_data


### Read into pandas data frame

In [5]:
billboard_df = pd.read_csv(os.path.join(BILLBOARD_DIR, 'charts.csv'))
print(len(billboard_df))
billboard_df.head()

330087


Unnamed: 0,date,rank,song,artist,last-week,peak-rank,weeks-on-board
0,2021-11-06,1,Easy On Me,Adele,1.0,1,3
1,2021-11-06,2,Stay,The Kid LAROI & Justin Bieber,2.0,1,16
2,2021-11-06,3,Industry Baby,Lil Nas X & Jack Harlow,3.0,1,14
3,2021-11-06,4,Fancy Like,Walker Hayes,4.0,3,19
4,2021-11-06,5,Bad Habits,Ed Sheeran,5.0,2,18


### EDA

In [6]:
billboard_df.describe()

Unnamed: 0,rank,last-week,peak-rank,weeks-on-board
count,330087.0,297775.0,330087.0,330087.0
mean,50.500929,47.591631,40.970629,9.161785
std,28.866094,28.05436,29.347481,7.618264
min,1.0,1.0,1.0,1.0
25%,26.0,23.0,13.0,4.0
50%,51.0,47.0,38.0,7.0
75%,76.0,72.0,65.0,13.0
max,100.0,100.0,100.0,90.0


In [7]:
billboard_df.dtypes

date               object
rank                int64
song               object
artist             object
last-week         float64
peak-rank           int64
weeks-on-board      int64
dtype: object

In [8]:
# Check for Nulls
print(billboard_df.isnull().sum())

date                  0
rank                  0
song                  0
artist                0
last-week         32312
peak-rank             0
weeks-on-board        0
dtype: int64


In [9]:
# Unique tracks
print(f"Number of unique tracks: {len(billboard_df['song'].unique())}")

Number of unique tracks: 24620


#### Clean up date column from dtype Object to Datetime and create new column for "Year" and "Month"

In [10]:
billboard_df['date'] = pd.to_datetime(billboard_df['date'])
billboard_df['year'] = billboard_df['date'].dt.year
billboard_df['month'] = billboard_df['date'].dt.month

billboard_df.head()

Unnamed: 0,date,rank,song,artist,last-week,peak-rank,weeks-on-board,year,month
0,2021-11-06,1,Easy On Me,Adele,1.0,1,3,2021,11
1,2021-11-06,2,Stay,The Kid LAROI & Justin Bieber,2.0,1,16,2021,11
2,2021-11-06,3,Industry Baby,Lil Nas X & Jack Harlow,3.0,1,14,2021,11
3,2021-11-06,4,Fancy Like,Walker Hayes,4.0,3,19,2021,11
4,2021-11-06,5,Bad Habits,Ed Sheeran,5.0,2,18,2021,11


In [11]:
# Look at unique years
sorted(billboard_df['year'].unique())

[1958,
 1959,
 1960,
 1961,
 1962,
 1963,
 1964,
 1965,
 1966,
 1967,
 1968,
 1969,
 1970,
 1971,
 1972,
 1973,
 1974,
 1975,
 1976,
 1977,
 1978,
 1979,
 1980,
 1981,
 1982,
 1983,
 1984,
 1985,
 1986,
 1987,
 1988,
 1989,
 1990,
 1991,
 1992,
 1993,
 1994,
 1995,
 1996,
 1997,
 1998,
 1999,
 2000,
 2001,
 2002,
 2003,
 2004,
 2005,
 2006,
 2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014,
 2015,
 2016,
 2017,
 2018,
 2019,
 2020,
 2021]

In [13]:
# track name list
track_ls = list(billboard_df['song'].unique())
print(track_ls[:5])

# artist name list
artist_ls = list(billboard_df['artist'].unique())
print(artist_ls[:5])

['Easy On Me', 'Stay', 'Industry Baby', 'Fancy Like', 'Bad Habits']
['Adele', 'The Kid LAROI & Justin Bieber', 'Lil Nas X & Jack Harlow', 'Walker Hayes', 'Ed Sheeran']


## Spotify API
- [Spotipy Documentation](https://spotipy.readthedocs.io/en/2.22.1/)

In [15]:
import json
import requests
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import config

In [16]:
client_credentials_manager = SpotifyClientCredentials(client_id=config.client_id, client_secret=config.client_secret)

In [17]:
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [18]:
artist= artist_ls[0]
track= track_ls[0]

request = sp.search(q='artist:' + artist + ' track:' + track, type='track')

In [19]:
track_id = request['tracks']['items'][0]['id']
print(f"Track ID: {track_id}")

Track ID: 0gplL1WMoJ6iYaPgMCL0gX


In [24]:
request['tracks']['items'][0]['album']['album_type']

'single'

In [25]:
# Album name
album_name = request['tracks']['items'][1]['album']['name']
print(f"Album name: {album_name}")

Album name: 30


In [26]:
# Get audio features
sp.audio_features(track_id)

[{'danceability': 0.604,
  'energy': 0.366,
  'key': 5,
  'loudness': -7.519,
  'mode': 1,
  'speechiness': 0.0282,
  'acousticness': 0.578,
  'instrumentalness': 0,
  'liveness': 0.133,
  'valence': 0.13,
  'tempo': 141.981,
  'type': 'audio_features',
  'id': '0gplL1WMoJ6iYaPgMCL0gX',
  'uri': 'spotify:track:0gplL1WMoJ6iYaPgMCL0gX',
  'track_href': 'https://api.spotify.com/v1/tracks/0gplL1WMoJ6iYaPgMCL0gX',
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/0gplL1WMoJ6iYaPgMCL0gX',
  'duration_ms': 224695,
  'time_signature': 4}]

In [45]:
track_id_ls = []
album_name_ls = []
audio_features_ls = []

for artist, track in zip(artist_ls, track_ls):
    try:
        request = sp.search(q='artist:' + artist + ' track:' + track, type='track')

        track_id = request['tracks']['items'][0]['id']

        track_id_ls.append(track_id)

        if request['tracks']['items'][0]['album']['album_type'] != 'single':
            album_name_ls.append(request['tracks']['items'][0]['album']['name'])
        else:
            album_name_ls.append(request['tracks']['items'][1]['album']['name'])

        audio_features_ls.append(sp.audio_features(track_id))
    except IndexError: 
        track_id_ls.append("No Spotify Data")
        album_name_ls.append("No Spotify Data")
        audio_features_ls.append("No Spotify Data")

In [46]:
track_id_ls

['0gplL1WMoJ6iYaPgMCL0gX',
 'No Spotify Data',
 'No Spotify Data',
 '58UKC45GPNTflCN6nwCUeF',
 '3rmo8F54jFF8OgYsqTxm5d',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',

In [47]:
album_name_ls

['30',
 'No Spotify Data',
 'No Spotify Data',
 'Country Stuff The Album',
 '=',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',
 'No Spotify Data',

In [43]:
audio_features_ls

[[{'danceability': 0.604,
   'energy': 0.366,
   'key': 5,
   'loudness': -7.519,
   'mode': 1,
   'speechiness': 0.0282,
   'acousticness': 0.578,
   'instrumentalness': 0,
   'liveness': 0.133,
   'valence': 0.13,
   'tempo': 141.981,
   'type': 'audio_features',
   'id': '0gplL1WMoJ6iYaPgMCL0gX',
   'uri': 'spotify:track:0gplL1WMoJ6iYaPgMCL0gX',
   'track_href': 'https://api.spotify.com/v1/tracks/0gplL1WMoJ6iYaPgMCL0gX',
   'analysis_url': 'https://api.spotify.com/v1/audio-analysis/0gplL1WMoJ6iYaPgMCL0gX',
   'duration_ms': 224695,
   'time_signature': 4}],
 'No Spotify Data',
 'No Spotify Data',
 [{'danceability': 0.647,
   'energy': 0.765,
   'key': 1,
   'loudness': -6.459,
   'mode': 1,
   'speechiness': 0.06,
   'acousticness': 0.111,
   'instrumentalness': 0,
   'liveness': 0.315,
   'valence': 0.855,
   'tempo': 79.994,
   'type': 'audio_features',
   'id': '58UKC45GPNTflCN6nwCUeF',
   'uri': 'spotify:track:58UKC45GPNTflCN6nwCUeF',
   'track_href': 'https://api.spotify.com/v1

## Spotify Data (Take 2)

In [48]:
kaggle.api.dataset_download_files('lehaknarnauli/spotify-datasets', 'spotify_data', unzip=True)

In [54]:
tracks_df = pd.read_csv(os.path.join(TRACKS_DIR, 'tracks.csv'))
print(len(tracks_df))
tracks_df.head()

586672


Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922-02-22,0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922-06-01,0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.434,0.177,1,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.321,0.0946,7,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,0.402,0.158,3,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4


In [55]:
tracks_df2 = tracks_df.rename(columns={"name": "song"})

In [66]:
billboard_df2 = billboard_df.merge(tracks_df2, how='left', on='song')

In [77]:
billboard_df2 = billboard_df2.drop_duplicates(subset=['song', 'artist'], keep=False)

In [78]:
len(billboard_df2['id'])

1716

In [79]:
billboard_df2['id'].notna().sum()

398

## Metacritic Webscrape
URL: https://www.metacritic.com/

**EXAMPLE**

![example1](images/metacritic1.png)

In [None]:
from bs4 import BeautifulSoup

user_agent = {'User-agent': 'Mozilla/5.0'}

url = f"https://www.metacritic.com/music/{album_name}/{artist}"

r = requests.get(search_url, headers = user_agent)
soup = BeautifulSoup(r.text,"html.parser") 

In [None]:
review_data = soup.find_all('div', class_='metascore_w xlarge album positive')
review_data

In [None]:
rating = review_data[0].find_all('span')[0].text

print(f"Album rating from metacritic: {rating}")