In [209]:
import pandas as pd
import numpy as np

import urllib.request
from bs4 import BeautifulSoup

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

* Music....

But there's also something deeper and less obvious to this topic. Music and songs are something we perceive relatively consciously. It is just a tip of the iceberg of wider soundscape we live in. Companies and marketers devote big amounts of their attention to visual representations of their products. But this space becomes more and more cluttered. Better understanding of audio space can help setting the new directions in marketing. 

Machine learning analysis of music could be even more useful when combined with thoughtful experiments on sound pereception using psychophysiological monitoring techinques like EEG, EKG or GSR.

## Define global params

In [3]:
# Define headers
hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}

In [4]:
# Spotify IDs / keys
client_id='fa4bd213f2b248af8fd1871cceeaafc3'
client_secret='2e5fc08ff47e441e8353304194075de2'

## Create dataframe

In [223]:
# Create an empty dataframe
data = pd.DataFrame(columns=['artist', 'title', 'key', 'mode', 'tempo', 'time_sign',\
                           'speechiness', 'valence', 'liveness',\
                           'danceability', 'loudness', 'duration', 'year'])

In [224]:
data

Unnamed: 0,artist,title,key,mode,tempo,time_sign,speechiness,valence,liveness,danceability,loudness,duration,year


## Define functions

In [101]:
def generate_url(year):
    url = f'https://www.billboard.com/archive/charts/{year}/hot-100'
    return url

In [89]:
def scrape_billboard(url, hdr):
    
    yearly_top = []
    
    request = urllib.request.Request(url, headers=hdr)
    with urllib.request.urlopen(request) as req:
        page = req.read()
    
    billboard_soup = BeautifulSoup(page, 'lxml')            # Prepare some soup :)
     
    table = billboard_soup.select("table.archive-table")    # Select the right table
    
    for table in billboard_soup.find_all('table'):          # Get info 
        for row in table.findAll('tr'):
            if len(row) == 7:
                tds = row.find_all('td')
                if len(tds) > 0:
                    yearly_top.append((f'{tds[2].string} {tds[1].string}'))
    return yearly_top

In [158]:
def edit_title(song_title):
    new_split = []
    if ('feat' in song_title.lower()) or ('&' in song_title) or ('"' in song_title) or\
    ('(' in song_title):
        split_title = song_title.lower().split(' ')
        for word in split_title:
            if ('feat' not in word) and ('&' not in word) and ('"' not in word) and\
            ('(' not in word) and (')' not in word):
                new_split.append(word)
        song_title = ' '.join(new_split)
    return song_title.lower()

In [155]:
def get_metadata(song_title, client_id, client_secret):
    """
    Tries finding metadata through Spotify
    """

    client_credentials_manager = SpotifyClientCredentials(client_id, client_secret)

    spotify = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
    results = spotify.search(song_title, limit=1)
    
    try:
        song_id = results['tracks']['items'][0]['id']
        results = results['tracks']['items'][0]  # Find top result
        album = results['album']['name']  # Parse json dictionary
        artist = results['album']['artists'][0]['name']
        song_title = results['name']
    except IndexError:
        print(f'Song "{song_title}" not found - try manual search')
        return 0, 0, 0

    return artist, song_title, song_id

In [113]:
def get_feats(song_id, client_id, client_secret):

    client_credentials_manager = SpotifyClientCredentials(client_id, client_secret)

    spotify = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
    results = spotify.audio_features(song_id)

#     song_id = results['tracks']['items'][0]['id']
#     results = results['tracks']['items'][0]  # Find top result
#     album = results['album']['name']  # Parse json dictionary
#     artist = results['album']['artists'][0]['name']
#     song_title = results['name']

    return results

In [114]:
get_feats('0AwC0oZxr25SOcThearmN4', client_id=client_id, client_secret=client_secret)

[{'acousticness': 0.0135,
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/0AwC0oZxr25SOcThearmN4',
  'danceability': 0.927,
  'duration_ms': 300383,
  'energy': 0.705,
  'id': '0AwC0oZxr25SOcThearmN4',
  'instrumentalness': 0,
  'key': 7,
  'liveness': 0.0755,
  'loudness': -6.27,
  'mode': 1,
  'speechiness': 0.0577,
  'tempo': 99.024,
  'time_signature': 4,
  'track_href': 'https://api.spotify.com/v1/tracks/0AwC0oZxr25SOcThearmN4',
  'type': 'audio_features',
  'uri': 'spotify:track:0AwC0oZxr25SOcThearmN4',
  'valence': 0.749}]

In [103]:
get_metadata('legend', client_id=client_id, client_secret=client_secret)

('Drake', 'Legend', '1ID1QFSNNxi0hiZCNcwjUC')

In [176]:
def decode_key(key):
    keys = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
    return keys[key]

def decode_mode(mode):
    modes = ['minor', 'major']
    return modes[mode]

In [105]:
class Song:
    
    def __init__(self, artist, title, year, sptfy_id, audio_feats):
        self.artist = artist
        self.title = title
        self.year = year
        self.sptfy_id = sptfy_id
        self.audio_feats = audio_feats

In [151]:
def get_entries(years):
    song_info = []
    for year in years:
        url = generate_url(year)
        songs = scrape_billboard(url, hdr)
        to_search = []
        for song in songs:
            to_search.append(edit_title(song))
        for search_phrase in to_search:
            artist, title, sptfy_id =\
            get_metadata(search_phrase, client_id=client_id, client_secret=client_secret)
            if sptfy_id == 0:
                pass
            else:
                feats = get_feats(sptfy_id, client_id=client_id, client_secret=client_secret)
                song_info.append(Song(artist, title, year, sptfy_id, feats))
            
    return song_info

In [154]:
entries_80_89 = get_entries(range(1980,1990))

Song paul mccartney and wings coming up (live at glasgow) not found - try manual search
Song kenny rogers duet with dolly parton islands in the stream not found - try manual search
Song peter cetera glory of love (theme from "the karate kid part ii") not found - try manual search
Song bob seger shakedown (from "beverly hills cop ii") not found - try manual search
Song michael jackson with siedah garrett i just can't stop loving you not found - try manual search
Song bobby mcferrin don't worry, be happy (from "cocktail") not found - try manual search
retrying ...1secs
Song michael damian rock on (from "dream a little dream") not found - try manual search


In [159]:
entries_90_99 = get_entries(range(1990,2000))

Song "paula abdul with the wild opposites attract" not found - try manual search
Song "2pac k-ci and jojo how do u want it/california love" not found - try manual search
Song "toni braxton you're makin' me high/let it flow" not found - try manual search


In [160]:
entries_00_09 = get_entries(range(2000,2010))

In [161]:
entries_10_18 = get_entries(range(2010,2019))

Song "far*east movement cataracs dev like a g6" not found - try manual search
Song "far*east movement cataracs dev like a g6" not found - try manual search
retrying ...1secs


In [163]:
get_metadata('like a g6 far east', client_id=client_id, client_secret=client_secret)

('Far East Movement', 'Like A G6', '4DvhkX2ic4zWkQeWMwQ2qf')

In [227]:
def insert_entries(entries, dataframe):
    id_last = data.shape[0]
    for i, entry in enumerate(entries):
        dataframe.loc[id_last + i] = [entry.artist, entry.title, decode_key(entry.audio_feats[0]['key']),\
             decode_mode(entry.audio_feats[0]['mode']), entry.audio_feats[0]['tempo'],\
             entry.audio_feats[0]['time_signature'], entry.audio_feats[0]['speechiness'],\
             entry.audio_feats[0]['valence'], entry.audio_feats[0]['danceability'], \
             entry.audio_feats[0]['danceability'], entry.audio_feats[0]['loudness'], \
             entry.audio_feats[0]['duration_ms'], entry.year]

In [228]:
all_entries = [entries_80_89, entries_90_99, entries_00_09, entries_10_18]

In [229]:
for i in all_entries:
    insert_entries(i, data)

In [233]:
data.to_csv(r'C:\Users\Ol\Documents\DATA ANALYSIS\DATA_INCUBATOR\music_data.csv')