# Lab | Web Scraping Single Page (GNOD part 1)

In [9]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from pandas import json_normalize

In [2]:
url = "https://www.popvortex.com/music/charts/top-100-songs.php"

In [3]:
response = requests.get(url)

In [4]:
response.status_code

200

In [5]:
soup = BeautifulSoup(response.content, "html.parser")

We try to find the correct path in the web.

In [6]:
# body > div.container > div:nth-child(4) > div.col-xs-12.col-md-8 > div.chart-wrapper

In [7]:
# soup.select('body > div.container > div:nth-child(4) > div.col-xs-12.col-md-8 > div.chart-wrapper')

In [8]:
#chart-position-1 > div.chart-content.col-xs-12.col-sm-8 > p > cite

In [9]:
soup.select('#chart-position-1 > div.chart-content.col-xs-12.col-sm-8 > p > cite')

[<cite class="title">Margaritaville</cite>]

We try with '.title' and it works (thanks to Lilit). It seems like only 'cite' tags have the attribute 'title'.

In [10]:
songs = soup.select('.title')
artists = soup.select('.artist')

In [11]:
# songs

In [12]:
# artists

We get the text from each tag and store them in lists that we then transform into a dataframe.

In [13]:
titles = []
for cite in songs:
    titles.append(cite.get_text())

In [14]:
singers = []
for cite in artists:
    singers.append(cite.get_text())

In [15]:
top100_20230905 = pd.DataFrame({"title":titles,
                       "artist":singers,
                      })

In [16]:
top100_20230905

Unnamed: 0,title,artist
0,Margaritaville,Jimmy Buffett
1,Come Monday,Jimmy Buffett
2,Rich Men North of Richmond,Oliver Anthony Music
3,All Star,Smash Mouth
4,Cheeseburger In Paradise,Jimmy Buffett
...,...,...
95,Thought You Should Know,Morgan Wallen
96,Dial Drunk,Noah Kahan & Post Malone
97,bad idea right?,Olivia Rodrigo
98,august,Taylor Swift


In [17]:
top100_20230905.to_csv('top100_popvortex_20230905.csv')

# Lab | Web Scraping Multiple Pages

Expand the project
If you're done, you can try to expand the project on your own. Here are a few suggestions:
- Find other lists of hot songs on the internet and scrape them too: having a bigger pool of songs will be awesome!
- Apply the same logic to other "groups" of songs: the best songs from a decade or from a country / culture / language / genre.

In [18]:
url = "https://en.wikipedia.org/wiki/List_of_songs_recorded_by_Frank_Sinatra"

In [19]:
response = requests.get(url)

In [20]:
response.status_code

200

In [21]:
soup = BeautifulSoup(response.content, "html.parser")

First, we see the data is inside the second table and we find the path inside the table.

In [22]:
table = soup.select("table")[1]

In [23]:
# table.select('tbody > tr > td > a')

We loop through the list to obtain the names and then store the new list in a dataframe.

In [24]:
songs_sinatra = []
for e in table.select('tbody > tr > td > a'):
    song = e.get_text()
    if song is not None:
        songs_sinatra.append(e.get_text())

In [25]:
len(songs_sinatra)

2356

In [26]:
songs_sinatra = pd.DataFrame({'Sinatra_song': songs_sinatra})

In [27]:
songs_sinatra

Unnamed: 0,Sinatra_song
0,Ac-cent-tchu-ate the Positive
1,Harold Arlen
2,Johnny Mercer
3,Johnny Burke
4,Jimmy Van Heusen
...,...
2351,Ludwig Herzer
2352,Franz Lehár
2353,Fritz Löhner-Beda
2354,Zing! Went the Strings of My Heart


# GNOD Process Step 2.

In [28]:
hot = pd.read_csv('top100_popvortex_20230905.csv')

In [29]:
hot.dtypes

Unnamed: 0     int64
title         object
artist        object
dtype: object

We ask the user to input a song title.

In [30]:
input_song = input('Enter a song title: ')

In [31]:
input_song

'Margaritaville'

In [32]:
type(input_song)

str

In [33]:
hot['title'][0]

'Margaritaville'

If the title matches a song in our list, we suggest another song from our list.

In [39]:
from random import randint

if input_song in hot['title'].values:
    ran = randint(0,len(hot))
    print('You could try: ' + hot['title'][ran] + ' by ' + hot['artist'][ran])
else:
    print('No recommendation at this time')

You could try: White Horse by Chris Stapleton


# Lab | API wrappers - Create your collection of songs & audio features.

#### Authentication.

In [1]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [2]:
secrets_file = open("Spotify.txt","r")

In [3]:
string = secrets_file.read()

In [4]:
secrets_dict={}
for line in string.split('\n'):
    if len(line) > 0:
        print(line.split(':'))
        secrets_dict[line.split(':')[0]]=line.split(':')[1].strip()

['clientid', '29e4b27d081e4557a46599dcfeacd968']
['clientsecret', '543d55c868cb4f7b879ee416d69ec060']


In [5]:
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=secrets_dict['clientid'],
                                                           client_secret=secrets_dict['clientsecret']))

#### Getting tracks. Spotify playlist chosen: The Longest Playlist Ever (id: 6yPiKpy7evrwvZodByKvM9).

In [6]:
from random import randint
from time import sleep

def get_playlist_tracks(playlist_id):
    results = sp.user_playlist_tracks("spotify",playlist_id)
    tracks = results['items']
    while results['next']!=None:
        results = sp.next(results)
        tracks = tracks + results['items']
        sleep(randint(1,2500)/1000)
    return tracks

In [7]:
all_tracks = get_playlist_tracks("6yPiKpy7evrwvZodByKvM9")
len(all_tracks)

10000

In [111]:
# all_tracks

In [10]:
tracks = json_normalize(all_tracks)

In [110]:
tracks.head(3)

Unnamed: 0,added_at,is_local,primary_color,added_by.external_urls.spotify,added_by.href,added_by.id,added_by.type,added_by.uri,track.album.album_type,track.album.artists,...,track.name,track.popularity,track.preview_url,track.track,track.track_number,track.type,track.uri,video_thumbnail.url,artist_dict,artists_dfs
0,2017-02-27T01:38:09Z,False,,https://open.spotify.com/user/12160726861,https://api.spotify.com/v1/users/12160726861,12160726861,user,spotify:user:12160726861,album,[{'external_urls': {'spotify': 'https://open.s...,...,2K,0,,True,17,track,spotify:track:33xMbeHzmWd6Od0BmLZEUs,,{0: {'external_urls': {'spotify': 'https://ope...,externa...
1,2017-02-27T01:38:09Z,False,,https://open.spotify.com/user/12160726861,https://api.spotify.com/v1/users/12160726861,12160726861,user,spotify:user:12160726861,album,[{'external_urls': {'spotify': 'https://open.s...,...,4 Billion Souls,25,https://p.scdn.co/mp3-preview/d6645e0eeb0f6849...,True,2,track,spotify:track:3UnyplmZaq547hwsfOR5yy,,{0: {'external_urls': {'spotify': 'https://ope...,externa...
2,2017-02-27T01:38:09Z,False,,https://open.spotify.com/user/12160726861,https://api.spotify.com/v1/users/12160726861,12160726861,user,spotify:user:12160726861,album,[{'external_urls': {'spotify': 'https://open.s...,...,4 Minute Warning,0,,True,8,track,spotify:track:1w8QCSDH4QobcQeT4uMKLm,,{0: {'external_urls': {'spotify': 'https://ope...,externa...


In [160]:
# tracks.isna().sum()

Removing duplicate track ids.

In [13]:
tracks = tracks[tracks['track.id'].isna() == False]

In [14]:
len(tracks)

9918

In [109]:
# tracks

Converting artist lists of each track to dictionaries.

In [15]:
def list_to_dict(x):
    return {i: x[i] for i in range(len(x))}

In [113]:
tracks['artist_dict'] = tracks['track.artists'].apply(list_to_dict)
tracks.head(3)

Unnamed: 0,added_at,is_local,primary_color,added_by.external_urls.spotify,added_by.href,added_by.id,added_by.type,added_by.uri,track.album.album_type,track.album.artists,...,track.name,track.popularity,track.preview_url,track.track,track.track_number,track.type,track.uri,video_thumbnail.url,artist_dict,artists_dfs
0,2017-02-27T01:38:09Z,False,,https://open.spotify.com/user/12160726861,https://api.spotify.com/v1/users/12160726861,12160726861,user,spotify:user:12160726861,album,[{'external_urls': {'spotify': 'https://open.s...,...,2K,0,,True,17,track,spotify:track:33xMbeHzmWd6Od0BmLZEUs,,{0: {'external_urls': {'spotify': 'https://ope...,externa...
1,2017-02-27T01:38:09Z,False,,https://open.spotify.com/user/12160726861,https://api.spotify.com/v1/users/12160726861,12160726861,user,spotify:user:12160726861,album,[{'external_urls': {'spotify': 'https://open.s...,...,4 Billion Souls,25,https://p.scdn.co/mp3-preview/d6645e0eeb0f6849...,True,2,track,spotify:track:3UnyplmZaq547hwsfOR5yy,,{0: {'external_urls': {'spotify': 'https://ope...,externa...
2,2017-02-27T01:38:09Z,False,,https://open.spotify.com/user/12160726861,https://api.spotify.com/v1/users/12160726861,12160726861,user,spotify:user:12160726861,album,[{'external_urls': {'spotify': 'https://open.s...,...,4 Minute Warning,0,,True,8,track,spotify:track:1w8QCSDH4QobcQeT4uMKLm,,{0: {'external_urls': {'spotify': 'https://ope...,externa...


In [112]:
# tracks['artist_dict']

Converting artist dicts of each track to dataframes.

In [18]:
def expand_list_dict(row):
    df = pd.DataFrame.from_dict(row['artist_dict'], orient='index')
    df['song_id'] = row['track.id']
    return df

tracks['artists_dfs'] = tracks.apply(expand_list_dict, axis=1)
type(tracks['artists_dfs'][0])

pandas.core.frame.DataFrame

In [19]:
tracks['artists_dfs'][0]

Unnamed: 0,external_urls,href,id,name,type,uri,song_id
0,{'spotify': 'https://open.spotify.com/artist/0...,https://api.spotify.com/v1/artists/0IVapwlnM3d...,0IVapwlnM3dEOiMsHXsghT,Nosaj Thing,artist,spotify:artist:0IVapwlnM3dEOiMsHXsghT,33xMbeHzmWd6Od0BmLZEUs


In [114]:
artist_df = pd.DataFrame(columns=['external_urls', 'href', 'id', 'name', 'type', 'uri', 'song_id'])
for mini_df in tracks['artists_dfs']:
    artist_df = pd.concat([artist_df, mini_df], axis=0)

In [117]:
# artist_df

In [118]:
df_merged = pd.merge(left=tracks,
                    right=artist_df,
                    how='inner',
                    left_on='track.id',
                    right_on='song_id')

In [119]:
df1 = df_merged[['track.name', 'song_id', 'name', 'id']]

In [120]:
df1.shape

(11869, 4)

In [121]:
df1.isna().sum()

track.name    0
song_id       0
name          0
id            0
dtype: int64

In [122]:
# df1['name'].value_counts()

In [123]:
# df1['song_id'].value_counts()

#### Getting audio features.

In [131]:
chunks = [(i, i+100) for i in range(0, len(df1), 100)]
audio_features_list = []
for chunk in chunks:
    id_list100 = df1['song_id'][chunk[0]:chunk[1]]
    # id_list100 = [id for id in id_list100 if id is not None]
    audio_features_list = audio_features_list + sp.audio_features(id_list100)
    sleep(randint(1,2500)/1000)
len(audio_features_list)

11869

In [126]:
# audio_features_list

In [132]:
audio_features_df = json_normalize(audio_features_list)

In [133]:
len(df1)

11869

In [134]:
audio_features_df2 = audio_features_df.drop_duplicates()

In [135]:
len(audio_features_df2)

9894

In [136]:
df_audio_ft = pd.merge(left=df1,
                        right=audio_features_df2,
                        how='inner',
                        left_on='song_id',
                        right_on='id')

In [137]:
len(df_audio_ft)

11869

#### We could try to expand it, obtaining the top10 songs in GB for each artist (example with the first artist id).

In [142]:
tracks_t = sp.artist_top_tracks('0IVapwlnM3dEOiMsHXsghT', country='GB')

In [141]:
# tracks_t

In [145]:
tracks_t = json_normalize(tracks_t['tracks'])

In [None]:
# tracks_t

In [143]:
# tracks_t.columns

In [None]:
# tracks_t['id']

In [None]:
# tracks_t['artists'][1]

In [None]:
type(tracks_t['artists'])

pandas.core.series.Series

In [146]:
tracks_t['artist_dict'] = tracks_t['artists'].apply(list_to_dict)

In [147]:
def expand_artist_dict(row):
    df = pd.DataFrame.from_dict(row['artist_dict'], orient='index')
    df['song_id'] = row['id']
    return df

tracks_t['artists_dfs'] = tracks_t.apply(expand_artist_dict, axis=1)

In [148]:
type(tracks_t['artists_dfs'][0])

pandas.core.frame.DataFrame

In [150]:
# tracks_t['artists_dfs'][0]

In [151]:
artist_df_t = pd.DataFrame(columns=['external_urls', 'href', 'id', 'name', 'type', 'uri', 'song_id'])
for mini_df in tracks_t['artists_dfs']:
    artist_df_t = pd.concat([artist_df_t, mini_df], axis=0)

In [152]:
# artist_df_t

In [153]:
df_merged_t = pd.merge(left=tracks_t,
                    right=artist_df_t,
                    how='inner',
                    left_on='id',
                    right_on='song_id')

In [154]:
# df_merged_t

In [155]:
df1_t = df_merged_t[['name_x', 'name_y', 'song_id']]

In [156]:
df1_t

Unnamed: 0,name_x,name_y,song_id
0,Aquarium,Nosaj Thing,4ZOv9mx6Lz7RRaBFi5UNaE
1,Blue Hour,Nosaj Thing,6f9BnONLZr0rpB5thUzzlW
2,Blue Hour,Julianna Barwick,6f9BnONLZr0rpB5thUzzlW
3,Too Close,Nosaj Thing,0FlYCmQM5q3TcdvwRT53Tt
4,Too Close,Jacques Greene,0FlYCmQM5q3TcdvwRT53Tt
5,Too Close,Ouri,0FlYCmQM5q3TcdvwRT53Tt
6,CLOUDS,박혜진 Park Hye Jin,11dTnaie5BRo7EFXuVARSy
7,CLOUDS,Nosaj Thing,11dTnaie5BRo7EFXuVARSy
8,In Your Eyes (feat. Charlotte Day Wilson) [Nos...,BADBADNOTGOOD,28atUbzCFihH4GA3dU1gVM
9,In Your Eyes (feat. Charlotte Day Wilson) [Nos...,Charlotte Day Wilson,28atUbzCFihH4GA3dU1gVM


In [157]:
# df1_artists = df1.drop_duplicates(subset='id')

In [158]:
# df1_artists

In [159]:
# top10_list = []
# for i in range(len(df1)):
#     top_tracks = sp.artist_top_tracks(df1['id'][i], country='GB')
#     top10_list.extend(top_tracks)
#     sleep(randint(1, 2500) / 1000)