In [77]:
from datetime import timedelta, date
from io import StringIO

import bs4
# from bs4 import BeautifulSoup # package for html parsing
import cloudscraper
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests #package for http requests

In [78]:
from fycharts.SpotifyCharts import SpotifyCharts
import sqlalchemy
import threading

In [79]:
#the following two functions were taken from this website. we have modified them slightly
# https://gist.github.com/hktosun/d4f98488cb8f005214acd12296506f48
def daterange(start_date, end_date):
    '''
    helper function for the create_links_* functions.

    yields every date from start_date up to, but NOT including, end_date
    (the same half-open convention as the built-in range()).

    parameters: start_date and end_date, both datetime.date objects.
    yields nothing when end_date is on or before start_date.
    '''
    for n in range((end_date - start_date).days):
        yield start_date + timedelta(n)

# It creates the list of page links we will get the data from.
def create_links_2020(country):
    '''
    returns a list of strings: one link to the daily spotify chart for every day of 2020
    for the specified country. 2020 is a leap year, so that is 366 links, running from
    2020-01-01 through 2020-12-31.

    parameter: a string, the two-letter country id used in the chart url.
    eg. to get the links for the united states, call create_links_2020('us')
    '''
    start_date = date(2020, 1, 1)
    # daterange() stops one day BEFORE its end date, so the end has to be the
    # first day of 2021 for December 31st to be included.
    end_date = date(2021, 1, 1)
    base = 'https://spotifycharts.com/regional/' + country + '/daily/'
    return [base + single_date.strftime("%Y-%m-%d")
            for single_date in daterange(start_date, end_date)]

In [80]:
# #demonstration of code
# links_us=create_links_2020('us')
# print(len(links_us))
# links_us[0]

In [3]:
def create_links_2017(country):
    '''
    returns a list of strings: one link to the daily spotify chart for every day of 2017
    for the specified country. that is 365 links, running from 2017-01-01 through 2017-12-31.

    parameter: a string, the two-letter country id used in the chart url.
    eg. to get the links for the united states, call create_links_2017('us')
    '''
    start_date = date(2017, 1, 1)
    end_date = date(2017, 12, 31)
    base = 'https://spotifycharts.com/regional/' + country + '/daily/'
    # daterange() leaves out its end date, so step one day past December 31st
    # to make sure the last day of the year gets a link too.
    return [base + single_date.strftime("%Y-%m-%d")
            for single_date in daterange(start_date, end_date + timedelta(days=1))]

In [82]:
def get_table(link, top_n=50):
    '''
    a function that returns a table of the top songs from a spotify chart
    with the following columns:
    position. an int, the ranking of the track on the chart.
    track. a string, "[track name] by [artist]"
    streams. an int, # of streams of the track on a specified date.
    date. a date time obj. the day the song was streamed.
    url. a url to the song on Spotify.

    arguments:
    link. a string, which is a link to a Spotify chart.
        eg. 'https://spotifycharts.com/regional/us/daily/2020-01-01'
        the link has to end with the chart date written as YYYY-MM-DD.
    top_n. an int, how many rows from the top of the chart to keep. defaults to 50;
        the site gives the top 200, which we found excessive for this project.

    raises:
    requests.HTTPError if the site answers with an error status (for example when the
        request gets blocked) instead of the chart page.
    ValueError if the page has no table, or fewer track links than chart rows.
    '''
    #getting the df
    scraper = cloudscraper.create_scraper()
    r = scraper.get(link)
    #stop here with the real HTTP error rather than letting pandas fail later with
    #a confusing "No tables found" on an error page
    r.raise_for_status()
    #read_html parses all the tables in the webpage to a list; the chart is the first one.
    #the html is wrapped in StringIO because newer pandas versions deprecate raw html strings
    df_list = pd.read_html(StringIO(r.text))
    #.copy() so the column assignments below work on our own frame and not on a slice
    df = df_list[0].head(top_n).copy()

    #cleaning column names and dropping irrelevant columns
    #to lowercase
    df.columns = [x.lower() for x in df.columns]
    #after scraping, Spotify returns a column 'unnamed: 1', which is actually the position of the song on the chart.
    #which we have renamed
    df = df.rename(columns={'unnamed: 1': 'position'})
    #it also returns two NaN columns, which we drop here
    df = df.drop(columns=["unnamed: 0", "unnamed: 2"])

    #adding a date column by parsing the link used in the argument.
    #rstrip so a trailing "/" on the link does not shift the 10 date characters
    df["date"] = pd.to_datetime(link.rstrip('/')[-10:])

    #creating the url column
    #using Beautiful soup to webscrape the links on the site.
    datasoup = bs4.BeautifulSoup(r.text, 'html.parser')
    #only keep the 'a' tags that have an href and whose href is a link to a spotify track.
    urls = [a['href'] for a in datasoup.find_all('a', href=True)
            if 'open.spotify.com/track/' in a['href']]
    if len(urls) < len(df):
        raise ValueError('found ' + str(len(urls)) + ' track links for ' + str(len(df))
                         + ' chart rows at ' + link)
    #only append as many urls as there are rows kept in the dataframe
    df['url'] = urls[:len(df)]
    return df

In [83]:
def _split_track(song):
    '''
    splits one "[track name] by [artist]" string into the tuple (track name, artist).
    '''
    #the scraped text puts TWO spaces before "by", which tells the real separator apart
    #from a "by" inside a title (eg. "Stand by Me") or inside a word (eg. "Baby").
    title, sep, artist = song.partition("  by ")
    if sep:
        return title, artist
    #no double-space separator: fall back to the last single-spaced " by "
    title, sep, artist = song.rpartition(" by ")
    if sep:
        return title, artist
    #no separator at all: keep the whole string as the title instead of cutting it up
    return song, ""


def track_artist(df):
    '''
    returns a new df in which the "track" column, which contains both the artist and song
    as a string formatted as "[track name] by [artist]", has been parsed into
    2 columns: track_name and artist. the track column is left out of the result.
    the df passed in is not modified.

    argument: a dataframe created by get_table(), containing a column "track"
    with each observation a string formatted as "[track name] by [artist]"
    '''
    songtitles = []
    songartists = []
    for song in df.track.values:
        title, artist = _split_track(song)
        songtitles.append(title)
        songartists.append(artist)
    #assign() builds a new frame, so the caller's df does not silently gain columns
    return df.assign(track_name=songtitles, artist=songartists).drop(columns=["track"])

In [84]:
#country names that the happiness tables are filtered against further down the notebook
allcountries = [
    "United States", "United Kingdom", "United Arab Emirates", "Argentina",
    "Austria", "Australia", "Belgium", "Bulgaria",
    "Bolivia", "Brazil", "Canada", "Switzerland",
    "Chile", "Colombia", "Costa Rica", "Cyprus",
    "Czech Republic", "Germany", "Denmark", "Dominican Republic",
    "Ecuador", "Estonia", "Egypt", "Spain",
    "Finland", "France", "Greece", "Guatemala",
    "Hong Kong", "Honduras", "Hungary", "Indonesia",
    "Ireland", "Israel", "India", "Iceland",
    "Italy", "Japan", "Republic of Korea", "Lithuania",
    "Luxembourg", "Latvia", "Morocco", "Mexico",
    "Malaysia", "Nicaragua", "Netherlands", "Norway",
    "New Zealand", "Panama", "Peru", "Philippines",
    "Poland", "Portugal", "Paraguay", "Romania",
    "Russia", "Saudi Arabia", "Sweden", "Singapore",
    "Slovakia", "El Salvador", "Thailand", "Turkey",
    "Taiwan", "Ukraine", "Uruguay", "Vietnam",
    "South Africa",
]

In [85]:
# NOTE(review): happy2020 is first assigned in a later cell (the pd.read_csv('happiness2020.csv') cell),
# so on a fresh kernel this cell raises NameError unless that cell has been run first.
happy2020.head()

Unnamed: 0,country_name,regional_indicator,ladder_score,standard_error_of_ladder_score,upperwhisker,lowerwhisker,logged_gdp_per_capita,social_support,healthy_life_expectancy,freedom_to_make_life_choices,generosity,perceptions_of_corruption,ladder_score_in_dystopia,explained_by:_log_gdp_per_capita,explained_by:_social_support,explained_by:_healthy_life_expectancy,explained_by:_freedom_to_make_life_choices,explained_by:_generosity,explained_by:_perceptions_of_corruption,dystopia_+_residual
0,Finland,Western Europe,7.8087,0.031156,7.869766,7.747634,10.639267,0.95433,71.900825,0.949172,-0.059482,0.195445,1.972317,1.28519,1.499526,0.961271,0.662317,0.15967,0.477857,2.762835
1,Denmark,Western Europe,7.6456,0.033492,7.711245,7.579955,10.774001,0.955991,72.402504,0.951444,0.066202,0.168489,1.972317,1.326949,1.503449,0.979333,0.66504,0.242793,0.49526,2.432741
2,Switzerland,Western Europe,7.5599,0.035014,7.628528,7.491272,10.979933,0.942847,74.102448,0.921337,0.105911,0.303728,1.972317,1.390774,1.472403,1.040533,0.628954,0.269056,0.407946,2.350267
3,Iceland,Western Europe,7.5045,0.059616,7.621347,7.387653,10.772559,0.97467,73.0,0.948892,0.246944,0.71171,1.972317,1.326502,1.547567,1.000843,0.661981,0.36233,0.144541,2.460688
4,Norway,Western Europe,7.488,0.034837,7.556281,7.419719,11.087804,0.952487,73.200783,0.95575,0.134533,0.263218,1.972317,1.424207,1.495173,1.008072,0.670201,0.287985,0.434101,2.168266


In [86]:
happy2020 = pd.read_csv('happiness2020.csv')
#normalise the headers to lowercase with underscores. this used to be assigned from
#`cols2020`, whose definition was commented out, so a fresh kernel failed with NameError.
happy2020.columns = [col.lower().replace(" ", "_") for col in happy2020.columns]
countries2020 = happy2020.country_name.to_numpy()
#keep only the countries that also appear in allcountries, in the order they have in the file
new = [x for x in countries2020 if x in allcountries]
#first ten and last ten of the filtered list
#NOTE(review): this is "top 10 / bottom 10 by happiness" only if the csv rows are sorted by score -- confirm
topbot2020 = new[:10] + new[-10:]
topbot2020

['Finland',
 'Denmark',
 'Switzerland',
 'Iceland',
 'Norway',
 'Netherlands',
 'Sweden',
 'New Zealand',
 'Austria',
 'Luxembourg',
 'Malaysia',
 'Vietnam',
 'Indonesia',
 'Turkey',
 'Bulgaria',
 'Morocco',
 'South Africa',
 'Ukraine',
 'Egypt',
 'India']

In [87]:
#two-letter ids, in the same order as the 20 country names listed in topbot2020
country_id2020 = [
    'fi', 'dk', 'ch', 'is', 'no',
    'nl', 'se', 'nz', 'at', 'lu',
    'my', 'vn', 'id', 'tr', 'bg',
    'ma', 'za', 'ua', 'eg', 'in',
]

In [73]:
# NOTE(review): happy2017 is not assigned in any cell visible in this part of the notebook;
# it has to come from a cell that was run earlier, otherwise this raises NameError.
happy2017

Unnamed: 0,Country,Happiness.Rank,Happiness.Score,Whisker.high,Whisker.low,Economy..GDP.per.Capita.,Family,Health..Life.Expectancy.,Freedom,Generosity,Trust..Government.Corruption.,Dystopia.Residual
0,Norway,1,7.537,7.594445,7.479556,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,2.277027
1,Denmark,2,7.522,7.581728,7.462272,1.482383,1.551122,0.792566,0.626007,0.355280,0.400770,2.313707
2,Iceland,3,7.504,7.622030,7.385970,1.480633,1.610574,0.833552,0.627163,0.475540,0.153527,2.322715
3,Switzerland,4,7.494,7.561772,7.426227,1.564980,1.516912,0.858131,0.620071,0.290549,0.367007,2.276716
4,Finland,5,7.469,7.527542,7.410458,1.443572,1.540247,0.809158,0.617951,0.245483,0.382612,2.430182
...,...,...,...,...,...,...,...,...,...,...,...,...
150,Rwanda,151,3.471,3.543030,3.398970,0.368746,0.945707,0.326425,0.581844,0.252756,0.455220,0.540061
151,Syria,152,3.462,3.663669,3.260331,0.777153,0.396103,0.500533,0.081539,0.493664,0.151347,1.061574
152,Tanzania,153,3.349,3.461430,3.236570,0.511136,1.041990,0.364509,0.390018,0.354256,0.066035,0.621130
153,Burundi,154,2.905,3.074690,2.735310,0.091623,0.629794,0.151611,0.059901,0.204435,0.084148,1.683024


In [88]:
#two-letter region ids whose 2017 charts are scraped
region_id2017 = [
    'no', 'dk', 'ch', 'fi', 'nl',
    'se', 'ee', 'hu', 'id', 'is',
    'ca', 'nz', 'au', 'tr', 'py',
    'ph', 'do', 'gy', 'pt', 'hn',
]

In [89]:
#two-letter region ids whose 2020 charts are scraped
#NOTE(review): these are exactly the same ids as region_id2017 and differ from country_id2020
#(the ids matching topbot2020) -- confirm this is intended and not a copy-paste leftover
region_id2020 = [
    'no', 'dk', 'ch', 'fi', 'nl',
    'se', 'ee', 'hu', 'id', 'is',
    'ca', 'nz', 'au', 'tr', 'py',
    'ph', 'do', 'gy', 'pt', 'hn',
]

In [90]:
# #Eva run this one
# spotify2017=pd.DataFrame()
# for region in region_id2017:
#     links=create_links_2017(region)
#     for link in links:
#         df=get_table(link)
#         df=track_artist(df)
#         df['region']=region
#         spotify2017=spotify2017.append(df,ignore_index=True)

In [91]:
#Estelle run this one
spotify2020=pd.DataFrame()
for region in region_id2020:
    links=create_links_2020(region)
    for link in links:
        df=get_table(link)
        df=track_artist(df)
        df['region']=region
        spotify2020=spotify2020.append(df,ignore_index=True)

KeyboardInterrupt: 

In [None]:
# spotify2020

In [17]:
# #we went to office hours to investigate the "no tables found" error and he asked us to write this code
# #to see what was happening

# url = "https://spotifycharts.com/regional/no/daily/2020-01-01"
# r = requests.get(url)
# soup=BeautifulSoup(r.text, 'html.parser')
# # df_list = pd.read_html(r.text) # this parses all the tables in webpages to a list
# # df = df_list[0]
# # df.head()
# print(soup.find_all('table'))
# print(soup.body.text)

[]


Please enable cookies.


One more step
Please complete the security check to access spotifycharts.com















Please stand by, while we are checking your browser...
Redirecting...





Please turn JavaScript on and reload the page.


Please enable Cookies and reload the page.


















Why do I have to complete a CAPTCHA?

Completing the CAPTCHA proves you are a human and gives you temporary access to the web property.


What can I do to prevent this in the future?
If you are on a personal connection, like at home, you can run an anti-virus scan on your device to make sure it is not infected with malware.
If you are at an office or shared network, you can ask the network administrator to run a scan across the network looking for misconfigured or infected devices.





Cloudflare Ray ID: 646b7e2c3fe61e91
•
Your IP: 24.97.110.217
•
Performance & security by Cloudflare








In [None]:
# Attempt to get the 2020 daily top-200 charts for region 'no' through the fycharts
# package, writing the result to no_top_200_daily.csv.
# NOTE(review): the log recorded under this cell shows 503 errors from spotifycharts.com
# and "Generating empty dataframe" for the dates listed, so check the csv before relying on it.
api=SpotifyCharts()
api.top200Daily(output_file='no_top_200_daily.csv',start='2020-01-01',end='2020-12-31',region='no')

INFO : 27/04/2021 06:13:50 PM : Extracting top 200 daily for 2020-01-01 - no
ERROR : 27/04/2021 06:13:55 PM : ***** <<HTTPSConnectionPool(host='spotifycharts.com', port=443): Max retries exceeded with url: /regional/no/daily/2020-01-01/download (Caused by ResponseError('too many 503 error responses'))>> Data not found. Generating empty dataframe *****
INFO : 27/04/2021 06:13:55 PM : Extracting top 200 daily for 2020-01-02 - no
INFO : 27/04/2021 06:13:55 PM : Appending data to the file no_top_200_daily.csv...
INFO : 27/04/2021 06:13:55 PM : Done appending to the file no_top_200_daily.csv!!!
ERROR : 27/04/2021 06:14:00 PM : ***** <<HTTPSConnectionPool(host='spotifycharts.com', port=443): Max retries exceeded with url: /regional/no/daily/2020-01-02/download (Caused by ResponseError('too many 503 error responses'))>> Data not found. Generating empty dataframe *****
INFO : 27/04/2021 06:14:00 PM : Extracting top 200 daily for 2020-01-03 - no
INFO : 27/04/2021 06:14:00 PM : Appending data to

In [5]:
# Single-page check: fetch one chart page with cloudscraper and parse its first html table.
# NOTE(review): get_table() (defined in an earlier cell) also calls cloudscraper, so
# cloudscraper has to be imported before get_table() is called.
import cloudscraper

scraper = cloudscraper.create_scraper()  # returns a CloudScraper instance
# Or: scraper = cloudscraper.CloudScraper()  # CloudScraper inherits from requests.Session
r=scraper.get("https://spotifycharts.com/regional/no/daily/2020-04-17")
df_list = pd.read_html(r.text) # this parses all the tables in webpages to a list
df = df_list[0]
# show the raw scraped columns before any cleaning
df.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Track,Streams
0,,1,,Karantene by TIX,124824
1,,2,,Blinding Lights by The Weeknd,120199
2,,3,,Svag by Victor Leksell,118469
3,,4,,Toosie Slide by Drake,105361
4,,5,,Roses - Imanbek Remix by SAINt JHN,102318


In [26]:
# try get_table() on a single chart page (region 'no', 2020-04-17)
p=get_table("https://spotifycharts.com/regional/no/daily/2020-04-17")

In [27]:
# display the cleaned frame stored in p
p

Unnamed: 0,position,track,streams,date,url
0,1,Karantene by TIX,124824,2020-04-17,https://open.spotify.com/track/6sMMfGxa9tB59dM...
1,2,Blinding Lights by The Weeknd,120199,2020-04-17,https://open.spotify.com/track/0VjIjW4GlUZAMYd...
2,3,Svag by Victor Leksell,118469,2020-04-17,https://open.spotify.com/track/5SY5BWTxbDqFouu...
3,4,Toosie Slide by Drake,105361,2020-04-17,https://open.spotify.com/track/127QTOFJsJQp5Lb...
4,5,Roses - Imanbek Remix by SAINt JHN,102318,2020-04-17,https://open.spotify.com/track/24Yi9hE78yPEbZ4...
5,6,Kaller På Deg by TIX,94415,2020-04-17,https://open.spotify.com/track/0lnksmEu1sa7t16...
6,7,Kings & Queens by Ava Max,78550,2020-04-17,https://open.spotify.com/track/76nqCfJOcFFWBJN...
7,8,"Like It Is by Kygo, Zara Larsson, Tyga",77189,2020-04-17,https://open.spotify.com/track/3frUvGrmGcay91l...
8,9,"Freedom by Kygo, Zak Abel",76782,2020-04-17,https://open.spotify.com/track/5Gj1wG8b12VQdEd...
9,10,"End of Time by K-391, Alan Walker, Ahrix",70723,2020-04-17,https://open.spotify.com/track/67O8CWXxPsfz8or...
