In [1]:
import requests #package for http requests
import bs4
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datetime import timedelta, date
import cloudscraper

In [30]:
#the following two functions were taken from this website. we have modified them slightly
# https://gist.github.com/hktosun/d4f98488cb8f005214acd12296506f48
def daterange(start_date, end_date):
    '''
    helper function for create_links
    '''
    for n in range(int ((end_date - start_date).days)):
        yield start_date + timedelta(n)

# It creates the list of page links we will get the data from.
def create_links_2020(country):
    '''
    returns an array of strings, which are 365 links to spotify charts for each day in 2020 for a specified country.
    parameter: a string, postal id for a country. eg. to get 365 links for the united states, call create_links('us')
    '''
    start_date = date(2020, 1, 1)
    end_date = date(2020,12,31)
    links = []
    dates = daterange(start_date, end_date)
    for single_date in daterange(start_date, end_date):
        links.append('https://spotifycharts.com/regional/' + country + '/daily/' + single_date.strftime("%Y-%m-%d"))
    return(links)

In [31]:
#demonstration of code
links_us=create_links_2020('us')
print(len(links_us))
links_us[0]

365


'https://spotifycharts.com/regional/us/daily/2020-01-01'

In [4]:
# def create_links_2017(country):
#     start_date = date(2017, 01, 01)
#     end_date = date(2017,12,31)
#     links = []
#     dates = daterange(start_date, end_date)
#     for single_date in daterange(start_date, end_date):
#         links.append('https://spotifycharts.com/regional/' + country + '/daily/' + single_date.strftime("%Y-%m-%d"))
#     return(links)

In [5]:
def get_table(link):
    '''
    a function that returns a table of the top 50 songs from a spotify chart
    with the following columns:
    track. a string, "[track name] by [artist]"
    position. an int, the ranking of the track on the chart.
    streams. an int, # of streams of the track on a specified date.
    date. a date time obj. the day the song was streamed.
    url. a url to the song on Spotify.
    
    argument: a string, which is a link to a Spotify chart. eg. 'https://spotifycharts.com/regional/us/daily/2020-01-01'
    '''
    #getting the df
    scraper = cloudscraper.create_scraper()
    r = scraper.get(link)
    df_list = pd.read_html(r.text) # this parses all the tables in webpages to a lis
    df = df_list[0]
    #for the purposes of our project, we only want the top 50 songs of the table that was scraped from the
    #spotify website, which gives us top 200. we find that 200 songs per day was excessive
    df=df.head(50)

    #cleaning column names and dropping irrelevant columns
    #to lowercase
    cols= [x.lower() for x in df.columns] 
    df.columns=cols
    #after scraping, Spotify returns a column 'unnamed: 1', which is actually the position of the song on the chart.
    #which we have renamed
    df = df.rename(columns={'unnamed: 1': 'position'})
    #it also returns two NaN columns, which we drop here
    df=df.drop(columns=["unnamed: 0","unnamed: 2"])
    
    #adding a date column by parsing the link used in the argument.
    df["date"]=link[-10:]
    df['date']=pd.to_datetime(df['date'])
    
    #creating the url column
    #using Beautiful soup to webscrape the links on the site.
    datasoup = bs4.BeautifulSoup(r.text, 'html.parser')
    aList=[]
    #find all 'a' tags
    for data in datasoup.findAll('a'):
        aList.append(data)
    links=[]
    #find all href tags
    for x in aList:
        links.append(x.get('href'))
    #drop None types
    clean = filter(None, links)
    urls=[]
    #only keep the links that are links to spotify tracks.
    for x in clean:
        if 'open.spotify.com/track/' in x:
            urls.append(x)
    #only append the top 50 to the dataframe
    df['url']=urls[:50]
    return df

In [6]:
def track_artist(df):
    '''
    returns a df that takes "track" column, which contains both the artist and song. 
    it is a string formatted as "[track name] by [artist]"
    and parses it into 2 columns: track_name and artist.
    and deletes the track column after.
    
    argument: a dataframe created by get_table(), containing a column "track"
    with each observation a string formatted as "[track name] by [artist]"
    '''
    tracks=df.track.values
    songtitles=[]
    songartists=[]
    for song in tracks:
        index1=song.find("  by")
        index2=song.find("by ")
        track=song[:index1]
        songtitles.append(track)
        artist=song[index2+3:]
        songartists.append(artist)
    df["track_name"]=songtitles
    df["artist"]=songartists
    df=df.drop(columns=["track"])
    return df

In [7]:
allcountries=["United States", "United Kingdom", "United Arab Emirates", "Argentina", "Austria", "Australia", "Belgium", "Bulgaria", "Bolivia", "Brazil", "Canada", "Switzerland", "Chile", "Colombia", "Costa Rica", "Cyprus", "Czech Republic", "Germany", "Denmark", "Dominican Republic", "Ecuador", "Estonia", "Egypt", "Spain", "Finland", "France", "Greece", "Guatemala", "Hong Kong", "Honduras", "Hungary", "Indonesia", "Ireland", "Israel", "India", "Iceland", "Italy", "Japan", "Republic of Korea", "Lithuania", "Luxembourg", "Latvia", "Morocco", "Mexico", "Malaysia", "Nicaragua", "Netherlands", "Norway", "New Zealand", "Panama", "Peru", "Philippines", "Poland", "Portugal", "Paraguay", "Romania", "Russia", "Saudi Arabia", "Sweden", "Singapore", "Slovakia", "El Salvador", "Thailand", "Turkey", "Taiwan", "Ukraine", "Uruguay", "Vietnam", "South Africa"
]

In [10]:
happy2020=pd.read_csv("happiness2020.csv")
happy2020.head()

Unnamed: 0,Country name,Regional indicator,Ladder score,Standard error of ladder score,upperwhisker,lowerwhisker,Logged GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Ladder score in Dystopia,Explained by: Log GDP per capita,Explained by: Social support,Explained by: Healthy life expectancy,Explained by: Freedom to make life choices,Explained by: Generosity,Explained by: Perceptions of corruption,Dystopia + residual
0,Finland,Western Europe,7.8087,0.031156,7.869766,7.747634,10.639267,0.95433,71.900825,0.949172,-0.059482,0.195445,1.972317,1.28519,1.499526,0.961271,0.662317,0.15967,0.477857,2.762835
1,Denmark,Western Europe,7.6456,0.033492,7.711245,7.579955,10.774001,0.955991,72.402504,0.951444,0.066202,0.168489,1.972317,1.326949,1.503449,0.979333,0.66504,0.242793,0.49526,2.432741
2,Switzerland,Western Europe,7.5599,0.035014,7.628528,7.491272,10.979933,0.942847,74.102448,0.921337,0.105911,0.303728,1.972317,1.390774,1.472403,1.040533,0.628954,0.269056,0.407946,2.350267
3,Iceland,Western Europe,7.5045,0.059616,7.621347,7.387653,10.772559,0.97467,73.0,0.948892,0.246944,0.71171,1.972317,1.326502,1.547567,1.000843,0.661981,0.36233,0.144541,2.460688
4,Norway,Western Europe,7.488,0.034837,7.556281,7.419719,11.087804,0.952487,73.200783,0.95575,0.134533,0.263218,1.972317,1.424207,1.495173,1.008072,0.670201,0.287985,0.434101,2.168266


In [12]:
new

['Finland',
 'Denmark',
 'Switzerland',
 'Iceland',
 'Norway',
 'Netherlands',
 'Sweden',
 'New Zealand',
 'Austria',
 'Luxembourg',
 'Canada',
 'Australia',
 'United Kingdom',
 'Israel',
 'Costa Rica',
 'Ireland',
 'Germany',
 'United States',
 'Czech Republic',
 'Belgium',
 'United Arab Emirates',
 'France',
 'Mexico',
 'Uruguay',
 'Saudi Arabia',
 'Spain',
 'Guatemala',
 'Italy',
 'Singapore',
 'Brazil',
 'El Salvador',
 'Panama',
 'Slovakia',
 'Chile',
 'Lithuania',
 'Poland',
 'Colombia',
 'Cyprus',
 'Nicaragua',
 'Romania',
 'Estonia',
 'Philippines',
 'Hungary',
 'Thailand',
 'Argentina',
 'Honduras',
 'Latvia',
 'Ecuador',
 'Portugal',
 'Japan',
 'Peru',
 'Bolivia',
 'Paraguay',
 'Dominican Republic',
 'Russia',
 'Greece',
 'Malaysia',
 'Vietnam',
 'Indonesia',
 'Turkey',
 'Bulgaria',
 'Morocco',
 'South Africa',
 'Ukraine',
 'Egypt',
 'India']

In [11]:
cols2020= [x.lower() for x in happy2020.columns] 
cols2020= [x.replace(" ","_") for x in cols2020] 
happy2020.columns=cols2020
countries2020=happy2020.country_name.to_numpy()
new=[]
for x in countries2020:
    if x in allcountries:
        new.append(x)
topbot2020=new[:10]+new[-10:]
topbot2020

['Finland',
 'Denmark',
 'Switzerland',
 'Iceland',
 'Norway',
 'Netherlands',
 'Sweden',
 'New Zealand',
 'Austria',
 'Luxembourg',
 'Malaysia',
 'Vietnam',
 'Indonesia',
 'Turkey',
 'Bulgaria',
 'Morocco',
 'South Africa',
 'Ukraine',
 'Egypt',
 'India']

In [None]:
# region_id2020=['fi','dk','ch','is','no','nl','se','nz','at','lu','my','vn','id','tr','bg','ma','za','ua','eg','in']

In [None]:
# happy2017

In [None]:
# region_id2017=['no', 'dk', 'ch', 'fi', 'nl', 'se', 'ee', 'hu', 'id', 'is', 'ca',
#        'nz', 'au', 'tr', 'py', 'ph', 'do', 'gy', 'pt', 'hn']

In [None]:
# #Eva run this one
# spotify2017=pd.DataFrame()
# for region in region_id2017:
#     links=create_links_2017(region)
#     for link in links:
#         df=get_table(link)
#         df=track_artist(df)
#         df['region']=region
#         spotify2020=spotify2020.append(df,ignore_index=True)

In [None]:
# #Estelle run this one
# spotify2020=pd.DataFrame()
# for region in region_id2020:
#     links=create_links_2020(region)
#     for link in links:
#         df=get_table(link)
#         df=track_artist(df)
#         df['region']=region
#         print(region)
#         spotify2020=spotify2020.append(df,ignore_index=True)
# spotify2020

In [None]:
# ca=create_links_2020("ca")
# canada=pd.DataFrame()

In [None]:
# for link in resume:
#     df=get_table(link)
#     df=track_artist(df)
#     df['region']="ca"
#     canada=canada.append(df,ignore_index=True)
#     print(link)
# canada

In [None]:
# resume=create_links_2020("ca")
# resume

In [None]:
# region_id2020=['fi','dk','ch','is','no','nl','se','nz','at','lu','my','vn','id','tr','bg','ma','za','ua','eg','in']

In [15]:
finland=finland.drop(columns=["Unnamed: 0","Unnamed: 0.1"])

In [16]:
finland

Unnamed: 0,position,streams,date,url,track_name,artist,region
0,1,39681,2020-01-01,https://open.spotify.com/track/0tc8HGXosQDC8TT...,Hei rakas,BEHM,fi
1,2,31179,2020-01-01,https://open.spotify.com/track/3eHkFrUUGYuwEgI...,Pintakaasulla,JVG,fi
2,3,30339,2020-01-01,https://open.spotify.com/track/4gEvvWYAF3yzv9h...,Luota Muhun,ibe,fi
3,4,30208,2020-01-01,https://open.spotify.com/track/0sf12qNH5qcw8qp...,Blinding Lights,The Weeknd,fi
4,5,28966,2020-01-01,https://open.spotify.com/track/1rgnBhdG2JDFTbY...,Dance Monkey,Tones And I,fi
...,...,...,...,...,...,...,...
18245,46,13274,2020-12-30,https://open.spotify.com/track/1rYP6R16sM0uSlx...,Silmät,Gettomasa,fi
18246,47,13190,2020-12-30,https://open.spotify.com/track/1z4jCtA9thODROr...,Japanese Drip,"Kube, Cledos",fi
18247,48,13033,2020-12-30,https://open.spotify.com/track/4ycKgmGOtvuJZVP...,Ikuinen vappu,JVG,fi
18248,49,12942,2020-12-30,https://open.spotify.com/track/223VlHsryk8Vt7V...,UUDESTAAN,Petri Nygård,fi


In [17]:
finland.to_csv('finland2020.csv')

In [21]:
finland=pd.read_csv("finland2020.csv")
denmark=pd.read_csv("denmark2020.csv")
switzerland=pd.read_csv("switzerland2020.csv")
iceland=pd.read_csv("iceland2020.csv")
norway=pd.read_csv("norway2020.csv")
netherlands=pd.read_csv("netherlands2020.csv")
sweden=pd.read_csv("sweden2020.csv")
newzealand=pd.read_csv("newzealand2020.csv")
austria=pd.read_csv("austria2020.csv")
canada=pd.read_csv("canada2020.csv")

In [24]:
spotify2020=spotify2020.drop(columns=["Unnamed: 0"])

In [32]:
spotify2020

Unnamed: 0,position,streams,date,url,track_name,artist,region
0,1,39681,2020-01-01,https://open.spotify.com/track/0tc8HGXosQDC8TT...,Hei rakas,BEHM,fi
1,2,31179,2020-01-01,https://open.spotify.com/track/3eHkFrUUGYuwEgI...,Pintakaasulla,JVG,fi
2,3,30339,2020-01-01,https://open.spotify.com/track/4gEvvWYAF3yzv9h...,Luota Muhun,ibe,fi
3,4,30208,2020-01-01,https://open.spotify.com/track/0sf12qNH5qcw8qp...,Blinding Lights,The Weeknd,fi
4,5,28966,2020-01-01,https://open.spotify.com/track/1rgnBhdG2JDFTbY...,Dance Monkey,Tones And I,fi
...,...,...,...,...,...,...,...
182445,46,58237,2020-12-30,https://open.spotify.com/track/6f3Slt0GbA2bPZl...,The Business,Tiësto,ca
182446,47,57657,2020-12-30,https://open.spotify.com/track/7kDUspsoYfLkWnZ...,my ex's best friend (with blackbear),Machine Gun Kelly,ca
182447,48,57321,2020-12-30,https://open.spotify.com/track/21jGcNKet2qwijl...,Circles,Post Malone,ca
182448,49,57003,2020-12-30,https://open.spotify.com/track/0A1hoCfMLkiAgvh...,Body,Megan Thee Stallion,ca


In [22]:
spotify2020=pd.DataFrame()
spotify2020=spotify2020.append(finland,ignore_index=True)
spotify2020=spotify2020.append(denmark,ignore_index=True)
spotify2020=spotify2020.append(switzerland,ignore_index=True)
spotify2020=spotify2020.append(iceland,ignore_index=True)
spotify2020=spotify2020.append(norway,ignore_index=True)
spotify2020=spotify2020.append(netherlands,ignore_index=True)
spotify2020=spotify2020.append(sweden,ignore_index=True)
spotify2020=spotify2020.append(newzealand,ignore_index=True)
spotify2020=spotify2020.append(austria,ignore_index=True)
spotify2020=spotify2020.append(canada,ignore_index=True)

In [28]:
spotify2020.to_csv("topspotify2020.csv")

In [33]:
botspotify2020=pd.read_csv("botspotify2020.csv")

In [35]:
botspotify2020=botspotify2020.drop(columns=["Unnamed: 0"])

In [37]:
spotify2020=spotify2020.append(botspotify2020,ignore_index=True)

In [39]:
spotify2020.region.unique()

array(['fi', 'dk', 'ch', 'is', 'no', 'nl', 'se', 'nz', 'at', 'ca', 'do',
       'gr', 'my', 'vn', 'id', 'tr', 'ma', 'za', 'eg', 'in'], dtype=object)

In [40]:
spotify2020.to_csv("spotify2020.csv")