# Section 4: API Pull

In [None]:
import warnings
warnings.filterwarnings('ignore') #Ignores warning messages that take up a lot of space
import pandas as pd
import requests

def get_id_list(limit):
    """Collect Spotify artist IDs by searching every two-letter combination.

    For each of the 26*26 lowercase letter pairs a search request is sent to
    the RapidAPI Spotify endpoint and the ID of every returned artist is stored.

    Args:
        limit: Number of artists requested per search term.

    Returns:
        A tuple ``(id_list, error_duplicates, id_len)`` where ``id_list`` holds
        the unique artist IDs, ``error_duplicates`` is the number of duplicate
        IDs that were removed and ``id_len`` is ``number of search terms * limit``.
    """
    url = "https://spotify23.p.rapidapi.com/search/" #API location

    # NOTE(review): the API key is hard-coded in the source; consider loading it
    # from configuration instead. The headers do not change between requests,
    # so they are built once.
    headers = {
        "X-RapidAPI-Key": "a92fd231acmsh813dd13da05c575p1a3cbajsnd7dae53fb312",
        "X-RapidAPI-Host": "spotify23.p.rapidapi.com"
    }

    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    char = [first + second for first in alphabet for second in alphabet] #All two-letter search terms

    id_list = [] #List for storage of artist ID's
    for j, i in enumerate(char):
        querystring = {"q": i, "type": "artists", "offset": "0", "limit": limit, "numberOfTopResults": "5"}
        response = requests.request("GET", url, headers=headers, params=querystring)
        try:
            items = response.json()['artists']['items']
        except (ValueError, KeyError, TypeError):
            #A response that is not JSON or lacks the artist list is skipped
            #instead of aborting the remaining searches.
            items = []
        for x in items:
            uri = x['data']['uri']
            #Drop the first 15 characters (presumably the 'spotify:artist:' prefix) to keep the bare ID
            id_list.append(uri[15:])
        if j%50 == 0:
            print(f'Pulled {j+1} of {len(char)}')

    localnum = len(id_list) #Count before de-duplication, for error tracking
    id_list = [*set(id_list)] #Removes duplicate strings in our ID list
    error_duplicates = localnum - len(id_list)
    id_len = len(char)*limit

    return id_list, error_duplicates, id_len

#API Pull loop from id_list found in get_id_list
def pull_data(string):
    """Fetch the artist overview for one Spotify artist ID.

    Args:
        string: Spotify artist ID (an element of the list from get_id_list).

    Returns:
        The decoded JSON body of the artist_overview response.
    """
    #Same API as the search, just a different endpoint
    endpoint = "https://spotify23.p.rapidapi.com/artist_overview/"
    # NOTE(review): the API key is hard-coded in the source.
    request_headers = {
        "X-RapidAPI-Key": "a92fd231acmsh813dd13da05c575p1a3cbajsnd7dae53fb312",
        "X-RapidAPI-Host": "spotify23.p.rapidapi.com"
    }
    reply = requests.request("GET", endpoint, headers=request_headers, params={"id": string})
    return reply.json()

limit = 1 #Number of artists requested per search term in get_id_list
id_list, error_duplicates, id_len = get_id_list(limit) #Collect the artist IDs

#Pull the overview JSON of every artist ID via pull_data
json_list = [] #Storage of artists' JSON from pull_data function
total_ids = len(id_list)
for position, artist_id in enumerate(id_list, start=1):
    json_list.append(pull_data(artist_id))
    print(f'API pull number {position} of {total_ids}') #For monitoring status during API pulls.
print("Success: Pulled all id's from list")

In [None]:
#Column names of the resulting DataFrame, in the order artist_data fills them
artist_data_indices = [
    'Name', 'ID', 'Instagram', 'Wikipedia',
    'SpotifyFollowers', 'MonthlyListeners', 'Singles', 'TopTrackPlays',
    'EarliestAlbum', 'MonthSinceRelease', 'Albums',
    'AppearsOn', 'DiscoveredOn',
]

def get_external(json_file, website):
    """Return the artist's external link for a given website, or -1.

    Args:
        json_file: Artist overview JSON (nested dicts).
        website: Substring to look for in the link entries, e.g. 'instagram'
            or 'wikipedia'.

    Returns:
        The first matching value with a trailing '/?hl=en' removed, or -1 when
        no entry of the external links contains ``website``.
    """
    #The external links are a list of dictionaries
    links = json_file['data']['artist']['profile']['externalLinks']['items']
    for link in links:
        for field in link.values():
            if website in field:
                #Strip the unnecessary language suffix from the URL
                return field.removesuffix('/?hl=en')
    return -1

def duplicate_external(dataframe):
    """Count rows whose Wikipedia and Instagram values are equal.

    Used for error tracking: in this pipeline the two columns are only expected
    to be equal when both hold the missing-link sentinel -1.

    Args:
        dataframe: DataFrame with 'Wikipedia' and 'Instagram' columns.

    Returns:
        Number of rows where both columns hold the same value, as an int.
    """
    #Compare the columns of the DataFrame that was passed in (the previous
    #version read a global variable and ignored its argument).
    same_link = dataframe['Wikipedia'] == dataframe['Instagram']
    return int(same_link.sum())

def earliest_album_release(json_file):
    """Return the release date of the artist's earliest album.

    Args:
        json_file: Artist overview JSON (nested dicts).

    Returns:
        The earliest release date formatted as 'YYYY-MM-DD', or -1 when the
        artist has no albums. A missing month is treated as 12 and a missing
        day as 30.
    """
    albums = json_file['data']['artist']['discography']['albums']
    items = albums['items']
    #Also guard against an empty item list, which would make min() fail
    if albums['totalCount'] == 0 or not items:
        return -1

    release_dates = []
    #Some artists have multiple albums, so every album's first release is inspected
    for album in items:
        date = album['releases']['items'][0]['date']
        month = 12 if date['month'] is None else date['month']
        day = 30 if date['day'] is None else date['day']
        #Tuples compare element by element, i.e. chronologically
        release_dates.append((date['year'], month, day))

    year, month, day = min(release_dates)
    return f'{year}-{month:02d}-{day:02d}'

def diff_month(start, finish='2022-08-23'):
    """Return the number of calendar months between two dates.

    Args:
        start: Date string in the format YYYY-MM-DD, or -1 when there is no
            album release.
        finish: Anchor date in the format YYYY-MM-DD. Defaults to '2022-08-23'.

    Returns:
        12 * year difference + month difference (days are ignored), or -1 when
        ``start`` is -1.
    """
    if start == -1:
        return -1 #If no album release, return -1
    year_gap = int(finish[:4]) - int(start[:4])
    month_gap = int(finish[5:7]) - int(start[5:7])
    return year_gap*12 + month_gap

def toptrack(json_file):
    """Return the highest play count among the artist's top tracks.

    Args:
        json_file: Artist overview JSON (nested dicts).

    Returns:
        The largest play count as an int, or -1 when the artist has no top tracks.
    """
    tracks = json_file['data']['artist']['discography']['topTracks']['items']
    if not tracks:
        return -1
    #Play counts are stored as strings, so they are converted before comparing
    return max([int(entry['track']['playcount']) for entry in tracks])

def artist_data(json_file):
    """Build one result row for an artist.

    Args:
        json_file: Artist overview JSON (nested dicts).

    Returns:
        A list with 13 values: name, Spotify ID, Instagram link, Wikipedia
        link, followers, monthly listeners, number of singles, top track plays,
        earliest album date, months since that release, number of albums,
        appears-on count and discovered-on count.
    """
    artist = json_file['data']['artist']
    row = [
        artist['profile']['name'], #Profile name
        artist['id'], #Spotify ID
        get_external(json_file, 'instagram'), #Instagram link
        get_external(json_file, 'wikipedia'), #Wikipedia link
        artist['stats']['followers'], #SpotifyFollowers
        artist['stats']['monthlyListeners'], #MonthlyListeners
        artist['discography']['singles']['totalCount'], #Singles
        toptrack(json_file), #TopTrackPlays
    ]
    first_album = earliest_album_release(json_file)
    row.append(first_album) #EarliestAlbum
    row.append(diff_month(first_album)) #MonthSinceRelease
    row.append(artist['discography']['albums']['totalCount']) #Albums
    #AppearsOn and DiscoveredOn counts
    for section in ('appearsOn', 'discoveredOn'):
        row.append(artist['relatedContent'][section]['totalCount'])
    return row

result = pd.DataFrame(columns = artist_data_indices) #DataFrame for all artists' data with column names

error_warnings = 0 #Number of API responses that contained an error

for i, json_file in enumerate(json_list):
    #Some API pulls result in errors, so we keep track of these and skip them in our iterations
    if ('error' in json_file) or ('errors' in json_file):
        error_warnings += 1
        continue
    if i%1000 == 0:
        #i is the position in json_list (the previous message printed i*1000)
        print(f'Processed {i} of {len(json_list)} API responses')
    result.loc[i] = artist_data(json_file)

tempvar = len(result) #For error tracking
#Drop rows of artists whose most played top track has fewer than 20000 plays
result = result.drop(result[(result.TopTrackPlays < 20000)].index)
error_nonArtists = tempvar - len(result) #Number of artists dropped by the play-count filter
error_duplicatelinks = duplicate_external(result) #Amount of overlapping missing external links
tempvar = len(result)
result = result.drop(result[(result.Instagram == -1) | (result.Wikipedia == -1)].index) #Drops missing external links
error_missingExternal = tempvar - len(result)
result = result.reset_index(drop=True) #Reset index without keeping the old one as a column
instagram_list = list(result.Instagram) #Extract instagram links
print(f'Number of artists pulled: {id_len}')
print(f'Number of duplicate artists: {error_duplicates}')
print(f'Number of API pulls resulting in error: {error_warnings}')
#The label now matches the filter that is actually applied above
print(f'Number of artists with a top track below 20000 plays dropped: {error_nonArtists}')
print(f'Number of missing external links: {error_missingExternal} ({error_duplicatelinks} overlapping Wikipedia and Instagram missing links)')
#Every non-error response added one row and both filters removed rows, so the
#remaining row count is the resulting amount of artists
print(f'Resulting amount of artists: {len(result)}')
result.to_csv('API.result.csv', index = False)
result

# Section 4: Wikipedia Scraper

In [None]:
from bs4 import BeautifulSoup
import re
import pandas as pd

#Load the Wikipedia links of the artists stored by the API pull
wiki_list = pd.read_csv(r'API.result.csv')['Wikipedia']

# Scraping the infobox of each Wikipedia page (implemented as the wikiscrape function below)

def wikiscrape(wiki_list):
    """Scrape origin, first genre and active years from Wikipedia infoboxes.

    Args:
        wiki_list: Iterable of Wikipedia page URLs.

    Returns:
        A DataFrame with columns 0-3 holding the URL, the origin, the first
        listed genre and the raw 'Years active' text. A value that cannot be
        extracted is stored as -1. Pages for which no infobox table can be
        read are skipped entirely.
    """
    df = pd.DataFrame(columns=[0,1,2,3]) #Empty dataframe that fixes the column layout
    frames = [] #One single-row DataFrame per scraped page, concatenated once at the end

    for i,x in enumerate(wiki_list):
        try:
            infoboxes = pd.read_html(x, index_col=0, attrs={"class":"infobox"})
        except Exception: #If no infobox can be read from the wikipage, skip this iteration
            continue

        data = [x] #Set first element of list to wikipedia link
        #In the infobox, info about origin is either labeled as 'Origin' or 'Born'
        try: #Tries Origin, this is the expression when looking up a band
            test = infoboxes[0].xs('Origin').values[0]
            test = test.split("[")[0] #Remove suffix
            test = re.sub(r'^.+, ([^/]+)$', r'\1', test) #Takes everything after ', ' to get only country
            data.append(test) #Appends origin values
        except Exception: #Tries born, this is the expression when looking up a single artist
            try:
                test = infoboxes[0].xs('Born').values[0]
                test = re.sub(r'^.+\)([^/]+)$', r'\1', test) #Takes everything after ')'
                test = re.sub(r'^.+\d([^/]+)$', r'\1', test) #Takes everything after the last number
                test = re.sub(r'^.+]([^/]+)$', r'\1', test) #Takes everything after ']'
                test = test.split("[")[0] #Remove suffix
                test = re.sub(r'^.+, ([^/]+)$', r'\1', test) #Takes everything after ', ' to get only country
                data.append(test) #Appends born values
            except Exception:
                data.append(-1)

        #Reset for every page so that a page without a 'Genres' row does not
        #inherit the genre found for the previous artist
        genres = -1
        try: #This part of the code secures that we only take the first genre mentioned on Wiki for each artist
            response = requests.get(x, headers={'name':'Albert Wiborg','email':'ptd207@alumni.ku.dk'})
            soup = BeautifulSoup(response.content, 'lxml')
            for row in soup.table.tbody: #Loops over the rows in the table on the wikipage
                if 'Genres' in row.text: #If a row contains 'Genres' in the text
                    genres = row.find_all("li") #Sets list equal to all genres in the row
                    if len(genres) == 0: #If no genres was found (could be due to genres being links)
                        genres = row.find_all("a") #We set genres equal to the links in the row
                    genres = genres[0].text #We only want the first genre
                    genres = genres.split(',')[0] #Additional argument to remove subsequent genres
                    genres = genres.strip('[]') #Remove source-suffixes
            data.append(genres)
        except Exception:
            data.append(-1)

        try:
            test = str(infoboxes[0].xs('Years active').values[0])
            data.append(test)
        except Exception:
            data.append(-1)

        #Converting the list from a long to a wide (single-row) dataset
        frames.append(pd.DataFrame(data).transpose())
        print(f'try: {i+1} out of {len(wiki_list)}')

    #A single concatenation avoids copying the growing DataFrame on every page
    return pd.concat([df, *frames])
#Scrape every Wikipedia link and store the raw output; the file is read back
#in the next cell for cleaning
df = wikiscrape(wiki_list)
df.to_csv('Wiki_data_Raw.csv')

In [None]:
import numpy as np

df = pd.read_csv(r'Wiki_data_Raw.csv')
df = df.iloc[:,1:] #Drop the first column (the index written by to_csv)
df = df.rename({'0': 'Wikipedia', '1': 'Origin', '2' : 'Genre' , '3': 'Years_Active' }, axis='columns')

# Calculating the active years as the difference between 2022 and the first year of the career
# Removing everything until the first digit of the string. This is done on the
# whole column at once, because assigning through df.iloc[i][3] may write to a
# temporary copy and leave the DataFrame unchanged.
df['Years_Active'] = df['Years_Active'].astype(str).str.replace(r'^\D+(\d)', r'\1', regex=True)

# Keep the first four characters (the start year); text that is not a number becomes NaN
df['Years_Active'] = pd.to_numeric(df['Years_Active'].str[:4], errors='coerce')
# The missing-value marker '-1' has been reduced to 1 by the step above, so those rows are dropped
df = df.drop(df[df['Years_Active'] == 1].index)
# Missing start years are marked with -1 (comparing a numeric column with the
# string 'NaN' never detects missing values, so notna() is used)
df['Years_Active'] = np.where(df['Years_Active'].notna(), 2022 - df['Years_Active'], -1)

# Cleaning the genre column from references placed on wiki
df['Genre'] = df['Genre'].str.split('[').str[0]
#Making everything lowercase
df['Genre'] = df['Genre'].str.lower()

df['Genre'] = df['Genre'].map(str) # Converting to a string so that substring tests work on every value

def check_genre(genre):
    """Map a genre string to the first main genre it contains.

    The main genres are tested in a fixed order, so e.g. 'hip hop' wins over
    'pop' when both occur in the string.

    Args:
        genre: Genre name as a string.

    Returns:
        The first main genre that is a substring of ``genre``, or None when
        none of them occurs.
    """
    main_genres = ('hip hop', 'pop', 'r&b', 'rock', 'country', 'electronic music', 'dance music', 'folk')
    return next((candidate for candidate in main_genres if candidate in genre), None)

# Classification rules, tested from top to bottom; the first rule that matches
# decides. Each rule is (main genre, substrings that may occur anywhere in the
# genre, names that must equal the genre exactly).
_GENRE_RULES = (
    ('hip hop',
     ('rap', 'hip-hop'),
     ('trap', 'sampledelia', 'hiplife', 'freestyle', 'nerdcore', 'igbo highlife',
      'trip hop', 'grime', 'urbano music')),
    ('dance music',
     ('eurodance', 'dance', 'salsa', 'afrobeat'),
     ('reggaeton', 'reggaetón', 'reggae', 'reggae fusion', 'roots reggae', 'bachata',
      'disco', 'funk', 'urbano', 'electro', 'ballad', 'son', 'idm', 'nu-disco',
      'kizomba', 'duranguense', 'son cubano')),
    ('rock',
     ('metal', 'punk', 'grunge'),
     ("black 'n' roll", 'screamo', 'melodic hardcore', 'indie folk', 'post-hardcore',
      'emo', 'alternative', 'deathcore', 'crossover thrash', 'dark ambient', 'grupero',
      'slowcore', 'powerviolence', 'vada vada', 'indie')),
    ('r&b',
     ('soul', 'jazz', 'blues'),
     ('tejano', 'stride', 'ska', 'latin ballad', 'hard bop', '2 tone')),
    ('electronic music',
     ('house', 'trance', 'ambient', 'electronic'),
     ('techno', 'edm', 'future bass', 'progressive trance', 'dubstep', 'hardstyle',
      'ebm', 'glitch', 'drum and bass', 'big beat', 'aggrotech', 'experimental',
      'future funk', 'tribal-guarachero', 'kwaito', 'banku', 'indietronica', 'new-age',
      'synthwave', 'video game music', 'glitch hop', 'amapiano', 'gqom', 'instrumental')),
    ('country',
     (),
     ('americana', 'gulf and western', 'red dirt', 'old-time')),
    ('pop',
     ('minimal',),
     ('ethiopian music', 'vocal', 'new wave', 'novelty', 'chillwave', 'acoustic',
      'ethereal wave', 'avant-garde', 'playback singing', "children's", 'disney music',
      'latin', "children's music")),
    ('folk',
     ('film', 'contemporary', 'worship', 'gospel', 'irish'),
     ('musical theatre', 'broadway', 'bollywood', 'ccm', 'vallenato', 'punjabi',
      'modern laika', 'qawwali', 'norteño', 'sea shanties', 'world', 'opm', 'a cappella',
      'opera', 'classical', 'classical crossover', 'indian classical music', 'soca',
      'new flamenco', 'fado', 'roots', 'canadiana', 'bhangra', 'joik', 'sufi')),
)


def check_genre_manually(genre):
    """Classify a Wikipedia genre into one of the main genres.

    The hand-written rules are tried in order (hip hop, dance music, rock, r&b,
    electronic music, country, pop, folk). When no rule matches, the decision
    is delegated to check_genre.

    Args:
        genre: Genre name as a (lowercase) string.

    Returns:
        The main genre of the first matching rule, otherwise the result of
        check_genre(genre).
    """
    for main_genre, fragments, exact_names in _GENRE_RULES:
        if any(fragment in genre for fragment in fragments) or genre in exact_names:
            return main_genre
    return check_genre(genre)

# Classify every genre; genres that no rule recognises keep their original name
df['Main.Genre'] = df['Genre'].apply(check_genre_manually)
df['Main.Genre'] = df['Main.Genre'].fillna(value = df['Genre'])

# Dropping rows without a usable genre ('nan', empty string or the '-1' marker)
df = df.drop(df[(df['Main.Genre'] == 'nan') | (df['Main.Genre'] == '') | (df['Main.Genre']=='-1')].index)
# Overview of the genre distribution after manipulation
y = df['Main.Genre']
y2 = y.value_counts()
# Series.iteritems() was removed in pandas 2.0; items() yields the same (label, value) pairs
for ind,val in y2.items():
    print(ind,val)

# Cleaning Origin column

# Replacing american states with "United States"
state_names=["Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming"]

# One pass over the column: an origin that mentions a state becomes 'United States'.
# Values that are not strings (e.g. missing values) are left untouched instead of raising.
mentions_state = df['Origin'].map(
    lambda origin: isinstance(origin, str) and any(state in origin for state in state_names)
).astype(bool)
df['Origin'] = np.where(mentions_state, 'United States', df['Origin'])

# Replace U.S. spellings and other known US-only values with united states
us_aliases = ['Brigham Young University', 'US', 'U.S.',
              "U.S. JIDEducationStephenson High SchoolHampton University (no degree", 'U.S',
              'USA', 'CA', 'Los Angeles', 'S.D.', 'OK', 'IL', 'Howard University']
df['Origin'] = np.where(df['Origin'].isin(us_aliases), 'United States', df['Origin'])

# If any of the countries is mentioned in the Origin column, replace the value with that country
country_names = ['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua & Deps', 'Argentina', 'Armenia', 'Australia', 'Austria',\
'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia',\
'Bosnia Herzegovina', 'Botswana', 'Brazil', 'Brunei', 'Bulgaria', 'Burkina', 'Burundi', 'Cambodia', 'Cameroon', 'Canada', \
'Cape Verde', 'Central African Rep', 'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Congo', 'Congo', 'Costa Rica', 'Croatia',\
'Cuba', 'Cyprus', 'Czech Republic', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'East Timor', 'Ecuador', 'England', 'Egypt',\
'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Ethiopia', 'Fiji', 'Finland', 'France', 'Gabon', 'Gambia', 'Georgia', \
'Germany', 'Ghana', 'Greece', 'Grenada', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti', 'Honduras', 'Hungary', 'Iceland',\
'India', 'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Israel', 'Italy', 'Ivory Coast', 'Jamaica', 'Japan', 'Jordan', 'Kazakhstan',\
'Kenya', 'Kiribati', 'Korea North', 'Korea South', 'Kosovo', 'Kuwait', 'Kyrgyzstan', 'Laos', 'Latvia', 'Lebanon', 'Lesotho',\
'Liberia', 'Libya', 'Liechtenstein', 'Lithuania', 'Luxembourg', 'Macedonia', 'Madagascar', 'Malawi', 'Malaysia', 'Maldives',\
'Mali', 'Malta', 'Marshall Islands', 'Mauritania', 'Mauritius', 'Mexico', 'Micronesia', 'Moldova', 'Monaco', 'Mongolia', \
'Montenegro', 'Morocco', 'Mozambique', 'Myanmar', 'Namibia', 'Nauru', 'Nepal', 'Netherlands', 'New Zealand', 'Nicaragua',\
'Niger', 'Nigeria', 'Norway', 'Oman', 'Pakistan', 'Palau', 'Panama', 'Papua New Guinea', 'Paraguay', 'Peru', 'Philippines', \
'Poland', 'Portugal', 'Qatar', 'Romania', 'Russia', 'Rwanda', 'St Kitts & Nevis', 'St Lucia', 'Saint Vincent & the Grenadines',\
'Samoa', 'San Marino', 'Sao Tome & Principe', 'Saudi Arabia', 'Senegal', 'Serbia', 'Seychelles', 'Sierra Leone', 'Singapore',\
'Slovakia', 'Slovenia', 'Solomon Islands', 'Somalia', 'South Africa', 'South Sudan', 'Spain', 'Sri Lanka', 'Sudan', 'Suriname', \
'Swaziland', 'Sweden', 'Switzerland', 'Syria', 'Taiwan', 'Tajikistan', 'Tanzania', 'Thailand', 'Togo', 'Tonga', 'Trinidad & Tobago',\
'Tunisia', 'Turkey', 'Turkmenistan', 'Tuvalu', 'Uganda', 'Ukraine', 'United Arab Emirates', 'United Kingdom', 'United States',\
'Uruguay', 'Uzbekistan', 'Vanuatu', 'Vatican City', 'Venezuela', 'Vietnam', 'Yemen', 'Zambia', 'Zimbabwe']

def _most_specific_country(origin):
    # Return the country mentioned in origin, or origin itself when none is
    # mentioned or when origin is not a string (e.g. a missing value).
    if not isinstance(origin, str):
        return origin
    mentioned = [name for name in country_names if name in origin]
    for name in mentioned:
        # Skip a name that is only found because it is part of a longer
        # mentioned name, e.g. 'Niger' inside 'Nigeria' or 'Guinea' inside
        # 'Equatorial Guinea'. Otherwise the first name in list order wins.
        if not any(name != other and name in other for other in mentioned):
            return name
    return origin

df['Origin'] = df['Origin'].map(_most_specific_country)


            
# Manual corrections, applied one after the other: every origin listed on the
# left is replaced by the value on the right. Yugoslavia is mapped to '-1' so
# that it is dropped together with the other unusable rows below.
origin_corrections = [
    (['Punjab', 'Kerala'], 'India'),
    (['England', 'London', 'Pembury', 'UK', 'Wales', 'Wiltshire'], 'United Kingdom'),
    (['Soviet Union'], 'Russia'),
    (['Rusia'], 'Russia'),
    (["St Kevin's CollegeMonash University"], 'Australia'),
    (["FPR Yugoslavia", "SFR Yugoslavia"], 'Yugoslavia'),
    (['Yugoslavia'], '-1'),
    (['Toronto', 'AB', 'British Columbia'], 'Canada'),
]
for wrong_values, corrected in origin_corrections:
    df['Origin'] = np.where(df['Origin'].isin(wrong_values), corrected, df['Origin'])

# Dropping rows that are empty, labeled with -1 or known to hold a non-location value
unusable = df['Origin'].isin(['-1', 'Ziwerekoru Fumudoh', ''])
df = df.drop(df[unusable].index)
# Remove surrounding brackets, then drop what is left empty or 'nan'
df['Origin'] = df['Origin'].str.strip('()[]')
leftover = df['Origin'].isin(['nan', ''])
df = df.drop(df[leftover].index)
df

In [None]:
#Create dummies for english and spanish speaking countries
english_speaking = ['United States', 'United Kingdom', 'Australia', 'Canada', 'New Zealand']
spanish_speaking = ['Spain', 'Argentina', 'Bolivia', 'Chile', 'Colombia', 'Costa Rica', 'Cuba',
                    'Dominican Republic', 'Ecuador', 'El Salvador', 'Equatorial Guinea', 'Guatemala',
                    'Honduras', 'Mexico', 'Nicaragua', 'Panama', 'Paraguay', 'Peru', 'Puerto Rico',
                    'Uruguay', 'Venezuela']
df['dum_eng'] = np.where(df['Origin'].isin(english_speaking), 1, 0)
df['dum_spa'] = np.where(df['Origin'].isin(spanish_speaking), 1, 0)
#Create dummies for country and genres for use in machine learning
genre_dummies = {
    'dum_country': 'country',
    'dum_dance': 'dance music',
    'dum_electronic': 'electronic music',
    'dum_folk': 'folk',
    'dum_hiphop': 'hip hop',
    'dum_r&b': 'r&b',
    #The rock dummy was previously written to 'dum_r&b', overwriting the r&b dummy
    'dum_rock': 'rock',
}
for column, main_genre in genre_dummies.items():
    df[column] = np.where(df['Main.Genre'] == main_genre, 1, 0)
df.to_csv(r'Wikipedia.result.csv')
df

# Section 4: Instagram Scraper

In [None]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time

# Open Instagram in a Chrome session driven by Selenium and log in, so that the
# profile pages can be loaded afterwards. The steps depend on their order and on
# the fixed waiting times between them.
url =  'https://www.instagram.com'

driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get(url)

time.sleep(3) # Wait for the page to load
# Accepting cookies
# NOTE(review): '.bIiDR' is a generated CSS class name; it presumably changes when Instagram updates its page
cookie = driver.find_element(By.CSS_SELECTOR, '.bIiDR')
cookie.click()

time.sleep(3)
# Logging into instagram
# NOTE(review): the account credentials are hard-coded in the source; consider loading them from configuration
username = 'oscarkarlsson0804'
code = 'isds123!'

username_input = driver.find_element(By.CSS_SELECTOR, "input[name='username']")
password_input = driver.find_element(By.CSS_SELECTOR, "input[name='password']")

username_input.send_keys(username)
password_input.send_keys(code)

login_button = driver.find_element(By.XPATH, "//button[@type='submit']")
login_button.click()

time.sleep(5)
# Presumably the dialog about saving the login information that appears after logging in - TODO confirm
safe_login = driver.find_element(By.CSS_SELECTOR, '.y3zKF')
safe_login.click()

time.sleep(5)
# Presumably the button that declines notifications - TODO confirm
noti_no = driver.find_element(By.CSS_SELECTOR, '._a9_1')
noti_no.click()

# Load the Instagram links collected by the API pull and drop missing ones
df = pd.read_csv('API.result.csv')
insta_list = df['Instagram']
insta_list = pd.DataFrame(insta_list)
insta_list = insta_list.drop(insta_list[insta_list.Instagram == '-1'].index)

follower_list=[] # One entry per link: the extracted number as a string, or -1 on failure
col = ['Instagram', 'I_Followers']
follower_df=pd.DataFrame()
import re
for i in insta_list['Instagram']:
    try:
        driver.get(i)
        time.sleep(5) # Wait for the profile page to render
        soup2 = BeautifulSoup(driver.page_source, 'lxml')
        followers = soup2.find_all('div', class_ = '_aacl _aacp _aacu _aacx _aad6 _aade')
        followers = str(followers[1]) # Second matching element, as HTML text
        followers = followers.replace(',','') # Remove thousands separators
        All_digits = re.findall(r'\d+', followers)
        follower_count = All_digits[2] # Third group of digits in the HTML
    except Exception:
        # Loading and parsing are both covered, so a page with an unexpected
        # layout is recorded as -1 instead of aborting the whole scrape
        follower_list.append(-1)
        continue
    follower_list.append(follower_count)

insta_list_df = insta_list.reset_index(drop=True)
insta_list_df['I_Followers']=follower_list

insta_list_df.to_csv('Instagram.result.csv')

# Section 4: File merge

In [None]:
#This section merges the different DataFrames that we have created in the 3 other sections
final=pd.read_csv('API.result.csv')
Instagram_df=pd.read_csv('Instagram.result.csv')
Wiki_df=pd.read_csv('Wikipedia.result.csv')
final=final.merge(Instagram_df, how='left', on='Instagram' )
final=final.merge(Wiki_df, how='left', on='Wikipedia' )

#Drop irrelevant columns
final = final.drop(columns='Unnamed: 0_y')
final = final.dropna()
#Drop rows that carry the missing-value marker. After the round trip through
#CSV the marker is the number -1 in numeric columns but the text '-1' in text
#columns (e.g. EarliestAlbum), so both forms are checked.
for column in ['I_Followers', 'Origin', 'Genre', 'Years_Active', 'EarliestAlbum']:
    is_missing = (final[column] == -1) | (final[column] == '-1')
    final = final.drop(final[is_missing].index)
final = final.drop_duplicates(subset = 'ID', keep = 'first', ignore_index = True)
final = final.reset_index()

final.to_csv('final.csv')
final

# Country-map

In [56]:
df_wiki = pd.read_csv('Wikipedia.result.csv')
del df_wiki['Unnamed: 0']

#value_counts returns a Series whose index holds the countries and whose values
#hold the number of artists, sorted from most to least frequent.
#(Series.iteritems() was removed in pandas 2.0, so the Series is converted directly.)
origin_counts = df_wiki['Origin'].value_counts()
countries = origin_counts.index.tolist()
counts = origin_counts.tolist()

def country_genre(df2):
    """Return the most common main genre of every country.

    Args:
        df2: DataFrame with 'Origin' and 'Main.Genre' columns. It is not modified.

    Returns:
        A list with one genre per unique country, in the same order as
        ``df2['Origin'].value_counts()`` (most frequent country first), so that
        it lines up with the country and count lists built from that Series.
        A country without any genre value gets None.
    """
    #Number of artists for every (country, genre) combination
    pair_counts = df2.groupby(['Origin', 'Main.Genre']).size()
    countries_with_genre = set(pair_counts.index.get_level_values(0))

    genres = [] #List to contain genres
    for country in df2['Origin'].value_counts().index:
        if country in countries_with_genre:
            #Genre with the highest count within this country
            genres.append(pair_counts[country].idxmax())
        else:
            genres.append(None)
    return genres

# Combine, per country, the artist count and the genre list returned by country_genre
# into one table and store it on disk without the row index.
most_popular_genres = country_genre(df_wiki)
country_map = pd.DataFrame({
    "Country": countries,
    "Count": counts,
    "Most popular Genre": most_popular_genres,
})
country_map.to_csv("country_map.csv", index=False)

# Country map plot

In [None]:
import plotly.express as px #download plotly package
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import kaleido #pip install kaleido
import os
import geopandas as gpd #pip install geopandas

final = pd.read_csv('final.csv')

# Per-country table: origin, number of observations and most frequent genre.
df = pd.read_csv('country_map.csv')

# Rename two countries (presumably to the spelling of the geo data set's 'name' column -- confirm).
country_renames = {
    'United States': 'United States of America',
    'Czech Republic': 'Czechia',
}
for old_name, new_name in country_renames.items():
    df['Country'] = np.where(df['Country'] == old_name, new_name, df['Country'])

# Replace the raw genre names with the labels printed on the map.
genre_labels = {
    'electronic music': 'EDM',
    'dance music': 'Dance',
    'pop': 'Pop',
    'r&b': 'R&B',
    'rock': 'Rock',
    'folk': 'Folk',
    'country': 'Country',
    'hip hop': 'Rap',
}
for raw_genre, label in genre_labels.items():
    df['Most popular Genre'] = np.where(df['Most popular Genre'] == raw_genre, label, df['Most popular Genre'])

# Geo location data shipped with geopandas.
# NOTE(review): gpd.datasets is deprecated in recent geopandas releases -- confirm the installed version.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

# Keep only countries present in both tables; colour by log(count) so large countries do not dominate.
final = world.merge(df, how='inner', left_on='name', right_on='Country')
final['Count2'] = np.log(final['Count'])
final = final.sort_values(by='Count', ascending=False)

fig, ax = plt.subplots(figsize=(17, 8))
final.plot(column="Count2", ax=ax, legend=False, cmap='Greens')
plt.axis('off')

# Write each country's most popular genre at the centroid of its geometry.
for _, country_row in final.iterrows():
    ax.annotate(text=country_row['Most popular Genre'],
                xy=country_row.geometry.centroid.coords[0],
                horizontalalignment='center',
                fontsize=6)
plt.show()

# Plots of genres

In [None]:
import plotly.express as px #download plotly package using: $ pip install plotly==5.10.0
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('final.csv', index_col=0)

# Keep only origins with more than 5 artists so the colour legend stays readable.
# The counts are filtered directly on the value_counts() Series: going through
# reset_index() depends on column names that changed in pandas 2.0
# ('index'/'Origin' became 'Origin'/'count'), which breaks the comparison.
origin_counts = df['Origin'].value_counts()
removals = origin_counts[origin_counts > 5].index.values  # names of the origins that are kept
origin_df = df[df['Origin'].isin(removals)]

# Histogram of main genres, one colour per origin country.
fig9 = px.histogram(origin_df, x="Main.Genre", color="Origin", labels={'Main.Genre': 'Main genres'})
fig9.update_layout(width=1000, height=700, bargap=0.05)

In [None]:
# One sub-table per main genre, taken before the filters below are applied.
main_genre = df['Main.Genre']
RandB = df[main_genre == 'r&b']
DanceMusic = df[main_genre == 'dance music']
Pop = df[main_genre == 'pop']
Rock = df[main_genre == 'rock']
Folk = df[main_genre == 'folk']
Country = df[main_genre == 'country']
# NOTE(review): the country-map cell spells this genre 'electronic music' -- confirm 'electronic' occurs in the data.
Electronic = df[main_genre == 'electronic']
HipHip = df[main_genre == 'hip hop']

# Restrict the data set: fewer than 80 albums, fewer than 60 million monthly
# listeners, and Years_Active different from 2021.
rows_to_keep = (df.Albums < 80) & (df.MonthlyListeners < 60000000) & (df.Years_Active != 2021)
df = df[rows_to_keep]

# Share of the (filtered) data set per main genre, in percent.
fig19 = px.histogram(df,
                     x="Main.Genre",
                     histnorm='percent',
                     color_discrete_sequence=['seagreen'],
                     nbins=40,
                     text_auto=True)
fig19.update_layout(xaxis_title="Main Genres",
                    yaxis_title="Pct. of the data set",
                    width=1300,
                    height=800,
                    bargap=0.05)
fig19.show()

In [None]:
# Distribution of monthly listeners in percent of the data set (50 bins).
# The update_layout call is the cell's last expression, so the figure is its output.
fig19 = px.histogram(df,
                     x="MonthlyListeners",
                     histnorm='percent',
                     color_discrete_sequence=['seagreen'],
                     nbins=50,
                     text_auto=True)
fig19.update_layout(xaxis_title="MonthlyListeners",
                    yaxis_title="Pct. of the data set",
                    width=1300,
                    height=800,
                    bargap=0.05)

In [None]:
#Figure 3 plots
# One scatter plot with OLS trendline per explanatory variable, each against MonthlyListeners.
figure3_features = ['TopTrackPlays', 'AppearsOn', 'MonthSinceRelease', 'Albums', 'I_Followers', 'Years_Active']

figure3_plots = []
for feature in figure3_features:
    scatter = px.scatter(df, x=feature, y='MonthlyListeners', trendline="ols", color_discrete_sequence=["green"])
    scatter.show()
    figure3_plots.append(scatter)

# Keep the individual figure names available, in the same order as the features above.
fig1, fig2, fig3, fig4, fig5, fig6 = figure3_plots

In [None]:
# TopTrackPlays vs. MonthlyListeners with an OLS trendline, one facet column per main genre.
fig16 = px.scatter(
    df,
    x='TopTrackPlays',
    y='MonthlyListeners',
    trendline="ols",
    facet_col='Main.Genre',
    facet_col_wrap=9,
    facet_row_spacing=0.062500,
    color_discrete_sequence=["green"],
)
fig16.update_layout(width=1300, height=500, bargap=0.05)
fig16.show()

In [None]:
#Figure 4 plots
# Sub-tables for the three countries shown in the polar plots.
origin_column = origin_df['Origin']
UnitedStates = origin_df[origin_column == 'United States']
UnitedKingdom = origin_df[origin_column == 'United Kingdom']
Australia = origin_df[origin_column == 'Australia']

# One polar scatter per country: genre on the angular axis, monthly listeners on the radial axis.
polar_plots = []
for country_df in (UnitedStates, UnitedKingdom, Australia):
    polar = px.scatter_polar(country_df, theta='Main.Genre', r='MonthlyListeners', color='Origin')
    polar.update_layout(width=800, height=800, bargap=0.05)
    polar.show()
    polar_plots.append(polar)

fig6, fig7, fig8 = polar_plots

# Section 6: Machine learning

In [None]:
# Importing relevant packages
import math
import matplotlib #install matplotlib
import matplotlib.pyplot as plt

# LOAD FROM SCIKIT-LEARN
from sklearn.linear_model import LogisticRegression #install scikit-learn
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_squared_error as mse

DATA SPLIT

In [59]:
final = pd.read_csv('final.csv')

# Drop rows whose follower count is 0 or -1 before taking the logarithm below.
# The two conditions must be combined with OR: a value can never be 0 AND -1 at
# once, so the previous '&' mask matched nothing and log(0) = -inf could reach
# the models.
final = final.drop(final[(final['I_Followers'] == 0) | (final['I_Followers'] == -1)].index)

# Exclude artists with more than 20 million monthly listeners.
final = final.drop(final[(final['MonthlyListeners'] > 20000000)].index)

final['log_I_Followers'] = np.log(final['I_Followers'])

# Shuffle the rows; the seed makes the later train/test splits reproducible between runs.
final = final.sample(frac=1, random_state=14)

In [60]:
# DEFINE FEATURES AND TARGET
feature_columns = [
    'SpotifyFollowers',
    'TopTrackPlays',
    'AppearsOn',
    'log_I_Followers',
    'dum_eng',
    'dum_spa',
    'dum_country',
    'dum_dance',
    'dum_electronic',
    'dum_folk',
    'dum_hiphop',
    'dum_r&b',
]
X = np.array(final[feature_columns])  # features
y = np.array(final['MonthlyListeners'])  # target

# Hold out one third of the data as test set; the remaining two thirds are the development set.
X_dev, X_test, y_dev, y_test = train_test_split(X, y, test_size=1/3, random_state=14)

# Halve the development set into train and validation parts (one third of all data each).
X_train, X_val, y_train, y_val = train_test_split(X_dev, y_dev, test_size=1/2, random_state=14)

#### Model 1: LASSO

In [62]:
perform = []  # Store performance
lambdas = np.logspace(-4, 4, 50)  # 50 candidate penalties, log-spaced between 1e-4 and 1e4

# The same 10 folds of the development set are used for every candidate lambda.
from sklearn.model_selection import KFold
kfolds = KFold(n_splits=10)
folds = list(kfolds.split(X_dev, y_dev))

# mseCV gets one row per lambda holding the validation MSE of each fold.
mseCV = []
for lambda_ in lambdas:
    mseCV_ = []
    for train_idx, val_idx in folds:
        # Degree-2 polynomial expansion -> standardisation -> LASSO with the current penalty.
        pipe_lassoCV = make_pipeline(PolynomialFeatures(degree=2, include_bias=True),
                                     StandardScaler(),
                                     Lasso(alpha=lambda_, random_state=14))
        X_train, y_train = X_dev[train_idx], y_dev[train_idx]
        X_val, y_val = X_dev[val_idx], y_dev[val_idx]
        pipe_lassoCV.fit(X_train, y_train)
        fold_mse = mse(pipe_lassoCV.predict(X_val), y_val)
        mseCV_.append(fold_mse)
    mseCV.append(mseCV_)

# Rows: lambdas, columns: folds.
lambdaCV = pd.DataFrame(mseCV, index=lambdas)

# The optimal lambda is the one with the smallest mean MSE across folds.
optimal_lambda = lambdaCV.mean(axis=1).nsmallest(1)
lambda_lasso = optimal_lambda.index[0]

# Re-estimate on the entire development set with the chosen penalty (more observations than any single fold).
pipe_lassoCV = make_pipeline(PolynomialFeatures(include_bias=False),
                             StandardScaler(),
                             Lasso(alpha=lambda_lasso, random_state=14))
pipe_lassoCV.fit(X_dev, y_dev)


In [63]:

# 10-fold cross-validation of a degree-3 polynomial LASSO over the lambda grid,
# followed by a refit on the whole development set with the best lambda.

# Outer loop: lambdas
mseCV = []
for lambda_ in lambdas:

    # Inner loop: folds
    mseCV_ = []
    for train_idx, val_idx in folds:

        # Train model and compute MSE on the validation fold
        pipe_lassoCV_3 = make_pipeline(PolynomialFeatures(degree=3, include_bias=True),
                                       StandardScaler(),
                                       Lasso(alpha=lambda_, random_state=14))
        X_train, y_train = X_dev[train_idx], y_dev[train_idx]
        X_val, y_val = X_dev[val_idx], y_dev[val_idx]
        pipe_lassoCV_3.fit(X_train, y_train)
        mseCV_.append(mse(y_val, pipe_lassoCV_3.predict(X_val)))

    # Store result
    mseCV.append(mseCV_)

# Convert to DataFrame (rows: lambdas, columns: folds)
lambdaCV = pd.DataFrame(mseCV, index=lambdas)

# CHOOSE OPTIMAL HYPERPARAMETERS (mean of MSE's across folds)
optimal_lambda = lambdaCV.mean(axis=1).nsmallest(1)
lambda_lasso_3 = optimal_lambda.index[0]

# RETRAIN/RE-ESTIMATE MODEL USING OPTIMAL HYPERPARAMETERS
# degree=3 must be passed explicitly: PolynomialFeatures defaults to degree 2,
# which would silently refit a degree-2 model with a lambda tuned for degree 3.
pipe_lassoCV_3 = make_pipeline(PolynomialFeatures(degree=3, include_bias=False),
                               StandardScaler(),
                               Lasso(alpha=lambda_lasso_3, random_state=14))

# Fit on the entire development set: uses more observations than any single fold
pipe_lassoCV_3.fit(X_dev, y_dev)

#### Model 2: RIDGE w. cross validation

In [64]:
from sklearn.linear_model import Ridge
# 10-fold cross-validation of a degree-2 polynomial ridge regression over the lambda grid.
# mseCV gets one row per lambda holding the validation MSE of each fold.
mseCV = []
for lambda_ in lambdas:
    mseCV_ = []
    for train_idx, val_idx in folds:
        # Degree-2 polynomial expansion -> standardisation -> ridge with the current penalty.
        pipe_ridgeCV = make_pipeline(PolynomialFeatures(degree=2, include_bias=True),
                                     StandardScaler(),
                                     Ridge(alpha=lambda_, random_state=14))
        X_train, y_train = X_dev[train_idx], y_dev[train_idx]
        X_val, y_val = X_dev[val_idx], y_dev[val_idx]
        pipe_ridgeCV.fit(X_train, y_train)
        fold_mse = mse(pipe_ridgeCV.predict(X_val), y_val)
        mseCV_.append(fold_mse)
    mseCV.append(mseCV_)

# Rows: lambdas, columns: folds.
lambdaCV = pd.DataFrame(mseCV, index=lambdas)

# The optimal lambda is the one with the smallest mean MSE across folds.
optimal_lambda = lambdaCV.mean(axis=1).nsmallest(1)
lambda_ridge = optimal_lambda.index[0]

# Re-estimate on the entire development set with the chosen penalty.
pipe_ridgeCV = make_pipeline(PolynomialFeatures(include_bias=False),
                             StandardScaler(),
                             Ridge(alpha=lambda_ridge, random_state=14))
pipe_ridgeCV.fit(X_dev, y_dev)

In [65]:
# 10-fold cross-validation of a degree-3 polynomial ridge regression over the
# lambda grid, followed by a refit on the whole development set.
mseCV = []
for lambda_ in lambdas:

    # Inner loop: folds
    mseCV_ = []
    for train_idx, val_idx in folds:

        # Train model and compute MSE on the validation fold
        pipe_ridgeCV_3 = make_pipeline(PolynomialFeatures(degree=3, include_bias=True),
                                       StandardScaler(),
                                       Ridge(alpha=lambda_, random_state=14))
        X_train, y_train = X_dev[train_idx], y_dev[train_idx]
        X_val, y_val = X_dev[val_idx], y_dev[val_idx]
        pipe_ridgeCV_3.fit(X_train, y_train)
        mseCV_.append(mse(y_val, pipe_ridgeCV_3.predict(X_val)))

    # Store result
    mseCV.append(mseCV_)

# Convert to DataFrame (rows: lambdas, columns: folds)
lambdaCV_3 = pd.DataFrame(mseCV, index=lambdas)

# CHOOSE OPTIMAL HYPERPARAMETERS (mean of MSE's across folds)
optimal_lambda = lambdaCV_3.mean(axis=1).nsmallest(1)
lambda_ridge_3 = optimal_lambda.index[0]

# RETRAIN/RE-ESTIMATE MODEL USING OPTIMAL HYPERPARAMETERS
# The refit is stored in pipe_ridgeCV_3 (not pipe_ridgeCV) with degree=3.
# Otherwise the degree-2 ridge model would be overwritten, and pipe_ridgeCV_3
# would remain the last cross-validation fit (largest lambda, one fold only).
pipe_ridgeCV_3 = make_pipeline(PolynomialFeatures(degree=3, include_bias=False),
                               StandardScaler(),
                               Ridge(alpha=lambda_ridge_3, random_state=14))

# Fit on the entire development set: uses more observations than any single fold
pipe_ridgeCV_3.fit(X_dev, y_dev)

In [None]:
#SCORE
# Root-mean-squared error on the held-out test set for every fitted pipeline.
models = {
    'Lasso CV': pipe_lassoCV,
    'Lasso CV 3': pipe_lassoCV_3,
    'Ridge CV': pipe_ridgeCV,
    'Ridge CV 3': pipe_ridgeCV_3,
}

for name, model in models.items():
    test_mse = mse(model.predict(X_test), y_test)
    score = math.sqrt(test_mse)
    print(name, round(score, 2))
print()

# Penalty selected by cross-validation for each model.
chosen_lambdas = [
    ('Lasso CV Lambda:', lambda_lasso),
    ('Lasso CV Lambda 3:', lambda_lasso_3),
    ('Ridge CV Lambda:', lambda_ridge),
    ('Ridge CV Lambda 3:', lambda_ridge_3),
]
for label, chosen in chosen_lambdas:
    print(label, chosen)

#### Model 3: Elastic net

In [67]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet

# Degree-2 polynomial expansion -> standardisation -> elastic net.
pipe_el = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False),
    StandardScaler(),
    ElasticNet(tol=0.001, random_state=14),
)

# Search jointly over the penalty strength and the L1/L2 mixing ratio
# with 10-fold cross-validation; the search is only defined here, not fitted.
elastic_net_grid = {
    'elasticnet__alpha': lambdas,
    'elasticnet__l1_ratio': np.linspace(0, 1, 20),
}
gs = GridSearchCV(estimator=pipe_el,
                  param_grid=elastic_net_grid,
                  scoring='neg_mean_squared_error',
                  cv=10)

In [68]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet

# Degree-3 polynomial expansion -> standardisation -> elastic net.
pipe_el_3 = make_pipeline(PolynomialFeatures(degree=3, include_bias=False),
                          StandardScaler(),
                          ElasticNet(tol=0.001, random_state=14))

# Grid search over penalty strength and L1/L2 mixing ratio with 10-fold CV.
# The estimator must be the degree-3 pipeline defined above; passing the
# degree-2 pipeline would make this search a duplicate of the degree-2 one.
gs_3 = GridSearchCV(estimator=pipe_el_3,
                    param_grid={'elasticnet__alpha': lambdas,
                                'elasticnet__l1_ratio': np.linspace(0, 1, 20)},
                    scoring='neg_mean_squared_error',
                    cv=10)

In [None]:
#SCORE
# Run both elastic-net grid searches on the development set; fit() returns the fitted search object.
elastic_net = gs.fit(X_dev, y_dev)
elastic_net_3 = gs_3.fit(X_dev, y_dev)

models = {
    'Lasso CV': pipe_lassoCV,
    'Lasso CV 3': pipe_lassoCV_3,
    'Ridge CV': pipe_ridgeCV,
    'Ridge CV 3': pipe_ridgeCV_3,
    'Elastic Net': elastic_net,
    'Elastic Net 3': elastic_net_3,
}

# Root-mean-squared error of every model on the held-out test set.
for name, model in models.items():
    test_mse = mse(model.predict(X_test), y_test)
    score = math.sqrt(test_mse)
    print(name, round(score, 2))
print()

# Hyperparameters selected by cross-validation.
selected_params = [
    ('Lasso CV Lambda:', lambda_lasso),
    ('Ridge CV Lambda:', lambda_ridge),
    ('CV Elastic Net params:', gs.best_params_),
    ('Lasso CV 3 Lambda:', lambda_lasso_3),
    ('Ridge CV 3 Lambda:', lambda_ridge_3),
    ('CV Elastic Net 3 params:', gs_3.best_params_),
]
for label, selected in selected_params:
    print(label, selected)

#### Plots of models

In [70]:
import plotly.express as px #download plotly package
import kaleido
import plotly.graph_objects as go
import os

# Test-set predictions of the ridge pipeline next to the observed values.
y_pred = pipe_ridgeCV.predict(X_test)
predictions = pd.DataFrame(data={'y_test': y_test, 'y_pred': y_pred})
predictions

# Scatter of predicted against observed values, with the y = x line as reference.
fig1 = px.scatter(predictions, x='y_test', y='y_pred')
identity_line = px.line(predictions, x="y_test", y="y_test")
fig2 = identity_line
fig1.add_trace(fig2.data[0])

# Save the figure as a static image.
fig1.write_image("Residual plot.png")

In [None]:
# LEANINGCURVE
from sklearn.model_selection import learning_curve

# Learning curve of the ridge pipeline: 10-fold CV at 5%, 10%, ..., 100% of the training data.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=pipe_ridgeCV,
    X=X_dev,
    y=y_dev,
    train_sizes=np.arange(0.05, 1.05, .05),
    scoring='neg_mean_squared_error',
    cv=10,
)

# Scores are negative MSE, so the sign is flipped to obtain the mean MSE per sample size.
mse_ = pd.DataFrame({'Train': -train_scores.mean(axis=1),
                     'Test': -test_scores.mean(axis=1)})
mse_ = mse_.set_index(pd.Index(train_sizes, name='sample size'))

mse_.head(5)

#Plot 1
# Mean root-mean-squared error across folds for validation and training data.
validation_rmse = np.sqrt(-test_scores.mean(1))
train_rmse = np.sqrt(-train_scores.mean(1))

f_learn, ax = plt.subplots(figsize=(12, 6))
ax.plot(train_sizes, validation_rmse, alpha=0.25, linewidth=2, label='Validation', color='blue')
ax.plot(train_sizes, train_rmse, alpha=0.25, linewidth=2, label='Train', color='orange')

ax.set_title('Mean performance')
ax.set_ylabel('Root-Mean squared error')
ax.legend();

In [None]:
#Plot 2
# Shaded band between the RMSE of the lowest and the highest fold score at every
# sample size (scores are negative MSE, hence the sign flip).
f_learn, ax = plt.subplots(figsize=(12, 7))

shaded_bands = [
    (test_scores, 'Validation', 'blue'),
    (train_scores, 'Train', 'orange'),
]
for fold_scores, band_label, band_color in shaded_bands:
    ax.fill_between(train_sizes,
                    np.sqrt(-fold_scores.min(1)),
                    np.sqrt(-fold_scores.max(1)),
                    alpha=0.25,
                    label=band_label,
                    color=band_color)

ax.set_title('Range of performance (min, max)')
ax.set_ylabel('Root-Mean squared error')
ax.legend();
plt.savefig('Learning.png')

In [None]:
#Change param_name and estimator!

#VALIDATION CURVE
# LOAD FROM SCIKIT-LEARN
from sklearn.model_selection import validation_curve

# Refit the ridge pipeline for every lambda in the grid with 10-fold CV,
# varying only the 'ridge__alpha' parameter.
train_scores, test_scores = validation_curve(
    estimator=pipe_ridgeCV,
    X=X_dev,
    y=y_dev,
    param_name='ridge__alpha',
    param_range=lambdas,
    scoring='neg_mean_squared_error',
    cv=10,
)

# Mean MSE per lambda (scores are negative MSE); print the lambda with the lowest validation MSE.
mse_score = pd.DataFrame({'Train': -train_scores.mean(axis=1),
                          'Validation': -test_scores.mean(axis=1),
                          'lambda': lambdas})
mse_score = mse_score.set_index('lambda')
print(mse_score.Validation.nsmallest(1))

# Plot the RMSE of both curves on a logarithmic lambda axis and save the figure.
np.sqrt(mse_score).plot(logx=True, figsize=(12,7));

plt.savefig('Validation.png')