### Web Scraping

_______________________

#### In this notebook, we will be scraping two individual lists:
1. The current Top 100 songs.
2. The Top 100 songs in rock and pop from the 50s until present.

##### This will be done by using beautiful soup, and the resulting dataframe will serve as our top 200 hundred hot songs

_____________________

#### 1. Import Libraries

In [73]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import random
from pandas.io.json import json_normalize
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA    
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.cluster import KMeans
import numpy as np

from random import randint
from time import sleep

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

#### 2. Scraping Top 100 songs

In [8]:
#1. Attribute url
url = "https://www.popvortex.com/music/charts/top-100-songs.php"

In [9]:
# 2. Download html with a get request
response = requests.get(url)
response.status_code # 200 status code means OK!

200

######### '200' response, so we will be able to scrape the page via beatifulsoup

In [10]:
# 3. Parse html (create the 'soup')
soup = BeautifulSoup(response.content, "html.parser")
#soup

In [11]:
# 4. Divide the different criteria into a different lists to be put into our final dataframe

artist = []
song = []
genre = []
year = []

#num_iter = len("body > div.container > div:nth-child(4) > div.col-xs-12.col-md-8 > div.chart-wrapper > div.feed-item")

songart = soup.select("body > div.container > div:nth-child(4) > div.col-xs-12.col-md-8 > div.chart-wrapper > div.feed-item")
genlist = soup.select("body > div.container > div:nth-child(4) > div.col-xs-12.col-md-8 > div.chart-wrapper > div.feed-item > div.chart-content")
yearlist = soup.select("body > div.container > div:nth-child(4) > div.col-xs-12.col-md-8 > div.chart-wrapper > div.feed-item > div.chart-content > ul > li:nth-child(2)")

for i in range(100):
    artist.append(songart[i].em.get_text())
    song.append(songart[i].cite.get_text())
    year.append(yearlist[i].get_text())    
    try:
        genre.append(genlist[i].ul.li.a.get_text())
    except:
        genre.append('Unknown')

IndexError: list index out of range

In [12]:
len(soup.select("body > div.container > div:nth-child(4) > div.col-xs-12.col-md-8 > div.chart-wrapper > div.feed-item"))

96

In [13]:
# 5. Attribute the lists to a dataframe

top100 = pd.DataFrame({'artist':artist
                    ,'track':song
                    ,'genre':genre
                    ,'year':year})

In [14]:
# 6. Lower case all the values
top100 = top100.apply(lambda top100: top100.str.lower() if(top100.dtype == 'object') else top100)
top100

Unnamed: 0,artist,track,genre,year
0,oliver anthony music,rich men north of richmond,country,"release date: august 11, 2023"
1,jimmy buffett,margaritaville,unknown,"release date: january 1, 1977"
2,zach bryan,i remember everything (feat. kacey musgraves),country,"release date: august 24, 2023"
3,jimmy buffett,come monday,unknown,"release date: january 1, 1974"
4,doja cat,paint the town red,unknown,"release date: august 4, 2023"
...,...,...,...,...
91,chris tomlin,holy forever,christian & gospel,"release date: july 29, 2022"
92,jimmy buffett & martina mcbride,trip around the sun,country,"release date: july 13, 2004"
93,fleetwood mac,everywhere,rock,"release date: april 14, 1987"
94,kane brown & katelyn brown,thank god,country,"release date: september 9, 2022"


In [15]:
# 7. In the year column we have a Genre: leak. We will split these genres 

def genresplit(value):
    if 'Genre:' in value:
        name = value.split('Genre: ')[1]
        
        return name
    else:
        return value

In [16]:
# 8. .. and move these genres 

top100['genre'] = top100['genre'].apply(genresplit)
top100['year'] = top100['year'].apply(genresplit)

In [17]:
def tidy(value):

    type1 = value['genre']
    type2 = value['year']
    if str(type1) == 'New Release':
        return type2
    else:
        return type1

    
top100['genre'] = top100.apply(tidy,axis=1)

In [18]:
# 9. Now we are going to split the year column by the comma and find the year value. "Nan" values filled with 2023, because it was originally a "New Release" value

top100['year'] = top100['year'].str.split(", ", n = 1, expand = True)[1]
top100['year'] = top100['year'].fillna(2023)
top100['track'] = top100['track'].str.strip()

In [19]:
top100

Unnamed: 0,artist,track,genre,year
0,oliver anthony music,rich men north of richmond,country,2023
1,jimmy buffett,margaritaville,unknown,1977
2,zach bryan,i remember everything (feat. kacey musgraves),country,2023
3,jimmy buffett,come monday,unknown,1974
4,doja cat,paint the town red,unknown,2023
...,...,...,...,...
91,chris tomlin,holy forever,christian & gospel,2022
92,jimmy buffett & martina mcbride,trip around the sun,country,2004
93,fleetwood mac,everywhere,rock,1987
94,kane brown & katelyn brown,thank god,country,2022


In [20]:
# 10. Export to csv

top100.to_csv('top100_from_PopVortex.csv', index = False)