# Web Scraping a la página web de la RIAA (Recording Industry Association of America)

URL: "https://www.riaa.com/gold-platinum/?tab_active=awards_by_artist#search_section"

Se realiza web scraping con Selenium y se construye un DataFrame con los artistas más certificados según la RIAA, con base en sus unidades de oro, platino, multiplatino y diamante.

In [1]:
# Para la manipulación de datos
import pandas as pd

# Servicio y driver de Chrome de Selenium
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Buscar elementos
from selenium.webdriver.common.by import By

# Keyboard key constants (e.g. Keys.ENTER) for send_keys; clicking itself is done with WebElement.click()
from selenium.webdriver.common.keys import Keys

# Tiempo de retardo para no parecer un bot
import time

In [10]:
# Resolve the ChromeDriver binary through webdriver_manager, then start a
# Chrome session that is driven by that binary.
chromedriver_path = ChromeDriverManager().install()
service = Service(chromedriver_path)
driver = webdriver.Chrome(service=service)

In [11]:
# Page to scrape: RIAA Gold & Platinum, opened with the "awards_by_artist" tab active.
riaa_url = "https://www.riaa.com/gold-platinum/?tab_active=awards_by_artist#search_section"
driver.get(url=riaa_url)

In [12]:
# Click the "link-arrow-gnp" element a fixed number of times, pausing after
# each click. NOTE(review): presumably this is the "load more" control and the
# pause gives the extra rows time to render — confirm against the live page.
LOAD_MORE_CLICKS = 4
LOAD_WAIT_SECONDS = 5

button = driver.find_element(By.CLASS_NAME, "link-arrow-gnp")
clicks_done = 0
while clicks_done < LOAD_MORE_CLICKS:
    button.click()
    time.sleep(LOAD_WAIT_SECONDS)
    clicks_done += 1

In [13]:
# Scrape every artist row currently rendered on the page into parallel column
# lists, then assemble them into `artist_data` (dict of lists) and `df`.
data = []  # not used in this cell; kept in case another cell refers to it

artist_name = []
certifications = []
gold = []
platinum = []
multi_platinum = []
diamond = []

# (label searched for in the <h4> text, list that receives the value).
# Order matters: "Multi Platinum Units" must be tested before "Platinum Units"
# because the latter is a substring of the former.
unit_columns = [
    ("Cert. Units", certifications),
    ("Gold Units", gold),
    ("Multi Platinum Units", multi_platinum),
    ("Platinum Units", platinum),
    ("Diamond Units", diamond),
]

rows = driver.find_elements(By.CLASS_NAME, "meta_award_data")
for row in rows:
    artist_name.append(row.find_element(By.TAG_NAME, "h2").text)

    # Gather this artist's values first, so that every column list receives
    # exactly one entry per row. Appending directly from the <h4> loop lets the
    # lists drift out of step whenever an artist lacks one of the headings,
    # which makes pd.DataFrame fail (or pairs values with the wrong artist).
    row_values = {}
    for heading in row.find_elements(By.TAG_NAME, "h4"):
        heading_text = heading.text
        for label, _column in unit_columns:
            if label in heading_text:
                # Text after the first colon, e.g. "Cert. Units: 183" -> " 183"
                row_values.setdefault(label, heading_text.split(":")[1])
                break

    for label, column in unit_columns:
        # None marks a heading that was absent for this artist
        column.append(row_values.get(label))

artist_data = {
    "artist_name": artist_name,
    "certifications": certifications,
    "gold_albums": gold,
    "platinum_albums": platinum,
    "multiplatinum_albums": multi_platinum,
    "diamond_albums": diamond,
}

df = pd.DataFrame(artist_data)


In [14]:
# Debug dump of the raw dict of column lists that backs `df`
print(artist_data)

{'artist_name': ['THE BEATLES', 'GARTH BROOKS', 'ELVIS PRESLEY', 'EAGLES', 'LED ZEPPELIN', 'MICHAEL JACKSON', 'BILLY JOEL', 'ELTON JOHN', 'AC/DC', 'PINK FLOYD', 'MARIAH CAREY', 'BRUCE SPRINGSTEEN', 'AEROSMITH', 'GEORGE STRAIT', 'BARBRA STREISAND', 'THE ROLLING STONES', 'MADONNA', 'METALLICA', 'EMINEM', 'WHITNEY HOUSTON', 'VAN HALEN', 'FLEETWOOD MAC', 'CELINE DION', 'TAYLOR SWIFT', 'U2', 'NEIL DIAMOND', 'ALABAMA', 'JOURNEY', 'KENNY G', 'SHANIA TWAIN', 'KENNY ROGERS', 'BOB SEGER & THE SILVER BULLET BAND', "GUNS N' ROSES", 'ALAN JACKSON', 'SANTANA', 'QUEEN', 'REBA MC ENTIRE', 'ERIC CLAPTON', 'TIM MCGRAW', 'CHICAGO', 'SIMON & GARFUNKEL', 'FOREIGNER', 'ROD STEWART', 'BACKSTREET BOYS', '2 PAC', 'BOB DYLAN', 'DEF LEPPARD', 'KENNY CHESNEY', 'BON JOVI', 'BRITNEY SPEARS', 'DAVE MATTHEWS BAND', 'THE DOORS', 'JOHN DENVER', 'PHIL COLLINS', 'JAMES TAYLOR', 'THE CHICKS', 'DRAKE', 'R. KELLY', 'PEARL JAM', 'TOM PETTY & THE HEARTBREAKERS', 'WILLIE NELSON', 'BOSTON', 'BEYONCE', 'LINDA RONSTADT', 'LINKIN 

In [15]:
# Bare last expression: the notebook renders the scraped DataFrame as a table
df

Unnamed: 0,artist_name,certifications,gold_albums,platinum_albums,multiplatinum_albums,diamond_albums
0,THE BEATLES,183,48,42,26,6
1,GARTH BROOKS,157,31,31,17,9
2,ELVIS PRESLEY,139,101,57,25,1
3,EAGLES,120,13,13,11,3
4,LED ZEPPELIN,112.5,19,18,14,5
...,...,...,...,...,...,...
145,"CROSBY, STILLS, NASH & YOUNG",18,4,4,3,0
146,DESTINY'S CHILD,18,6,6,3,0
147,P!NK,18,8,8,5,0
148,RANDY TRAVIS,18,12,8,4,0


In [16]:
# Persist the scraped table to a CSV file in the working directory. pandas
# writes the row index by default, so the file starts with an unnamed index column.
df.to_csv(path_or_buf="RIAA_alltime_best.csv")

In [17]:
# End the whole WebDriver session. close() would only close the current browser
# window and leave the chromedriver process started by `service` running;
# quit() closes every window and shuts the driver process down.
driver.quit()