# Web Scraping a la página de Chartmasters

URL: "https://chartmasters.org/best-selling-artists-of-all-time/"

Se realiza web scraping con Beautiful Soup sobre la página de Chartmasters para construir un DataFrame con los artistas que más han vendido.

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [9]:
# Fetch the page over HTTP. The timeout keeps the cell from hanging forever
# if the server never answers.
url = "https://chartmasters.org/best-selling-artists-of-all-time/"
response = requests.get(url, timeout=30)

# Fail early on a 4xx/5xx answer instead of silently parsing an error page
response.raise_for_status()

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(response.content, "html.parser")

# Locate the table that holds the required information
table = soup.find("table", {"class": "resultsTab"})
if table is None:
    # find() returns None when nothing matches; raise a clear error here
    # rather than an AttributeError on the next line.
    raise ValueError(f"No <table class='resultsTab'> found at {url}")

# Output column name -> position of the <td> cell that holds its value
# within each table row.
columns = {
    "Artist": 2,
    "Total CSPC": 3,
    "Studio Albums Sales": 4,
    "Other LPs Sales": 5,
    "Physical Singles Sales": 6,
    "Digital Singles Sales": 7,
    "Sales Update": 8,
    "Streams EAS (Update)": 9,
}
min_cells = max(columns.values()) + 1

# One dictionary per table row, keyed by the column names above
data = []

# Walk every row except the first one (skipped as the header row)
for row in table.find_all("tr")[1:]:
    cells = row.find_all("td")

    # Rows without enough <td> cells cannot be mapped to the columns;
    # skip them instead of failing with an IndexError.
    if len(cells) < min_cells:
        continue

    # Store the stripped text of each mapped cell for this row
    data.append({name: cells[idx].text.strip() for name, idx in columns.items()})

In [7]:
# Echo the list of scraped row dictionaries as the cell output
data

[{'Artist': 'The Beatles',
  'Total CSPC': '421,855,000',
  'Studio Albums Sales': '160,650,000',
  'Other LPs Sales': '203,392,000',
  'Physical Singles Sales': '116,080,000'},
 {'Artist': 'Michael Jackson',
  'Total CSPC': '336,644,000',
  'Studio Albums Sales': '182,600,000',
  'Other LPs Sales': '101,997,000',
  'Physical Singles Sales': '79,350,000'},
 {'Artist': 'Elvis Presley',
  'Total CSPC': '320,348,000',
  'Studio Albums Sales': '53,150,000',
  'Other LPs Sales': '212,309,000',
  'Physical Singles Sales': '135,210,000'},
 {'Artist': 'Queen',
  'Total CSPC': '277,454,000',
  'Studio Albums Sales': '90,230,000',
  'Other LPs Sales': '139,348,000',
  'Physical Singles Sales': '49,960,000'},
 {'Artist': 'Madonna',
  'Total CSPC': '246,817,000',
  'Studio Albums Sales': '146,450,000',
  'Other LPs Sales': '64,420,000',
  'Physical Singles Sales': '75,210,000'},
 {'Artist': 'The Rolling Stones',
  'Total CSPC': '243,426,000',
  'Studio Albums Sales': '113,800,000',
  'Other LPs Sa

In [10]:
# Build a DataFrame from the list of row dictionaries (one row per dict)
df = pd.DataFrame(data=data)

# Print the DataFrame
print(df)

                  Artist   Total CSPC Studio Albums Sales Other LPs Sales  \
0            The Beatles  421,855,000         160,650,000     203,392,000   
1        Michael Jackson  336,644,000         182,600,000     101,997,000   
2          Elvis Presley  320,348,000          53,150,000     212,309,000   
3                  Queen  277,454,000          90,230,000     139,348,000   
4                Madonna  246,817,000         146,450,000      64,420,000   
..                   ...          ...                 ...             ...   
166            Lil Nas X   12,630,000              58,000               0   
167     Carly Rae Jepsen   11,566,000           1,777,000          82,000   
168  Buffalo Springfield   11,306,000           2,885,000       6,698,000   
169        Lewis Capaldi   10,443,000             738,000           3,000   
170          Blind Faith   10,093,000           5,395,000       4,363,000   

    Physical Singles Sales Digital Singles Sales Sales Update  \
0         

In [5]:
# Echo the type of df as the cell output
type(df)

pandas.core.frame.DataFrame

In [11]:
# Write the DataFrame to a pipe-delimited CSV file (relative path)
df.to_csv(
    path_or_buf='Best_Sellings_All_Time.csv',
    sep='|',
)