# Web Scraping a la página de Chartmasters

URL: 'https://chartmasters.org/most-streamed-artists-ever-on-spotify/'

Se realiza web scraping con Beautiful Soup sobre la página de Chartmasters para construir un DataFrame con los artistas con más reproducciones en Spotify; después se limpia la columna de artistas y se exporta el resultado a un archivo CSV.

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Fetch the page. The timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() surfaces HTTP errors (4xx/5xx) here
# instead of letting an error page be parsed as if it were the chart.
url = 'https://chartmasters.org/most-streamed-artists-ever-on-spotify/'
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the HTML content of the response
soup = BeautifulSoup(response.content, 'html.parser')

# Locate the artists table (the first <table> element in the document).
# find() returns None when nothing matches, so fail with a clear message
# rather than an AttributeError on the next line.
table = soup.find('table')
if table is None:
    raise ValueError(f'No <table> element found at {url}')

# Every row of the table except the first one, which holds the column headers
rows = table.find_all('tr')[1:]

# For each row, skip the first two <td> cells and keep the stripped text of
# the remaining ones; `data` ends up as a list of lists of strings.
data = [
    [col.text.strip() for col in row.find_all('td')[2:]]
    for row in rows
]

In [4]:
# Show the scraped rows to inspect the raw cell text before building the DataFrame
data

[['Drake2 day(s) old data',
  '56,194,988,475',
  '282',
  '7',
  '148',
  '275',
  '279',
  '20,329,715,771'],
 ['Bad Bunny2 day(s) old data',
  '51,189,983,147',
  '162',
  '9',
  '121',
  '157',
  '160',
  '7,541,244,342'],
 ['Taylor SwiftUp to date data',
  '42,204,571,789',
  '353',
  '2',
  '128',
  '277',
  '341',
  '471,196,255'],
 ['Ed SheeranUp to date data',
  '41,120,157,022',
  '251',
  '11',
  '67',
  '187',
  '228',
  '3,082,734,708'],
 ['The Weeknd1 day(s) old data',
  '39,704,788,932',
  '219',
  '10',
  '80',
  '148',
  '185',
  '6,540,530,184'],
 ['Justin BieberUp to date data',
  '35,225,977,864',
  '226',
  '8',
  '61',
  '176',
  '197',
  '7,677,942,155'],
 ['Ariana GrandeUp to date data',
  '35,000,764,144',
  '183',
  '9',
  '73',
  '132',
  '178',
  '2,215,911,852'],
 ['EminemUp to date data',
  '33,760,538,200',
  '285',
  '6',
  '67',
  '229',
  '264',
  '4,567,249,852'],
 ['BTSUp to date data',
  '32,095,643,864',
  '263',
  '3',
  '105',
  '214',
  '263',
 

In [5]:
# Labels for the eight columns, in the order the values appear in each scraped row
column_names = [
    'Artist', 'Lead Stream', 'Tracks', '1b', '100m', '10m', '1m', 'Feat Streams',
]

# Assemble the scraped rows into a DataFrame
df = pd.DataFrame(data=data, columns=column_names)

# Print the DataFrame
print(df)

                                 Artist     Lead Stream Tracks  1b 100m  10m  \
0                Drake2 day(s) old data  56,194,988,475    282   7  148  275   
1            Bad Bunny2 day(s) old data  51,189,983,147    162   9  121  157   
2           Taylor SwiftUp to date data  42,204,571,789    353   2  128  277   
3             Ed SheeranUp to date data  41,120,157,022    251  11   67  187   
4           The Weeknd1 day(s) old data  39,704,788,932    219  10   80  148   
..                                  ...             ...    ...  ..  ...  ...   
95             BLACKPINKUp to date data   8,946,547,337    104   0   24   49   
96  Tyler, The Creator2 day(s) old data   8,921,201,314    152   0   25   93   
97        James Arthur2 day(s) old data   8,894,424,356    154   1   17   68   
98     Sebastian Yatra2 day(s) old data   8,868,875,753    119   0   25   73   
99         Frank Ocean2 day(s) old data   8,827,904,295     45   0   23   45   

     1m    Feat Streams  
0   279  20,3

In [6]:
import re  # regular expressions, used to clean the Artist column

# The scraped Artist text has a data-freshness label glued to the name, either
# "<N> day(s) old data" or "Up to date data". Strip both variants.
#
# The pattern is a raw string: in a normal string literal '\d', '\s' and '\('
# are invalid escape sequences (DeprecationWarning, SyntaxWarning on 3.12+).
#
# NOTE(review): `\d+` is greedy, so a name that itself ends in digits
# (e.g. "Maroon 5" followed by "2 day(s) old data") loses those digits too.
# The flattened text cannot disambiguate this; if it matters, take the name
# from its own HTML element at scraping time instead.
stale_label = re.compile(r'\d+\sday\(s\) old data')

df['Artist'] = df['Artist'].str.replace(stale_label, '', regex=True)
# Plain literal replacement; regex=False makes that explicit across pandas versions
df['Artist'] = df['Artist'].str.replace('Up to date data', '', regex=False)

# Print the resulting DataFrame
print(df)

                Artist     Lead Stream Tracks  1b 100m  10m   1m  \
0                Drake  56,194,988,475    282   7  148  275  279   
1            Bad Bunny  51,189,983,147    162   9  121  157  160   
2         Taylor Swift  42,204,571,789    353   2  128  277  341   
3           Ed Sheeran  41,120,157,022    251  11   67  187  228   
4           The Weeknd  39,704,788,932    219  10   80  148  185   
..                 ...             ...    ...  ..  ...  ...  ...   
95           BLACKPINK   8,946,547,337    104   0   24   49  104   
96  Tyler, The Creator   8,921,201,314    152   0   25   93  110   
97        James Arthur   8,894,424,356    154   1   17   68  134   
98     Sebastian Yatra   8,868,875,753    119   0   25   73  108   
99         Frank Ocean   8,827,904,295     45   0   23   45   45   

      Feat Streams  
0   20,329,715,771  
1    7,541,244,342  
2      471,196,255  
3    3,082,734,708  
4    6,540,530,184  
..             ...  
95     559,319,185  
96   1,887,276,

In [7]:
# Write the cleaned table to a pipe-delimited file in the working directory.
# The DataFrame index is written as well (pandas default), as the first,
# unnamed column.
output_path = 'Most_Streaming_Spotify.csv'
df.to_csv(output_path, sep='|')