# Explore here

It's recommended to use this notebook for exploration purposes.

In [52]:
pip install pandas requests lxml

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [53]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns
import io


In [54]:


url = "https://en.wikipedia.org/wiki/List_of_Spotify_streaming_records"

headers = {
    "User-Agent": "Mozilla/5.0 (compatible; DataScienceProject/1.0; +https://example.com)"
}

response = requests.get(url, headers=headers)

response.raise_for_status()

print("Status:", response.status_code)

Status: 200


In [55]:

html = io.StringIO(response.text)

tables = pd.read_html(html)
print(f"{len(tables)} tables were found.")

26 tables were found.


In [56]:

df = tables[0]  # Extract the first table from the 27 found
df.head()  # Display the first 5 rows

Unnamed: 0,Rank,Song,Artist(s),Streams (billions),Release date,Ref.
0,1,"""Blinding Lights""",The Weeknd,5.286,29 November 2019,[1]
1,2,"""Shape of You""",Ed Sheeran,4.777,6 January 2017,[2]
2,3,"""Sweater Weather""",The Neighbourhood,4.393,3 December 2012,[3]
3,4,"""Starboy""",The Weeknd and Daft Punk,4.374,21 September 2016,[4]
4,5,"""As It Was""",Harry Styles,4.256,1 April 2022,[5]


In [57]:
df.columns

Index(['Rank', 'Song', 'Artist(s)', 'Streams (billions)', 'Release date',
       'Ref.'],
      dtype='str')

In [58]:
df

Unnamed: 0,Rank,Song,Artist(s),Streams (billions),Release date,Ref.
0,1,"""Blinding Lights""",The Weeknd,5.286,29 November 2019,[1]
1,2,"""Shape of You""",Ed Sheeran,4.777,6 January 2017,[2]
2,3,"""Sweater Weather""",The Neighbourhood,4.393,3 December 2012,[3]
3,4,"""Starboy""",The Weeknd and Daft Punk,4.374,21 September 2016,[4]
4,5,"""As It Was""",Harry Styles,4.256,1 April 2022,[5]
...,...,...,...,...,...,...
96,97,"""Dreams""",Fleetwood Mac,2.554,4 February 1977,[97]
97,98,"""Sicko Mode""",Travis Scott and Drake,2.535,21 August 2018,[98]
98,99,"""Billie Jean""",Michael Jackson,2.532,29 November 1982,
99,100,"""Someone Like You""",Adele,2.528,24 January 2011,[99]


In [59]:
df = df.drop(100)
df


Unnamed: 0,Rank,Song,Artist(s),Streams (billions),Release date,Ref.
0,1,"""Blinding Lights""",The Weeknd,5.286,29 November 2019,[1]
1,2,"""Shape of You""",Ed Sheeran,4.777,6 January 2017,[2]
2,3,"""Sweater Weather""",The Neighbourhood,4.393,3 December 2012,[3]
3,4,"""Starboy""",The Weeknd and Daft Punk,4.374,21 September 2016,[4]
4,5,"""As It Was""",Harry Styles,4.256,1 April 2022,[5]
...,...,...,...,...,...,...
95,96,"""Happier""",Marshmello and Bastille,2.558,16 August 2018,[96]
96,97,"""Dreams""",Fleetwood Mac,2.554,4 February 1977,[97]
97,98,"""Sicko Mode""",Travis Scott and Drake,2.535,21 August 2018,[98]
98,99,"""Billie Jean""",Michael Jackson,2.532,29 November 1982,


In [60]:
df["Release date"] = pd.to_datetime(df["Release date"], errors="coerce")
df["Streams (billions)"] = df["Streams (billions)"].astype(float)
df


Unnamed: 0,Rank,Song,Artist(s),Streams (billions),Release date,Ref.
0,1,"""Blinding Lights""",The Weeknd,5.286,2019-11-29,[1]
1,2,"""Shape of You""",Ed Sheeran,4.777,2017-01-06,[2]
2,3,"""Sweater Weather""",The Neighbourhood,4.393,2012-12-03,[3]
3,4,"""Starboy""",The Weeknd and Daft Punk,4.374,2016-09-21,[4]
4,5,"""As It Was""",Harry Styles,4.256,2022-04-01,[5]
...,...,...,...,...,...,...
95,96,"""Happier""",Marshmello and Bastille,2.558,2018-08-16,[96]
96,97,"""Dreams""",Fleetwood Mac,2.554,1977-02-04,[97]
97,98,"""Sicko Mode""",Travis Scott and Drake,2.535,2018-08-21,[98]
98,99,"""Billie Jean""",Michael Jackson,2.532,1982-11-29,


In [61]:
conn = sqlite3.connect("spotify_top_songs.db")

In [62]:
df.to_sql("most_streamed", conn, if_exists="replace", index=False)
cursor = conn.cursor()

In [63]:
cursor.execute("SELECT COUNT(*) FROM most_streamed")
print("Rows inserted:", cursor.fetchone()[0])

conn.commit()


Rows inserted: 100


In [64]:
df


Unnamed: 0,Rank,Song,Artist(s),Streams (billions),Release date,Ref.
0,1,"""Blinding Lights""",The Weeknd,5.286,2019-11-29,[1]
1,2,"""Shape of You""",Ed Sheeran,4.777,2017-01-06,[2]
2,3,"""Sweater Weather""",The Neighbourhood,4.393,2012-12-03,[3]
3,4,"""Starboy""",The Weeknd and Daft Punk,4.374,2016-09-21,[4]
4,5,"""As It Was""",Harry Styles,4.256,2022-04-01,[5]
...,...,...,...,...,...,...
95,96,"""Happier""",Marshmello and Bastille,2.558,2018-08-16,[96]
96,97,"""Dreams""",Fleetwood Mac,2.554,1977-02-04,[97]
97,98,"""Sicko Mode""",Travis Scott and Drake,2.535,2018-08-21,[98]
98,99,"""Billie Jean""",Michael Jackson,2.532,1982-11-29,


In [65]:
import sqlite3


In [67]:
top_10_df = pd.read_sql("""
    SELECT * 
    FROM most_streamed 
    ORDER BY "Streams (billions)" DESC 
    LIMIT 10
""", conn)

top_10_df

Unnamed: 0,Rank,Song,Artist(s),Streams (billions),Release date,Ref.
0,1,"""Blinding Lights""",The Weeknd,5.286,2019-11-29 00:00:00,[1]
1,2,"""Shape of You""",Ed Sheeran,4.777,2017-01-06 00:00:00,[2]
2,3,"""Sweater Weather""",The Neighbourhood,4.393,2012-12-03 00:00:00,[3]
3,4,"""Starboy""",The Weeknd and Daft Punk,4.374,2016-09-21 00:00:00,[4]
4,5,"""As It Was""",Harry Styles,4.256,2022-04-01 00:00:00,[5]
5,6,"""Someone You Loved""",Lewis Capaldi,4.229,2018-11-08 00:00:00,[6]
6,7,"""Sunflower""",Post Malone and Swae Lee,4.127,2018-10-18 00:00:00,[7]
7,8,"""One Dance""",Drake with Wizkid and Kyla,4.048,2016-04-05 00:00:00,[8]
8,9,"""Perfect""",Ed Sheeran,3.836,2017-03-03 00:00:00,[9]
9,10,"""Stay""",The Kid Laroi and Justin Bieber,3.803,2021-07-09 00:00:00,[10]


In [68]:
conn.close()