In [55]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.express as px
from pandasql import sqldf

In [70]:
# Download the Wikipedia page and parse its HTML
url = 'https://en.wikipedia.org/wiki/List_of_best-selling_singles'
# A timeout keeps the cell from hanging indefinitely on a stalled connection
page = requests.get(url, timeout=30)
# Stop on an HTTP error (4xx/5xx) instead of silently parsing an error page
page.raise_for_status()
# Name the parser explicitly: the generic 'html' feature lets BeautifulSoup pick
# whichever parser happens to be installed, so results could differ between machines
soup = BeautifulSoup(page.text, 'html.parser')

In [82]:
# Collect every <table> element on the page and keep the second one (index 1).
# NOTE(review): the index is tied to the current page layout — confirm it still
# points at the intended table whenever the page is re-scraped.
all_tables = soup.find_all('table')
table = all_tables[1]

In [7]:
# All header cells (<th>) found inside the selected table
world_titles = table.find_all(name='th')

In [72]:
# Column labels: the text of each header cell with surrounding whitespace trimmed
world_table_titles = []
for header_cell in world_titles:
    world_table_titles.append(header_cell.text.strip())

In [73]:
# Start from an empty frame whose columns are the scraped header labels,
# then display it to check the column names
df = pd.DataFrame(data=None, columns=world_table_titles)
df

Unnamed: 0,Artist,Single,Released,Sales(in millions),Source


In [74]:
# Every <tr> element of the selected table. Despite the name, each entry is a table row.
column_data = table.find_all(name='tr')

In [75]:
# Iterate over the table rows (skipping the first, header row) and collect the
# stripped text of every <td> cell
rows = [
    [cell.text.strip() for cell in row.find_all('td')]
    for row in column_data[1:]
]

# Fail loudly on rows that do not line up with the header rather than
# letting values slide into the wrong columns
expected_width = len(df.columns)
bad_rows = [cells for cells in rows if len(cells) != expected_width]
if bad_rows:
    raise ValueError(
        f"{len(bad_rows)} table row(s) do not have {expected_width} cells; "
        f"first offending row: {bad_rows[0]}"
    )

# Build the frame in one step instead of growing it row by row with df.loc.
# Rebuilding from the scraped rows also makes the cell safe to re-run:
# running it twice no longer appends every row a second time.
df = pd.DataFrame(rows, columns=df.columns)

In [76]:
def clean_and_convert_sales(value):
    """Convert a scraped sales cell to a float.

    Parameters
    ----------
    value : str or number
        Raw cell content such as ``'14'`` or ``'14[disputed – discuss]'``,
        or a value that is already numeric.

    Returns
    -------
    float or None
        The numeric value, or ``None`` when no number can be parsed.
    """
    if not isinstance(value, str):
        # Already numeric (e.g. this function is re-applied to a column that
        # was converted earlier): keep the number instead of discarding it.
        try:
            return float(value)
        except (TypeError, ValueError):
            return None

    # Drop everything from the first '[' onwards so that any bracketed
    # annotation ('[disputed – discuss]', '[1]', ...) is removed, not just
    # one hard-coded marker.
    number_text = value.split('[', 1)[0]
    try:
        return float(number_text)
    except ValueError:
        # Nothing numeric left to convert
        return None

# Replace the raw sales strings with the values returned by clean_and_convert_sales
sales_col = 'Sales(in millions)'
df[sales_col] = df[sales_col].apply(clean_and_convert_sales)

# Cast each listed column to its target type, then print a summary of the
# resulting frame (columns, non-null counts, dtypes)
data_types = {'Artist': str, 'Single': str, 'Released': int, sales_col: float}
df = df.astype(data_types)
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 31 entries, 0 to 30
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Artist              31 non-null     object 
 1   Single              31 non-null     object 
 2   Released            31 non-null     int32  
 3   Sales(in millions)  30 non-null     float64
 4   Source              31 non-null     object 
dtypes: float64(1), int32(1), object(3)
memory usage: 1.3+ KB


In [64]:
# Keep only the four columns listed here; any other column is dropped
columns_to_keep = ['Artist', 'Single', 'Released', 'Sales(in millions)']
df = df[columns_to_keep]
df

Unnamed: 0,Artist,Single,Released,Sales(in millions)
0,The Andrews Sisters,"""Bei Mir Bist Du Schön""",1937,14.0
1,Scorpions,"""Wind of Change""",1991,14.0
2,Prince Nico Mbarga,"""Sweet Mother""",1976,13.0
3,Kyu Sakamoto,"""Sukiyaki""",1963,13.0
4,Gene Autry,"""Rudolph the Red-Nosed Reindeer""",1949,12.5
5,The Beatles,"""I Want to Hold Your Hand""",1963,12.0
6,Andrea Bocelli and Sarah Brightman,"""Time to Say Goodbye""",1996,12.0
7,Village People,"""Y.M.C.A.""",1978,12.0
8,Band Aid,"""Do They Know It's Christmas?""",1984,11.7
9,Los del Río,"""Macarena""",1995,11.0


In [65]:
# Extract the 15 best-selling singles.
# Rank by the sales figure itself instead of taking the first 15 rows, so the
# result does not depend on the row order of the scraped table and missing
# sales values do not displace real figures.
Most_sold = df.nlargest(15, 'Sales(in millions)')
Most_sold

Unnamed: 0,Artist,Single,Released,Sales(in millions)
0,The Andrews Sisters,"""Bei Mir Bist Du Schön""",1937,14.0
1,Scorpions,"""Wind of Change""",1991,14.0
2,Prince Nico Mbarga,"""Sweet Mother""",1976,13.0
3,Kyu Sakamoto,"""Sukiyaki""",1963,13.0
4,Gene Autry,"""Rudolph the Red-Nosed Reindeer""",1949,12.5
5,The Beatles,"""I Want to Hold Your Hand""",1963,12.0
6,Andrea Bocelli and Sarah Brightman,"""Time to Say Goodbye""",1996,12.0
7,Village People,"""Y.M.C.A.""",1978,12.0
8,Band Aid,"""Do They Know It's Christmas?""",1984,11.7
9,Los del Río,"""Macarena""",1995,11.0


In [81]:
# Pie chart of the rows in Most_sold: slices are labelled by artist and sized
# by the sales figure
fig = px.pie(
    Most_sold,
    names="Artist",
    values="Sales(in millions)",
    title="Top 15 most sold single",
)

# Render the figure
fig.show()


In [78]:
# Histogram of release years with Plotly Express (nbins=10), coloured by the
# value of the 'Released' column
fig = px.histogram(
    df,
    x='Released',
    color='Released',
    nbins=10,
    title="Single Released ten years",
)
# Render the figure
fig.show()