In [None]:
from io import StringIO

import matplotlib.pyplot as plt
import pandas as pd
import requests
import scipy.stats as stats
import seaborn as sns
from bs4 import BeautifulSoup as bs
from scipy import stats

# Download the two source pages. The timeout keeps a stalled connection from
# hanging the kernel, and raise_for_status() stops the notebook here on an HTTP
# error instead of letting an error page be parsed as if it were data.
r = requests.get('https://us.youtubers.me/global/all/top-1000-most-subscribed-youtube-channels', timeout=30)
r.raise_for_status()
population = requests.get('https://www.worldometers.info/world-population/population-by-country', timeout=30)
population.raise_for_status()

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results could differ between machines.
# NOTE: later cells rebind `r` to a correlation coefficient, so this cell has to
# be run as a whole rather than line by line after those cells.
soup = bs(r.content, 'html.parser')
populationSoup = bs(population.content, 'html.parser')

In [None]:
# Take the first <table> element of each parsed page, then collect the header
# (<th>) cells of each table.

table = soup.find_all('table')[0]
table1 = populationSoup.find_all('table')[0]

columns = table.find_all('th')
column1s = table1.find_all('th')

In [None]:
# Build the channel DataFrame from the scraped channel table.

# Year treated as "now" when deriving the `older` column.
REFERENCE_YEAR = 2023

# read_html expects a file-like object; handing it the HTML as a bare string is
# deprecated in recent pandas releases, hence the StringIO wrapper.
table_df = (
    pd.read_html(StringIO(str(table)))[0]
    .dropna()  # discard rows with any missing value
    # Years since `started` (assumes `started` holds a numeric year).
    .assign(older=lambda frame: REFERENCE_YEAR - frame['started'])
)
table_df

In [None]:
# Build the population DataFrame: one row per country with its population.

population_raw = pd.read_html(StringIO(str(table1)))[0]

# The population header embeds a year (e.g. "Population (2020)"). Locate it by
# prefix so the cell keeps working if the source page relabels the column with a
# different year.
population_col = next(
    (col for col in population_raw.columns if str(col).startswith("Population (")),
    None,
)
if population_col is None:
    raise KeyError("no 'Population (<year>)' column found in the scraped table")

# Select the two needed columns *before* dropping NaNs, so a country is not
# discarded just because some unrelated column is empty.
table_df1 = population_raw[["Country (or dependency)", population_col]].dropna()
table_df1.columns = ["country", "population"]
table_df1

In [None]:
# Wikipedia revisions of the most-subscribed-channels list, keyed by year.
# Each URL pins a specific `oldid`, i.e. one fixed revision of the page.
wikiurls = {
            '2023':"https://en.wikipedia.org/w/index.php?title=List_of_most-subscribed_YouTube_channels&oldid=1137714711",
            '2022':"https://en.wikipedia.org/w/index.php?title=List_of_most-subscribed_YouTube_channels&oldid=1130583610",
            '2021':"https://en.wikipedia.org/w/index.php?title=List_of_most-subscribed_YouTube_channels&oldid=1063042127",
            }

# NOTE(review): not referenced by any code visible in this notebook;
# getDataFrame looks the table up by "wikitable sortable" instead.
table_class="wikitable sortable jquery-tablesorter"

# Fetch every revision as a [year, response] pair, in the dict's order.
# The timeout prevents an indefinite hang; raise_for_status() fails fast on an
# HTTP error rather than passing an error page on to the parser.
wikiResponseArray = []
for year, url in wikiurls.items():
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    wikiResponseArray.append([year, response])
print(wikiResponseArray)

In [None]:
# Parse the channel table out of a fetched Wikipedia page, rename its columns, and clean the subscriber and language fields.

def getDataFrame(data):
    """Parse the channel table out of a fetched Wikipedia page.

    Parameters
    ----------
    data : response-like object
        Object whose ``.text`` attribute holds the page HTML (the callers in
        this notebook pass ``requests`` responses).

    Returns
    -------
    pandas.DataFrame
        The first 49 rows of the page's ``wikitable sortable`` table, with the
        columns renamed to ``rank``, ``name``, ``link``, ``brand channel``,
        ``subscribers`` (float), ``primary language`` (ASCII letters only),
        ``category`` and ``country``.

    Raises
    ------
    ValueError
        If the page contains no ``wikitable sortable`` table, or if the table
        does not have exactly eight columns.
    """
    soup = bs(data.text, 'html.parser')
    channel_table = soup.find('table', {'class': "wikitable sortable"})
    if channel_table is None:
        # Without this guard the string "None" would be handed to read_html,
        # producing an error that says nothing about the real cause.
        raise ValueError('no table with class "wikitable sortable" found in the page')

    # read_html expects a file-like object (a bare HTML string is deprecated).
    # .copy() detaches the head() slice so the column assignments below act on
    # an independent frame.
    df = pd.read_html(StringIO(str(channel_table)))[0].head(49).copy()
    df.columns = ["rank", "name", "link", "brand channel", "subscribers", "primary language", "category", "country"]

    # regex=False makes '$' a literal character; as a regex it would only match
    # the end of the string and therefore strip nothing.
    df['subscribers'] = df['subscribers'].astype(str).str.replace('$', '', regex=False).astype('float')
    # Keep ASCII letters only (drops digits, brackets, spaces, punctuation).
    df['primary language'] = df['primary language'].str.replace('[^a-zA-Z]', '', regex=True)
    return df

# Parse the first two fetched revisions (the '2023' and '2022' entries of
# wikiurls, in insertion order); `df` is the frame the analyses below use.
response_2023 = wikiResponseArray[0][1]
response_2022 = wikiResponseArray[1][1]

df = getDataFrame(response_2023)
df_2022 = getDataFrame(response_2022)
df

## Analysis 1: How a channel's subscriber count correlates with its video views on YouTube.

In [None]:
# Scatter of subscribers against video views with a fitted regression line.
sns.regplot(x='subscribers', y='video views', data=table_df)

In [None]:
# Pearson correlation coefficient (r) and p-value between the `subscribers`
# and `video views` columns.
r, p = stats.pearsonr(table_df['subscribers'], table_df['video views'])
print('r =', r, 'p =', p)

## Analysis 2: How the number of subscribers varies with category, country, and primary language.

In [None]:
def _plot_total_subscribers(frame, group_col, xlabel):
    """Bar-plot the summed `subscribers` of `frame` for each `group_col` value.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain `group_col` and a `subscribers` column.
    group_col : str
        Column to group by; also used as the x variable of the plot.
    xlabel : str
        Text for the x-axis label.

    Returns
    -------
    pandas.DataFrame
        The aggregated frame with columns `group_col` and `subscribers`.
    """
    totals = frame.groupby([group_col]).agg({'subscribers': 'sum'}).reset_index()

    fig, ax = plt.subplots(figsize=(15, 6))
    sns.barplot(data=totals, x=group_col, y='subscribers', ax=ax)

    # Style the axes after drawing so the labels and tick settings apply to the
    # finished chart and are not affected by whatever seaborn sets while plotting.
    ax.set_xlabel(xlabel, fontsize=18)
    ax.set_ylabel('subscribers', fontsize=18)
    ax.tick_params(axis='x', labelrotation=90, labelsize=12)
    return totals


# Per category, from the scraped channel table (table_df).
categoryToSubscribers = _plot_total_subscribers(table_df, 'category', 'category')

# Per country and per primary language, from the Wikipedia table (df).
countryToSubscribers = _plot_total_subscribers(df, 'country', 'Country')
# Stored under its own name (and labelled as language, not country) so the
# country totals above are not overwritten.
languageToSubscribers = _plot_total_subscribers(df, 'primary language', 'Primary language')


## Analysis 3: Which countries have the most brand channels.

In [None]:
# Coerce `rank` to int64 in both frames so the merge keys share a dtype.
# regex=False makes '$' a literal character; as a regex it would only match the
# end of the string and therefore strip nothing.
df['rank'] = df['rank'].astype(str).str.replace('$', '', regex=False).astype('int64')
table_df['rank'] = table_df['rank'].astype(str).str.replace('$', '', regex=False).astype('int64')

# Inner join of the first 48 Wikipedia rows with the scraped channel table.
# NOTE(review): the two frames come from different sources and are matched on
# rank alone — confirm that rank N refers to the same channel in both.
mergedTable = pd.merge(df.head(48), table_df, on='rank')

# Count of channels per country, split by the `brand channel` value.
catplot_tick = sns.catplot(data=mergedTable, x='country', hue='brand channel', kind='count', height=10, aspect=2)
# The grid returned by catplot is labelled with set_xlabels / set_ylabels (plural).
catplot_tick.set_xlabels('country', fontsize=20)
catplot_tick.set_ylabels('brand channel count', fontsize=20)

## Analysis 4: How a country's population correlates with the subscribers of its YouTube channels.

In [None]:
# Attach each country's population to every channel row from that country.
# Inner join: rows whose country name has no match on the other side are dropped.
mergedTable1 = pd.merge(table_df1, df, on='country')

# Collapse to one row per country. Subscribers are summed over the country's
# channels; population is repeated on every channel row after the merge, so it
# is taken once ('first') — summing it would multiply the population by the
# number of channels and distort the correlation computed from this frame.
populationToSubscribers = (
    mergedTable1
    .groupby(['country'])
    .agg({'subscribers': 'sum', 'population': 'first'})
    .reset_index()
)

fig, ax = plt.subplots(figsize=(20, 10))

# One bar per country: the population value is identical on all of a country's
# rows, so keeping the first row per country gives the same bar heights and
# order without plotting duplicated observations.
sns.barplot(data=mergedTable1.drop_duplicates(subset='country'), x='country', y='population', ax=ax)
ax.set_xlabel('country', fontsize=18)
ax.set_ylabel('population', fontsize=18);


In
# Figure 1: per-country subscribers against population, with a regression line.
fig, ax = plt.subplots(figsize=(20, 10))
plt.xlabel('subscribers', fontsize=18)
plt.ylabel('population', fontsize=18)
sns.regplot(data=populationToSubscribers, x='subscribers', y='population')


# Figure 2: summed subscribers per country, largest first.
fig, ax = plt.subplots(figsize=(15, 6))
plt.xticks(fontsize=12, rotation='vertical')
plt.xlabel('country', fontsize=18)
plt.ylabel('subscribers', fontsize=18)

countryToSubscribers = mergedTable1.groupby(['country'])['subscribers'].sum().reset_index()
sns.barplot(
    x='country',
    y='subscribers',
    data=countryToSubscribers.sort_values(by='subscribers', ascending=False),
)

In [None]:
# Pearson correlation coefficient (r) and p-value between per-country
# subscribers and population.
r, p = stats.pearsonr(populationToSubscribers['subscribers'], populationToSubscribers['population'])
print('r =', r, 'p =', p)

## Analysis 5: How subscriber count depends on channel age and video views.

In [None]:
# Plot a 10% random sample of the rows whose `older` value is at most 20.
# A fixed random_state makes the sample — and therefore the figure — identical
# on every run.
young_channels = table_df[table_df['older'] <= 20]
sns.scatterplot(data=young_channels.sample(frac=0.1, random_state=42), x='subscribers', y='older')

In [None]:
# Plain scatter of subscribers against video views for every row of table_df.
sns.scatterplot(data=table_df, x='subscribers', y='video views')