<a href="https://www.kaggle.com/code/bencaiello/top-1000-yt-ers-w-language-translation-english?scriptVersionId=143564267" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()


# Install translation package (only need to do once!)
try:
    import translators as ts
except:
    !pip install translators
    import translators as ts

import warnings
warnings.simplefilter('ignore')

import os
# Walk the Kaggle input directory; `filepath` ends up holding the path of the
# last file visited (it is read with pandas in the next code cell).
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        filepath = os.path.join(folder, name)


Note that this first step may run slowly if you copy this notebook, because the translation package has to be downloaded and installed with pip. This should only happen on the first run of the notebook.

# Introduction & First Look at Data!

There is also some data cleaning / rearranging going on inside the hidden code block -- expand it to check!

In [None]:
# Load the dataset from the path found while walking the input directory.
file = pd.read_csv(filepath)

# Inspect the nature of the data in each column, the number of null values and the datatypes of the columns
display(file.head())
# DataFrame.info() prints its report itself and returns None, so it is called directly;
# wrapping it in display() would additionally show a stray "None".
file.info()

# The numerical columns do not need to be floats, so every column that casts cleanly is converted to int.
# Note: trying to read these columns (3,5-7) directly as 'int' type did not succeed.
for col in file.columns:
    try:
        file[col] = file[col].astype('int')
    except (ValueError, TypeError, OverflowError):
        # Columns that cannot be cast (e.g. text, or values containing nulls) are left unchanged.
        continue

# The Country column, although having no null values, has 'Unknown' values.
# Will convert the nulls in Categories to 'Unknown' for consistency.
# Cannot drop nulls because there are too many in the Categories column.
file['Categories'] = file['Categories'].fillna('Unknown')

# Additionally, I will drop the links column (at least for now), as it does not provide useful information for most visualizations.
file = file.drop('Links', axis=1)

# I will also create a column of subscribers in millions (easier to plot).
# 'Suscribers' (sic) is the column name as it appears in the dataset.
file['Subscribers (millions)'] = file['Suscribers'] / 1000000

# Translation of Category and Country columns to English

Along the way, I also convert the category column into a column of lists (instead of a column of strings), as each channel can have more than one category.

In [None]:
# I am a monolingual English speaker -- so let's convert the country / category names into English!
# Link to package:  https://pypi.org/project/translate-api/
# Note that documentation at the provided link is inaccurate!

# First, identify the categories in the Categories column. Note that some channels have more than one category.
cat_list = [
    category.strip()
    for entry in file['Categories']
    for category in entry.split(',')
]

# Now I isolate the unique categories, by using the set datatype:
cat_set = set(cat_list)


# Next, I convert the unique categories into a single string of each category, separated by commas.
# This is important to reduce the number of queries to the translation tool / URL to a minimum.
# The translate_text function throws an error if you query the same URL more than ~7 times in rapid succession.
# Every category (including the last) is followed by a comma, so splitting the translation
# leaves a trailing empty entry, which the dummy-variable cell discards.
cat_list = list(cat_set)
cat_str = ''.join(category + ',' for category in cat_list)

# Now we translate:
english = ts.translate_text(cat_str)

# Then undo the single string back into a list.
# Each piece is stripped, because any whitespace the translator puts around the commas
# would otherwise end up inside the category names (and later the column names).
english_list = [piece.strip() for piece in english.split(',')]

# The mapping below pairs originals and translations by position, so fail with a clear
# message if the translator returned fewer pieces than were sent.
if len(english_list) < len(cat_list):
    raise ValueError(
        f'Translation returned {len(english_list)} comma-separated pieces '
        f'for {len(cat_list)} categories; cannot match them up.'
    )

# Then make a dictionary to match the original phrases to the English translations.
trans_dict = dict(zip(cat_list, english_list))

# Next we translate the column using the dictionary.
# Each entry becomes a list of English category names (one channel can have several).
translation_list = [
    [trans_dict[category.strip()] for category in entry.split(',')]
    for entry in file['Categories']
]
file['Categories'] = translation_list

# Countries are much simpler, as they only have singular values per entry:
# all unique names are sent to the translator in ONE comma-separated query (to keep the
# number of requests low), the answer is split back on commas, and the originals are
# paired with the translations by position.
country_list = list(file['Country'].unique())
country_str = ''.join(country + ',' for country in country_list)
english_c = ts.translate_text(country_str)

# Strip each piece so whitespace around the translator's commas does not leak into the names.
english_list_c = [piece.strip() for piece in english_c.split(',')]

# Pairing is positional, so fail clearly if fewer pieces came back than were sent.
if len(english_list_c) < len(country_list):
    raise ValueError(
        f'Translation returned {len(english_list_c)} comma-separated pieces '
        f'for {len(country_list)} countries; cannot match them up.'
    )

trans_dict = dict(zip(country_list, english_list_c))
file['Country'] = file['Country'].replace(trans_dict)

display(file.head())

# Make dummy variables for every category & begin to plot!

In [None]:
# Make dummy variable columns!
# The translated string was split on commas, which leaves an empty entry (the query string
# ends with a comma). Empty names are filtered out instead of slicing off the last element,
# so re-running this cell does not drop a real category each time.
english_list = [name for name in english_list if name.strip()]
for category in english_list:
    # 1 if the channel's category list contains this category, 0 otherwise.
    file[category] = [int(category in channel_cats) for channel_cats in file['Categories']]
display(file.head())

# Rescale likes / comments / visits in the main dataframe so they can be plotted as log-values.
# Likes and Visits are divided by 10,000 and Comments by 1,000; the small offsets keep
# zero counts strictly positive so that the log taken later is defined.
# NOTE: the columns are overwritten in place, so re-running this cell rescales them again.
file['Likes'] = (file['Likes'] + 0.01) / 10000
file['Comments'] = (file['Comments'] + 0.001) / 1000
file['Visits'] = (file['Visits'] + 0.001) / 10000

# Look at distribution of numerical variables:
dist_list = ['Subscribers (millions)', 'Likes', 'Comments', 'Visits']
dist_df = file[dist_list]

# Titles state the units actually produced by the rescaling above.
sns.boxplot(dist_df)
plt.title('Distribution of Subs (in millions), Likes (10,000s), Comments (1000s), & Visits (10,000s)')
plt.show()

print('\n The data is extremely skewed! \n')

# Same plot on a logarithmic y-axis, which spreads out the skewed values.
sns.boxplot(dist_df)
plt.title('Distribution of Subs (in millions), Likes (10,000s), Comments (1000s), & Visits (10,000s) on Log scale')
plt.yscale('log')
plt.show()

print('\n Will use log distribution when plotting likes/comments! \n')



# Categories: Counts and AVG Subscribers by category

In [None]:
# Flatten the per-channel category lists into one long column: one row per
# (channel, category) pair. This has to be a separate dataframe, because a channel
# with several categories contributes several rows.
cat_list = [category for channel_cats in file['Categories'] for category in channel_cats]
cat_counts = pd.DataFrame({'Categories': cat_list})

# Categories ordered from most to least frequent, used as the bar order.
cat_count_ordered_list = cat_counts['Categories'].value_counts().index

sns.countplot(x=cat_counts['Categories'], order=cat_count_ordered_list)
plt.xticks(rotation=90)
plt.title('Channel Counts of Each Category')
plt.xlabel(None)
plt.show()

# Collect, for each indicator column, the subscriber counts of the channels flagged with 1.
# NOTE(review): positions 10-32 are presumably the category dummy columns; this relies on
# the exact column layout of `file` -- confirm if columns are added or removed upstream.
loop_dict = {}
for position in range(10, 33):
    column_name = file.columns[position]
    flagged = file[column_name] == 1
    loop_dict[column_name] = file.loc[flagged, 'Subscribers (millions)']
# Add all channels as a reference group.
loop_dict['Overall'] = file['Subscribers (millions)']

# One column per group (aligned on the channel index); bars are ordered by mean.
cats = pd.DataFrame(loop_dict)
cat_order = cats.mean().sort_values().index
ov_mean = loop_dict['Overall'].mean()


sns.barplot(cats, order=list(cat_order), color='r')
plt.xticks(rotation=90)
plt.title('Average Subscribers by Category')
plt.ylabel('AVG Subscribers (Millions)')
# Dashed reference line at the overall average.
plt.hlines(ov_mean, xmin=0, xmax=24, linestyles='dashed')
plt.annotate('Avg subs', xy=(0, ov_mean), size=10)
plt.show()

**Only five categories outperform the overall average: Toys, Music & Dance, Education, Video Games, and Animation!**

# Now by Country!

In [None]:
# Countries ordered from most to least channels, used as the bar order.
country_counts = file['Country'].value_counts().index

sns.countplot(x=file['Country'], order=country_counts)
plt.xticks(rotation=90)
plt.title('Channel Counts of Each Country')
plt.xlabel(None)
plt.show()

# Countries ordered by their mean subscriber count (ascending).
avg_subs_per_country_list = file.groupby('Country')['Subscribers (millions)'].mean().sort_values().index

sns.barplot(file, x='Country', y='Subscribers (millions)', order=list(avg_subs_per_country_list), color='r')
plt.xticks(rotation=90)
# This plot is grouped by country, so the title says so.
plt.title('Average Subscribers by Country')
plt.ylabel('AVG Subscribers (Millions)')
# Dashed reference line at the overall average (ov_mean is computed in the category cell).
plt.hlines(ov_mean, xmin=0, xmax=29, linestyles='dashed')
plt.annotate('Avg subs', xy=(0, ov_mean + 1), size=10)
plt.show()

**Once again, only a few countries exceed the average subs / channel! Likely the large number of channels from India and from 'Unknown' countries, together with their relatively high subscriber counts, is pulling up the overall average.**

# Test the Pareto Distribution! Do 20% of the channels have 80% of the Subscribers?


The Pareto distribution is a principle about how the top few percent possess or produce the majority of a given resource -- perhaps in this case, something like YT subscribers?

One name for this is the "80-20" rule, i.e. that 80% of the given resource (the principle was originally discovered in terms of wealth) is held by only 20% of people.

[Follow this link to see a discussion of this principle, & as a source!](https://dlab.berkeley.edu/news/explaining-80-20-rule-pareto-distribution#:~:text=The%20Pareto%20distribution%20is%20a%20power%2Dlaw%20probability%20distribution%2C%20and,sloped%20(see%20Figure%201).)

One wrinkle here is that we are looking at the top 1000 channels, not all the channels on YouTube.

In [None]:
# NOTE(review): everything below treats the row order as the subscriber ranking --
# presumably the CSV is already sorted by subscribers, descending; confirm.
# The group sizes are derived from the data instead of being hard-coded for 1000 rows.
n_top = len(file) // 5  # number of channels in the "top 20%" (200 for 1000 rows)

plt.bar(x=file.index, height=file['Subscribers (millions)'], edgecolor='b', color='b')
plt.vlines(n_top, ymin=0, ymax=250, color='r')
plt.annotate('Top 20%', xy=(10, 250), size=10)
plt.annotate('Lower 80%', xy=(250, 250), size=10)
plt.title('Distribution of Subs in order of rank')
plt.ylabel('Subscribers (millions)')
plt.xlabel('Rank')
plt.show()

# Running total of subscribers in row order.
file['Subscribers cumulative (mil)'] = file['Subscribers (millions)'].cumsum()
cumulative = file['Subscribers cumulative (mil)']

twenty = cumulative.iloc[n_top - 1]  # subscribers held by the top 20% of rows
total = cumulative.iloc[-1]          # subscribers held by all rows
eighty = total - twenty

# With y=..., the ECDF puts the proportion of channels on the x-axis and the
# cumulative subscriber total on the y-axis.
sns.displot(y=cumulative, kind='ecdf')
plt.vlines(0.2, ymin=0, ymax=total, color='r')
plt.annotate('Top 20%', xy=(0, 21000), size=10)
plt.annotate('Lower 80%', xy=(0.21, 21000), size=10)
plt.xlabel('Proportion of Channels included in Cumulative Subscribers')
plt.title('Cumulative Subscribers over top 1000 YT channels')
plt.show()

print(f'Number of Subscribers from top {n_top} channels (in millions): ', round(twenty, 3))
# Scale to percent BEFORE rounding; rounding first and then multiplying by 100
# prints float artifacts such as 40.300000000000004.
print('% of total: ', round(twenty / total * 100, 1))

**Answer:**

No! Here, the top 20% of the top 1000 YT channels hold only about 40% of the subscribers, not 80%!

# What about comments and likes?

Let's look at how engagement per video tracks with total subscribers!

Note that, before the log transformation, subscribers are in millions, likes and visits are in units of 10,000, and comments are in units of 1,000.

In [None]:
# Add a natural-log version of each numeric column; the plots below use these.
# Maps new column name -> source column.
log_sources = {
    'log Likes': 'Likes',
    'log Comments': 'Comments',
    'log Subs': 'Subscribers (millions)',
    'log Visits': 'Visits',
}
for log_name, source_name in log_sources.items():
    file[log_name] = np.log(file[source_name])

In [None]:
# One scatter plot with a dashed regression line per pair of log-scaled columns,
# annotated with the Pearson correlation of the pair.
# Each entry: (x column, y column, plot title, annotation position in data coordinates).
pair_specs = [
    ('log Subs', 'log Likes', 'Likes per video vs. Subscribers (logarithmic scale)', (5, 0.1)),
    ('log Subs', 'log Comments', 'Comments per video vs. Subscribers (logarithmic scale)', (4.5, -7.5)),
    ('log Subs', 'log Visits', 'Visits per video vs. Subscribers (logarithmic scale)', (4.75, 2)),
    ('log Likes', 'log Comments', 'Comments per video vs. Likes (logarithmic scale)', (-12, -10)),
    ('log Likes', 'log Visits', 'Visits per video vs. Likes (logarithmic scale)', (-12, -10)),
]

for x_col, y_col, plot_title, label_xy in pair_specs:
    # Off-diagonal entry of the 2x2 correlation matrix is the pairwise correlation.
    corr = np.corrcoef(file[y_col], file[x_col])
    correlation = 'corr = ' + str(round(corr[0][1], 2))

    sns.regplot(file, x=x_col, y=y_col, ci=None, line_kws={'color': 'r', 'linestyle': 'dashed'})
    plt.title(plot_title)
    plt.annotate(correlation, xy=label_xy, size=10)
    plt.show()


**Per-video likes, visits and comments correlate well with each other -- but they do not correlate strongly with subscriber count!**