In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import date


In [2]:
# Load the video metadata CSV, using its first row as the header and its
# first column as the row index.
df = pd.read_csv('AI_ML_YT_Videos.csv', header=0, index_col=0)
df.head()

Unnamed: 0,Channel,Title,PublishedDate,Views,Likes,Comments
0,Jeff Heaton,LSTM-Based Time Series with PyTorch (10.2),2023-10-27,764,45,1
1,Jeff Heaton,"Time Series Data Encoding for Deep Learning, P...",2023-10-26,530,31,1
2,Jeff Heaton,Bayesian Hyperparameter Optimization for PyTor...,2023-10-25,453,29,1
3,Jeff Heaton,Creating Certificates to Deploy PyInstaller Py...,2023-10-17,439,12,0
4,Jeff Heaton,How Should you Architect Your PyTorch Neural N...,2023-10-12,825,39,1


In [3]:
# Number of rows (videos) in the full dataset.
df.shape[0]

6151

There are 6151 videos in the dataset

### Cutting down the dataset to include videos uploaded starting from 2018

In [9]:

# Parse 'PublishedDate' and keep only the calendar date. `.dt.date` yields
# plain `datetime.date` objects, so the column ends up with dtype `object`
# rather than `datetime64`.
published_dates = pd.to_datetime(df['PublishedDate']).dt.date
df['PublishedDate'] = published_dates


In [10]:
# Inspect the column dtypes after the date conversion.
df.dtypes

Channel          object
Title            object
PublishedDate    object
Views             int64
Likes             int64
Comments          int64
dtype: object

In [11]:
# Keep only videos published from 1 January 2018 onwards. 'PublishedDate'
# holds `datetime.date` objects at this point, so the cutoff is a `date` too.
specific_date = date(2018, 1, 1)

# `>=` keeps videos published on the cutoff day itself ("starting from 2018").
# `.copy()` makes the subset an independent frame, so columns assigned to it
# later do not trigger pandas' SettingWithCopyWarning.
df_subset = df[df['PublishedDate'] >= specific_date].copy()
df_subset

Unnamed: 0,Channel,Title,PublishedDate,Views,Likes,Comments
0,Jeff Heaton,LSTM-Based Time Series with PyTorch (10.2),2023-10-27,764,45,1
1,Jeff Heaton,"Time Series Data Encoding for Deep Learning, P...",2023-10-26,530,31,1
2,Jeff Heaton,Bayesian Hyperparameter Optimization for PyTor...,2023-10-25,453,29,1
3,Jeff Heaton,Creating Certificates to Deploy PyInstaller Py...,2023-10-17,439,12,0
4,Jeff Heaton,How Should you Architect Your PyTorch Neural N...,2023-10-12,825,39,1
...,...,...,...,...,...,...
6141,Krish Naik,Principle Component Analysis (PCA) using sklea...,2018-07-02,190122,3316,149
6142,Krish Naik,PySpark Tutorial for Beginners | Apache Spark ...,2018-06-06,39462,613,42
6143,Krish Naik,Creating a Dataset and training an Artificial ...,2018-02-25,25082,244,34
6144,Krish Naik,Artificial Neural Network Intuition,2018-02-24,36519,233,7


In [12]:
# Persist the filtered subset to disk. `index` is left at its default, so the
# row index is written as the first CSV column.
df_subset.to_csv('dataset.csv')

The subset contains 4,081 rows, roughly 2,000 fewer than the original dataset's 6,151.

In [31]:
# List the column labels of the subset.
df_subset.columns

Index(['Channel', 'Title', 'PublishedDate', 'Views', 'Likes', 'Comments'], dtype='object')

In [33]:
# Inspect the column dtypes of the subset.
df_subset.dtypes

Channel          object
Title            object
PublishedDate    object
Views             int64
Likes             int64
Comments          int64
dtype: object

In [17]:
# Count missing values per column.
df_subset.isnull().sum()

Channel          0
Title            0
PublishedDate    0
Views            0
Likes            0
Comments         0
dtype: int64

There are no missing values in the dataset

Which channel has the oldest video?

In [34]:
# Order the subset chronologically (oldest upload first).
sorted_df_asc = df_subset.sort_values('PublishedDate', ascending=True)
sorted_df_asc

Unnamed: 0,Channel,Title,PublishedDate,Views,Likes,Comments
1622,Siraj Raval,How Does Cardano Work?,2018-01-04,59416,1735,224
1621,Siraj Raval,Keras Explained,2018-01-06,244875,4153,231
832,Daniel Bourke,Computer Vision Basics + More deeplearning.ai ...,2018-01-07,659,22,10
3527,Sentdex,Deep Learning - Halite II 2017 Artificial Inte...,2018-01-09,13830,364,38
3526,Sentdex,Training Data - Halite II 2017 Artificial Inte...,2018-01-10,9682,241,55
...,...,...,...,...,...,...
2,Jeff Heaton,Bayesian Hyperparameter Optimization for PyTor...,2023-10-25,453,29,1
2842,DeepLearningAI,Mitigating LLM Hallucinations with a Metrics-F...,2023-10-26,8474,457,18
1,Jeff Heaton,"Time Series Data Encoding for Deep Learning, P...",2023-10-26,530,31,1
4482,Krish Naik,Amazing Langchain Series With End To End Proje...,2023-10-27,15382,462,45


`Siraj Raval` has the two oldest videos in the subset, titled `How Does Cardano Work?` and `Keras Explained`. Let us now check which channel has the newest upload in the dataset.

In [35]:
# Order the subset reverse-chronologically (newest upload first).
sorted_df_desc = df_subset.sort_values('PublishedDate', ascending=False)
sorted_df_desc

Unnamed: 0,Channel,Title,PublishedDate,Views,Likes,Comments
0,Jeff Heaton,LSTM-Based Time Series with PyTorch (10.2),2023-10-27,764,45,1
4482,Krish Naik,Amazing Langchain Series With End To End Proje...,2023-10-27,15382,462,45
2842,DeepLearningAI,Mitigating LLM Hallucinations with a Metrics-F...,2023-10-26,8474,457,18
1,Jeff Heaton,"Time Series Data Encoding for Deep Learning, P...",2023-10-26,530,31,1
2,Jeff Heaton,Bayesian Hyperparameter Optimization for PyTor...,2023-10-25,453,29,1
...,...,...,...,...,...,...
3526,Sentdex,Training Data - Halite II 2017 Artificial Inte...,2018-01-10,9682,241,55
3527,Sentdex,Deep Learning - Halite II 2017 Artificial Inte...,2018-01-09,13830,364,38
832,Daniel Bourke,Computer Vision Basics + More deeplearning.ai ...,2018-01-07,659,22,10
1621,Siraj Raval,Keras Explained,2018-01-06,244875,4153,231


`Jeff Heaton` and `Krish Naik` have the newest video uploads.

### Which channel has the highest total number of video likes? 

In [42]:
# Total likes per channel, largest first.
result = (
    df_subset
    .groupby('Channel')['Likes']
    .sum()
    # Back to a flat frame, naming the summed column 'TotalLikes'.
    .reset_index(name='TotalLikes')
    .sort_values('TotalLikes', ascending=False)
)
result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['PublishedDate'] = pd.to_datetime(df_subset['PublishedDate'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['Year'] = df_subset['PublishedDate'].dt.year


Unnamed: 0,Channel,TotalLikes
7,Krish Naik,2024462
10,Siraj Raval,634183
9,Sentdex,583985
8,Nicholas Renotte,415762
3,Daniel Bourke,218351
0,Applied AI Course,94809
5,Jeff Heaton,93613
4,DeepLearningAI,72420
6,Jeremy Howard,68573
2,Arxiv Insights,57618


`Krish Naik` has the highest total likes count, followed by `Siraj Raval`, and `Sentdex`

Now, we group by the year as well:


In [51]:



# 'PublishedDate' was stored earlier as plain `datetime.date` objects (object
# dtype), which do not support the `.dt` accessor. Parse it back to
# datetime64 first, then derive the upload year from it. `assign` returns a
# new frame, so this cell is idempotent and never writes into a slice of `df`.
df_subset = df_subset.assign(
    PublishedDate=lambda frame: pd.to_datetime(frame['PublishedDate']),
    Year=lambda frame: frame['PublishedDate'].dt.year,
)

# Total likes per (channel, year) pair, as a flat frame.
result = (
    df_subset
    .groupby(['Channel', 'Year'])['Likes']
    .sum()
    .reset_index(name='TotalLikes')
)

# Largest totals first; ties are broken by the earlier year.
result = result.sort_values(by=['TotalLikes', 'Year'], ascending=[False, True])
result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['PublishedDate'] = pd.to_datetime(df_subset['PublishedDate'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['Year'] = df_subset['PublishedDate'].dt.year


Unnamed: 0,Channel,Year,TotalLikes
35,Krish Naik,2019,661132
36,Krish Naik,2020,570601
37,Krish Naik,2021,397369
51,Siraj Raval,2018,378872
38,Krish Naik,2022,287468
52,Siraj Raval,2019,164823
42,Nicholas Renotte,2021,145190
46,Sentdex,2019,144138
45,Sentdex,2018,139236
47,Sentdex,2020,111730


This table contains the correct information; however, we could convey the data in a better way.

In [52]:

# Reshape to one row per channel and one column per year. Channel/year
# combinations that do not occur in `result` are filled with 0.
pivoted_result = (
    result
    .pivot_table(index='Channel', columns='Year', values='TotalLikes', fill_value=0)
    # Turn 'Channel' back into a regular column.
    .reset_index()
    # Drop the leftover 'Year' label from the columns axis.
    .rename_axis(None, axis='columns')
)

pivoted_result






Unnamed: 0,Channel,2018,2019,2020,2021,2022,2023
0,Applied AI Course,5884.0,13867.0,33571.0,34628.0,6753.0,106.0
1,Artificial Intelligence — All in One,1161.0,0.0,0.0,0.0,0.0,0.0
2,Arxiv Insights,42755.0,11357.0,0.0,3506.0,0.0,0.0
3,Daniel Bourke,17130.0,42258.0,71603.0,47130.0,37551.0,2679.0
4,DeepLearningAI,8973.0,3815.0,11558.0,13824.0,16248.0,18002.0
5,Jeff Heaton,1058.0,26463.0,18581.0,22048.0,14403.0,11060.0
6,Jeremy Howard,6938.0,7699.0,18395.0,3594.0,16744.0,15203.0
7,Krish Naik,8266.0,661132.0,570601.0,397369.0,287468.0,99626.0
8,Nicholas Renotte,0.0,1208.0,68956.0,145190.0,101756.0,98652.0
9,Sentdex,139236.0,144138.0,111730.0,58175.0,93657.0,37049.0


Some observations:
- The channel `Artificial Intelligence — All in One` shows no likes after 2018, which makes sense since the channel stopped uploading videos that same year.
- `Nicholas Renotte` joined YouTube on January 26th, 2019; hence there are no likes in 2018.
- Similarly, the zero like counts for `Arxiv Insights` (2020, 2022 and 2023) indicate that the channel had no video uploads in those years.

In [53]:
# Confirm that the subset now carries the derived 'Year' column.
df_subset.head(5)

Unnamed: 0,Channel,Title,PublishedDate,Views,Likes,Comments,Year
0,Jeff Heaton,LSTM-Based Time Series with PyTorch (10.2),2023-10-27,764,45,1,2023
1,Jeff Heaton,"Time Series Data Encoding for Deep Learning, P...",2023-10-26,530,31,1,2023
2,Jeff Heaton,Bayesian Hyperparameter Optimization for PyTor...,2023-10-25,453,29,1,2023
3,Jeff Heaton,Creating Certificates to Deploy PyInstaller Py...,2023-10-17,439,12,0,2023
4,Jeff Heaton,How Should you Architect Your PyTorch Neural N...,2023-10-12,825,39,1,2023


In [56]:
# Summary statistics for the subset's columns.
df_subset.describe()


Unnamed: 0,PublishedDate,Views,Likes,Comments,Year
count,4081,4081.0,4081.0,4081.0,4081.0
mean,2020-11-14 13:23:27.008086016,44304.36,1045.071551,74.880176,2020.399167
min,2018-01-04 00:00:00,0.0,0.0,0.0,2018.0
25%,2019-09-14 00:00:00,5028.0,129.0,11.0,2019.0
50%,2020-11-01 00:00:00,14334.0,376.0,33.0,2020.0
75%,2022-01-21 00:00:00,40660.0,1041.0,84.0,2022.0
max,2023-10-27 00:00:00,2689040.0,64750.0,3478.0,2023.0
std,,106538.1,2465.575101,147.49798,1.499081


In [57]:
# Render the 'Views' summary statistics as whole numbers with thousands
# separators (the values become strings).
views_stats_formatted = df_subset.describe()['Views'].map('{:,.0f}'.format)
views_stats_formatted

count        4,081
mean        44,304
min              0
25%          5,028
50%         14,334
75%         40,660
max      2,689,040
std        106,538
Name: Views, dtype: object

Observations:
 
 - Average view count per video is 44,304 views.
 - The maximum views count for a video in the dataset is ~2,700,000 views.
 - The median video gets 14,334 views, i.e. half of the videos in the dataset have fewer views than that.
 - A minimum of 0 views is highly implausible for a published video, so let us investigate!

In [58]:
# List the videos reported with zero views. The mask is built from
# `df_subset` itself so that it shares the subset's index; a mask taken from
# the full `df` only works while pandas can realign it to the subset.
df_subset.loc[df_subset['Views'] == 0]

Unnamed: 0,Channel,Title,PublishedDate,Views,Likes,Comments,Year
665,Daniel Bourke,Are courses enough for a job? | Machine learni...,2020-10-04,0,0,0,2020
1360,Siraj Raval,Siraj Raval Live Stream,2021-12-27,0,0,0,2021
1493,Siraj Raval,All Hands Meeting,2018-10-25,0,45,0,2018
1501,Siraj Raval,Quantum Machine Learning (LIVE),2018-10-12,0,19,0,2018
1502,Siraj Raval,Quantum Machine Learning (LIVE),2018-10-12,0,14,0,2018
1505,Siraj Raval,Quantum Machine Learning (LIVE),2018-10-09,0,121,0,2018
2019,Nicholas Renotte,Nicholas Renotte Live Stream,2021-01-04,0,0,0,2021
2178,Jeremy Howard,Jeremy Howard Live Stream,2022-04-23,0,0,0,2022
3028,DeepLearningAI,DeepLearningAI Live Stream,2021-03-23,0,0,0,2021
3483,Sentdex,"Sentdex Live - Hangout, news, hackerrank",2018-06-02,0,0,0,2018


It seems that the YouTube API does not report view counts or comment counts for these videos, most of which are livestreams.

In [59]:
# Re-display the first rows of the subset.
df_subset.head()

Unnamed: 0,Channel,Title,PublishedDate,Views,Likes,Comments,Year
0,Jeff Heaton,LSTM-Based Time Series with PyTorch (10.2),2023-10-27,764,45,1,2023
1,Jeff Heaton,"Time Series Data Encoding for Deep Learning, P...",2023-10-26,530,31,1,2023
2,Jeff Heaton,Bayesian Hyperparameter Optimization for PyTor...,2023-10-25,453,29,1,2023
3,Jeff Heaton,Creating Certificates to Deploy PyInstaller Py...,2023-10-17,439,12,0,2023
4,Jeff Heaton,How Should you Architect Your PyTorch Neural N...,2023-10-12,825,39,1,2023


### Video title analysis using NLTK

In [1]:
import nltk
# NOTE(review): called without arguments, `nltk.download()` opens NLTK's
# interactive downloader, which would stall a non-interactive "Run All".
# Consider naming the required resources explicitly — TODO confirm which
# resources the title analysis needs.
nltk.download()