In [20]:
import pandas as pd
import isodate
from dateutil import parser

from googleapiclient.discovery import build
from IPython.display import JSON

from config.config import YOUTUBE_API_KEY
from api_methods import collect_channel_stats, collect_video_ids, collect_video_details

In [2]:
# Get credentials and create an API client
api_service_name = "youtube"
api_version = "v3"

youtube = build(api_service_name, api_version, developerKey=YOUTUBE_API_KEY)

In [3]:
# Channels to analyze
channel_ids = [
    "UCYO_jab_esuFRV4b17AJtAw",
    # Can add more channels
]

In [4]:
channel_stats = collect_channel_stats(youtube, channel_ids)

In [12]:
channel_stats.head()

Unnamed: 0,channelName,subscribers,views,totalVideos,playlistId
0,3Blue1Brown,5720000,408569095,156,UUYO_jab_esuFRV4b17AJtAw


In [6]:
playlist_id = channel_stats.loc[channel_stats['channelName']== "3Blue1Brown", "playlistId"].iloc[0]

In [7]:
video_ids = collect_video_ids(youtube, playlist_id)

In [8]:
len(video_ids)

157

In [23]:
video_df = collect_video_details(youtube, video_ids)

In [24]:
video_df.head()

Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption
0,tjIOqIr80ns,3Blue1Brown,The limit of limiting arguments,A link to the full video is at the bottom of t...,"[Mathematics, three blue one brown, 3 blue 1 b...",2023-12-23T00:00:26Z,485539,27741,,239,PT51S,hd,False
1,W1gW1dHRsOw,3Blue1Brown,For anyone who might not know how links in sho...,YouTube disabled links in descriptions and com...,"[Mathematics, three blue one brown, 3 blue 1 b...",2023-12-21T00:15:02Z,145397,10902,,309,PT1M,hd,False
2,p9i3cYMQtBY,3Blue1Brown,Infinite Lighthouses and π,A link to the full video is at the bottom of t...,"[Mathematics, three blue one brown, 3 blue 1 b...",2023-12-21T00:00:20Z,601220,45647,,277,PT1M,hd,False
3,X4jpqCu-wlA,3Blue1Brown,Can you even imagine 2^256?,Originally written as a supplement to an expla...,"[Mathematics, three blue one brown, 3 blue 1 b...",2023-12-16T00:00:24Z,787794,49585,,465,PT1M,hd,False
4,GOSezO0CHss,3Blue1Brown,Order from chaos,A link to the full video on the Central Limit ...,"[Mathematics, three blue one brown, 3 blue 1 b...",2023-12-15T00:00:13Z,6960124,194125,,1187,PT50S,hd,False


In [25]:
# Check for NULL values
video_df.isnull().any()

video_id          False
channelTitle      False
title             False
description       False
tags               True
publishedAt       False
viewCount         False
likeCount         False
favouriteCount     True
commentCount      False
duration          False
definition        False
caption           False
dtype: bool

In [26]:
# Check data types
video_df.dtypes

video_id          object
channelTitle      object
title             object
description       object
tags              object
publishedAt       object
viewCount         object
likeCount         object
favouriteCount    object
commentCount      object
duration          object
definition        object
caption           object
dtype: object

In [27]:
# Convert count columns to numeric
numeric_cols = ['viewCount', 'likeCount', 'favouriteCount', 'commentCount']
video_df[numeric_cols] = video_df[numeric_cols].apply(pd.to_numeric, errors = 'coerce', axis = 1)

In [28]:
# Find published day
video_df['publishedAt'] = video_df['publishedAt'].apply(lambda x: parser.parse(x)) 
video_df['pushblishDayName'] = video_df['publishedAt'].apply(lambda x: x.strftime("%A"))

In [29]:
# convert duration to seconds
video_df['durationSecs'] = video_df['duration'].apply(lambda x: isodate.parse_duration(x))
video_df['durationSecs'] = video_df['durationSecs'].astype('timedelta64[s]')

In [30]:
video_df[['durationSecs', 'duration']] 

Unnamed: 0,durationSecs,duration
0,0 days 00:00:51,PT51S
1,0 days 00:01:00,PT1M
2,0 days 00:01:00,PT1M
3,0 days 00:01:00,PT1M
4,0 days 00:00:50,PT50S
...,...,...
152,0 days 00:15:08,PT15M8S
153,0 days 00:07:27,PT7M27S
154,0 days 00:01:49,PT1M49S
155,0 days 00:01:04,PT1M4S
