# [YouTube Data API v3 reference](https://developers.google.com/youtube/v3/docs)
***
# Imports

In [130]:
import os
from dotenv import load_dotenv
import requests
from datetime import datetime, timezone
import pandas as pd
import numpy as np
import math

***
# Get API key

In [89]:
# Load environment variables from the local yt_api_key.env file so the API key
# (read with os.getenv in the next cell) never has to be hard-coded in the notebook.
load_dotenv("yt_api_key.env")

True

In [90]:
# Read the API key from the environment.
# os.getenv returns None when API_KEY is not set, so a missing key is not reported here.
api_key = os.getenv("API_KEY")

***
# Most popular videos
## Extract

The API returns at most 50 videos per call; the most-popular chart yields up to 200 videos in total, spread across 4 pages.

In [None]:
# Endpoint of the YouTube Data API v3 `videos` resource, queried by the extraction loop below.
videos_api_url = "https://www.googleapis.com/youtube/v3/videos"

In [91]:
def parse_video_json(video):
    """Flatten one video resource from the API response into a row of values.

    Parameters
    ----------
    video : dict
        A single item carrying the `id`, `snippet`, `contentDetails`,
        `status` and `statistics` parts.

    Returns
    -------
    list
        [video_id, publish_datetime, channel_id, video_title, video_description,
        channel_title, num_tags, category_id, duration, licensed_content,
        made_for_kids, num_views, num_likes, num_comments], in that order.
        `num_tags` is 0 when the snippet has no `tags` key; `num_likes` and
        `num_comments` are None when the statistics omit them.

    Raises
    ------
    KeyError
        If any other field read below is missing from `video`.
    """
    row = [video["id"]]

    # snippet: mandatory text fields first, then the optional tag list
    snippet = video["snippet"]
    for key in ("publishedAt", "channelId", "title", "description", "channelTitle"):
        row.append(snippet[key])
    row.append(len(snippet.get("tags", [])))  # tags can be absent
    row.append(snippet["categoryId"])

    # contentDetails
    content_details = video["contentDetails"]
    row.append(content_details["duration"])
    row.append(content_details["licensedContent"])

    # status
    row.append(video["status"]["madeForKids"])

    # statistics: only the view count is read as mandatory
    statistics = video["statistics"]
    row.append(statistics["viewCount"])
    row.append(statistics.get("likeCount"))     # can be absent
    row.append(statistics.get("commentCount"))  # can be absent

    return row

In [92]:
# Empty frame that the extraction loop appends to. The columns are the request
# timestamp followed by the values returned by parse_video_json, in the same order.
# Column labels must be unique: later cells select single columns by name.
video_df = pd.DataFrame(columns=["request_datetime",
                                 "video_id",
                                 "publish_datetime",
                                 "channel_id",
                                 "video_title",
                                 "video_description",
                                 "channel_title",
                                 "num_tags",
                                 "category_id",
                                 "duration",
                                 "licensed_content",
                                 "made_for_kids",
                                 "num_views",
                                 "num_likes",
                                 "num_comments"])
video_df

Unnamed: 0,request_datetime,video_id,publish_datetime,channel_id,video_title,video_description,channel_title,num_tags,category_id,duration,licensed_content,made_for_kids,num_views,num_likes,num_comments


In [None]:
# Page through the mostPopular chart, 50 videos per request, until the response
# no longer carries a nextPageToken (or a request fails).
pageToken = ""  # start with an empty token for the first request

while pageToken is not None:
    params = {
        "key": api_key,
        "part": "id, snippet, contentDetails, status, statistics",
        "chart": "mostPopular",
        "hl": "en",
        "regionCode": "US",
        "maxResults": 50,
        "pageToken": pageToken
    }

    request_datetime = datetime.now(timezone.utc) # YT API uses UTC timezone
    # timeout keeps a stalled connection from hanging the notebook indefinitely
    response = requests.get(videos_api_url, params=params, timeout=30)

    if response.status_code == 200:
        response_json = response.json()

        for video in response_json["items"]:
            # add video details and datetime of request to end of video dataframe
            video_df.loc[len(video_df)] = [request_datetime] + parse_video_json(video)

        # get nextPageToken with null safety; None ends the loop after the last page
        pageToken = response_json.get("nextPageToken", None)

    else:
        # report the failure and stop paging; rows collected so far are kept
        print(f"response.status_code = {response.status_code}")
        pageToken = None
    

In [94]:
# Display the populated frame to eyeball the extracted rows.
video_df

Unnamed: 0,request_datetime,video_id,publish_datetime,channel_id,video_title,video_description,channel_title,num_tags,category_id,duration,licensed_content,made_for_kids,num_views,num_likes,num_comments
0,2024-04-23 20:19:25.669312+00:00,cen0rBKLuYE,2024-04-22T12:55:17Z,UCA3-nIYWu4PTWkb6NwhEpzg,Deadpool & Wolverine | Trailer,Found the guy who killed Bambi’s mom. #Deadpoo...,Ryan Reynolds,11,24,PT2M39S,True,False,22370042,1211589,51681
1,2024-04-23 20:19:25.669312+00:00,hN6C7cDNSQo,2024-04-23T05:12:59Z,UCU7iRrk3xfpUk0R6VdyC1Ow,Inside the NBA Reacts To Jamal Murray's Game-W...,Watch highlights from Inside the NBA with Shaq...,NBA on TNT,19,17,PT10M52S,True,False,921566,12960,3174
2,2024-04-23 20:19:25.669312+00:00,U6UpFiaD7Po,2024-04-23T04:46:26Z,UCWJ2lWNubArHWmf3FIHbfcQ,#7 LAKERS at #2 NUGGETS | FULL GAME 2 HIGHLIGH...,"Never miss a moment with the latest news, tren...",NBA,4,17,PT9M59S,True,False,1588798,20587,5289
3,2024-04-23 20:19:25.669312+00:00,3iFYULNTznI,2024-04-22T17:20:37Z,UCircJf-FlToBNjuy9xG-i5A,An Update,,Watcher,0,24,PT3M40S,False,False,1305959,98372,28692
4,2024-04-23 20:19:25.669312+00:00,lPuGVHsTMn0,2024-04-21T08:08:55Z,UCurvRE5fGcdUgCYWgh-BDsg,THREE KNOCKDOWNS | Devin Haney vs. Ryan Garcia...,"April 20, 2024 -- Devin Haney vs. Ryan Garcia ...",DAZN Boxing,12,17,PT11M25S,True,False,10107261,164296,24835
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,2024-04-23 20:19:27.364790+00:00,D2eNL_CrclY,2024-04-18T15:00:12Z,UC0ZV6M2THA81QT9hrVWJG3A,Apex Legends: Urban Assault Collection Event T...,Watch Newcastle and Wraith give you the play-b...,Apex Legends,27,20,PT1M29S,True,False,381754,11286,893
196,2024-04-23 20:19:27.364790+00:00,8UxRQOSFsuc,2024-04-17T20:15:00Z,UCxcTeAKWJca6XyJ37_ZoKIQ,Caitlin Clark Joins The Pat McAfee Show After ...,Welcome to The Pat McAfee Show LIVE from Noon-...,The Pat McAfee Show,26,17,PT19M45S,True,False,827716,17289,1453
197,2024-04-23 20:19:27.364790+00:00,3kurrR9K8cc,2024-04-18T11:00:03Z,UCgUvk6jVaf-1uKOqG8XNcaQ,2025 Toyota Camry | The Forever Car Done Right,We do a first drive review on the new 2025 Toy...,savagegeese,37,2,PT17M6S,True,False,291779,8747,1539
198,2024-04-23 20:19:27.364790+00:00,FYQHXZj4JB0,2024-04-18T01:33:02Z,UCsQBsZJltmLzlsJNG7HevBg,"Tyler, The Creator - RUNNING OUT OF TIME (feat...",,"Tyler, The Creator",0,2,PT4M,False,False,408867,39647,1622


In [95]:
# Count missing values per column to see which fields came back empty.
video_df.isna().sum()

request_datetime     0
video_id             0
publish_datetime     0
channel_id           0
video_title          0
video_description    0
channel_title        0
num_tags             0
category_id          0
duration             0
licensed_content     0
made_for_kids        0
num_views            0
num_likes            0
num_comments         1
dtype: int64

## Transform

In [96]:
# Inspect the current dtypes to see which columns still need converting.
video_df.dtypes

request_datetime     datetime64[ns, UTC]
video_id                          object
publish_datetime                  object
channel_id                        object
video_title                       object
video_description                 object
channel_title                     object
num_tags                           int64
category_id                       object
duration                          object
licensed_content                    bool
made_for_kids                       bool
num_views                         object
num_likes                         object
num_comments                      object
dtype: object

In [97]:
# Show one random row as a concrete example of the raw values.
video_df.sample(1)

Unnamed: 0,request_datetime,video_id,publish_datetime,channel_id,video_title,video_description,channel_title,num_tags,category_id,duration,licensed_content,made_for_kids,num_views,num_likes,num_comments
45,2024-04-23 20:19:25.669312+00:00,a6CtwgwuAcE,2024-04-22T14:26:50Z,UCs95gwav7frv5HTVqAGa7uQ,Deadpool & Wolverine | Official Trailer | In T...,Deadpool & Wolverine | Official Trailer | In T...,Tyrone Magnus,25,24,PT5M3S,True,False,155123,9371,439


### Turn publish_datetime into datetime

In [98]:
# Turn publish_datetime from ISO 8601 strings (e.g. 2024-04-22T12:55:17Z) into datetimes.
video_df["publish_datetime"] = pd.to_datetime(video_df["publish_datetime"], format="ISO8601")

### Parse duration in seconds from duration

In [99]:
# Look at the raw duration strings (e.g. PT2M39S) before converting them.
video_df["duration"]

0       PT2M39S
1      PT10M52S
2       PT9M59S
3       PT3M40S
4      PT11M25S
         ...   
195     PT1M29S
196    PT19M45S
197     PT17M6S
198        PT4M
199     PT4M38S
Name: duration, Length: 200, dtype: object

In [100]:
# Convert the duration strings to whole seconds. total_seconds() is used instead of
# Timedelta.seconds because .seconds only holds the sub-day component and would
# silently drop whole days for videos lasting 24 hours or longer.
video_df["duration"] = pd.to_timedelta(video_df["duration"]).dt.total_seconds().astype("int64")
# The values are now seconds, so make the unit explicit in the column name.
# NOTE: this cell is not re-runnable on its own; after the rename "duration" no longer exists.
video_df.rename(columns={"duration": "duration_seconds"}, inplace=True)

### Turn num_views, num_likes, and num_comments into ints

In [101]:
# Count missing values in the count columns before they are filled and cast to int.
video_df[["num_views", "num_likes", "num_comments"]].isna().sum()

num_views       0
num_likes       0
num_comments    1
dtype: int64

In [102]:
# Replace missing counts with 0, then re-check that nothing is missing.
# NOTE(review): after this, a count the API did not return is indistinguishable from a real 0.
count_columns = ["num_views", "num_likes", "num_comments"]
video_df[count_columns] = video_df[count_columns].fillna(0)
video_df[count_columns].isna().sum()

num_views       0
num_likes       0
num_comments    0
dtype: int64

In [103]:
# Cast the count columns (object dtype at this point) to integers. The width is spelled
# out as int64 because a bare "int" maps to a 32-bit integer on some platforms
# (e.g. Windows with NumPy 1.x), which cannot hold counts above 2**31 - 1.
video_df[["num_views", "num_likes", "num_comments"]] = video_df[["num_views", "num_likes", "num_comments"]].astype(dtype="int64")

In [104]:
# Confirm the dtypes after the conversions above.
video_df.dtypes

request_datetime     datetime64[ns, UTC]
video_id                          object
publish_datetime     datetime64[ns, UTC]
channel_id                        object
video_title                       object
video_description                 object
channel_title                     object
num_tags                           int64
category_id                       object
duration_seconds                   int64
licensed_content                    bool
made_for_kids                       bool
num_views                          int64
num_likes                          int64
num_comments                       int64
dtype: object

## Load

***
# Channel metadata from top 200 videos
## Extract
The API accepts at most 50 channel IDs per call, and the results are not paginated.

In [107]:
# Distinct channels behind the collected videos (several videos can share a channel).
unique_channel_ids = video_df["channel_id"].unique()
# number of channels whose metadata has to be fetched
len(unique_channel_ids)

180

In [108]:
# Endpoint of the YouTube Data API v3 `channels` resource, queried by the channel loop below.
channels_api_url = "https://www.googleapis.com/youtube/v3/channels"

In [109]:
def parse_channel_json(channel):
    """Flatten one channel resource from the API response into a row of values.

    Parameters
    ----------
    channel : dict
        A single item carrying the `id`, `snippet` and `statistics` parts.

    Returns
    -------
    list
        [channel_id, channel_created_datetime, channel_total_views,
        channel_num_subscribers, channel_num_videos], in that order.
        Each of the three statistics is None when its key is absent.

    Raises
    ------
    KeyError
        If `id`, `snippet`, `snippet.publishedAt` or `statistics` is missing.
    """
    identifier = channel["id"]

    # snippet
    created_at = channel["snippet"]["publishedAt"]

    # statistics: every counter is optional
    stats = channel["statistics"]
    counters = [stats.get(key) for key in ("viewCount", "subscriberCount", "videoCount")]

    return [identifier, created_at, *counters]

In [132]:
# Size of the final, partial batch when the IDs are split into groups of 50.
len(unique_channel_ids) % 50

30

In [136]:
# https://stackoverflow.com/questions/312443/how-do-i-split-a-list-into-equally-sized-chunks
def chunks(lst, n):
    """Split a sliceable sequence into consecutive pieces of at most `n` items.

    Parameters
    ----------
    lst : sequence supporting len() and slicing
    n : int
        Maximum size of each piece; used as the step of a range().

    Returns
    -------
    list
        The slices of `lst` in order; the last one may be shorter than `n`.
        An empty `lst` gives an empty list.
    """
    pieces = []
    for start in range(0, len(lst), n):
        stop = start + n
        pieces.append(lst[start:stop])
    return pieces

# Split the channel IDs into batches of at most 50; the loop below issues one request per batch.
channels_chunked = chunks(unique_channel_ids, 50)
list(map(len, channels_chunked)) # chunks = # API calls

[50, 50, 50, 30]

In [139]:
# Empty frame that collects one row per channel: the request timestamp followed by
# the five values returned by parse_channel_json, in the same order.
channel_columns = [
    "request_datetime",
    "id",
    "created_datetime",
    "total_views",
    "num_subscribers",
    "num_videos",
]
channel_df = pd.DataFrame(columns=channel_columns)
channel_df

Unnamed: 0,request_datetime,id,created_datetime,total_views,num_subscribers,num_videos


In [140]:
# Issue one request per batch of channel IDs and append every returned channel to channel_df.
for channels in channels_chunked:
    params = {
        "key": api_key,
        "part": "id, snippet, statistics",
        "id": ", ".join(channels),
        "maxResults": 50,
    }

    request_datetime = datetime.now(timezone.utc) # YT API uses UTC timezone
    # timeout keeps a stalled connection from hanging the notebook indefinitely
    response = requests.get(channels_api_url, params=params, timeout=30)

    if response.status_code == 200:
        response_json = response.json()

        for channel in response_json["items"]:
            # add channel details and datetime of request to end of channel dataframe
            channel_df.loc[len(channel_df)] = [request_datetime] + parse_channel_json(channel)

    else:
        # a failed batch is only reported; its channels will be absent from channel_df
        print(f"response.status_code = {response.status_code}")

In [141]:
# Row count should match the number of unique channel IDs requested.
channel_df.shape

(180, 6)

## Transform

In [142]:
# Inspect the current dtypes to see which columns still need converting.
channel_df.dtypes

request_datetime    datetime64[ns, UTC]
id                               object
created_datetime                 object
total_views                      object
num_subscribers                  object
num_videos                       object
dtype: object

### Turn created_datetime into datetime

In [143]:
# Parse the ISO 8601 creation timestamps into datetimes.
channel_df["created_datetime"] = pd.to_datetime(channel_df["created_datetime"], format="ISO8601")

### Turn total_views, num_subscribers, num_videos into ints

In [144]:
# Count missing values in the count columns before they are filled and cast to int.
channel_df[["total_views", "num_subscribers", "num_videos"]].isna().sum()

total_views        0
num_subscribers    0
num_videos         0
dtype: int64

In [145]:
# Replace missing counts with 0, then re-check that nothing is missing.
# NOTE(review): after this, a count the API did not return is indistinguishable from a real 0.
channel_count_columns = ["total_views", "num_subscribers", "num_videos"]
channel_df[channel_count_columns] = channel_df[channel_count_columns].fillna(0)
channel_df[channel_count_columns].isna().sum()

total_views        0
num_subscribers    0
num_videos         0
dtype: int64

In [148]:
# Cast the count columns (object dtype at this point) to integers. The width is spelled
# out as int64 because a bare "int" maps to a 32-bit integer on some platforms
# (e.g. Windows with NumPy 1.x), and channel view totals can exceed 2**31 - 1.
channel_df[["total_views", "num_subscribers", "num_videos"]] = channel_df[["total_views", "num_subscribers", "num_videos"]].astype(dtype="int64")

In [149]:
# Confirm the dtypes after the conversions above.
channel_df.dtypes

request_datetime    datetime64[ns, UTC]
id                               object
created_datetime    datetime64[ns, UTC]
total_views                       int64
num_subscribers                   int64
num_videos                        int64
dtype: object

## Load