# [YouTube Data API v3 reference](https://developers.google.com/youtube/v3/docs)
***
# Imports

In [37]:
%pip install psycopg2-binary


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [38]:
import os
from dotenv import load_dotenv
import requests
from datetime import datetime, timezone
import pandas as pd
import numpy as np
import math
import psycopg2

In [79]:
# assume API calls are made at the same exact time

script_timestamp = datetime.now(timezone.utc) # YT API uses UTC timezone
script_timestamp

datetime.datetime(2024, 4, 24, 19, 55, 36, 38846, tzinfo=datetime.timezone.utc)

***
# Get API key and DB password

In [39]:
# Load environment variables from .env file
load_dotenv("environment_variables.env")

True

In [40]:
api_key = os.getenv("API_KEY")
psql_pw = os.getenv("PSQL_PW")

***
# Database Connection

In [41]:
host = "youtubeviewprediction.cd0c8oow2pnr.us-east-1.rds.amazonaws.com"
port = 5432
database = "postgres"
user = "postgres"

In [42]:
try:
    # Connect to the PostgreSQL database
    connection = psycopg2.connect(
        host=host,
        port=port,
        database=database,
        user=user,
        password=psql_pw
    )

    # Create a cursor object using the cursor() method
    cursor = connection.cursor()

    # Execute a SQL query
    cursor.execute("SELECT version();")

    # Fetch result
    record = cursor.fetchone()
    print("You are connected to - ", record, "\n")

except (Exception, psycopg2.Error) as error:
    print("Error while connecting to PostgreSQL", error)

You are connected to -  ('PostgreSQL 16.1 on x86_64-pc-linux-gnu, compiled by gcc (GCC) 7.3.1 20180712 (Red Hat 7.3.1-12), 64-bit',) 



***
# Most popular videos
## Extract

API limits 200 videos per call across 4 pages.

In [43]:
videos_api_url = "https://www.googleapis.com/youtube/v3/videos"

In [44]:
def parse_video_json(video):

    video_id = video["id"]

    # snippet
    publish_datetime = video["snippet"]["publishedAt"]
    channel_id = video["snippet"]["channelId"]
    video_title = video["snippet"]["title"]
    video_description = video["snippet"]["description"]
    channel_title = video["snippet"]["channelTitle"]
    num_tags = len(video["snippet"].get("tags", [])) # can be null
    category_id = video["snippet"]["categoryId"]

    # contentDetails
    duration = video["contentDetails"]["duration"]
    licensed_content = video["contentDetails"]["licensedContent"]

    # status
    made_for_kids = video["status"]["madeForKids"]

    # statistics
    num_views = video["statistics"]["viewCount"]
    num_likes = video["statistics"].get("likeCount", None) # can be null
    num_comments = video["statistics"].get("commentCount", None) # can be null

    return [video_id, 
            publish_datetime, 
            channel_id, 
            video_title,
            video_description,
            channel_title,
            num_tags,
            category_id, 
            duration, 
            licensed_content, 
            made_for_kids, 
            num_views, 
            num_likes, 
            num_comments]    

In [45]:
video_df = pd.DataFrame(columns=["script_timestamp", 
                                 "video_id", 
                                 "publish_datetime", 
                                 "channel_id",
                                 "video_title",
                                 "video_description",
                                 "channel_title",
                                 "num_tags", 
                                 "category_id",  
                                 "duration", 
                                 "licensed_content", 
                                 "made_for_kids", 
                                 "num_views", 
                                 "num_likes", 
                                 "num_comments"])
video_df

Unnamed: 0,request_datetime,video_id,publish_datetime,channel_id,video_title,video_description,channel_title,num_tags,category_id,duration,licensed_content,made_for_kids,num_views,num_likes,num_comments


In [46]:
pageToken = ""

while pageToken is not None:
    params = {
        "key": api_key,
        "part": "id, snippet, contentDetails, status, statistics",
        "chart": "mostPopular",
        "hl": "en",
        "regionCode": "US",
        "maxResults": 50,
        "pageToken": pageToken
    }
    
    response = requests.get(videos_api_url, params=params)

    if response.status_code == 200:
        response_json = response.json()

        for video in response_json["items"]:
            # add video details and datetime of request to end of video dataframe
            video_df.loc[len(video_df)] = [script_timestamp] + parse_video_json(video)

        # get nextPageToken with null safety
        pageToken = response_json.get("nextPageToken", None)

    else:
        print(f"response.status_code = {response.status_code}")
        pageToken = None
    

In [47]:
video_df

Unnamed: 0,request_datetime,video_id,publish_datetime,channel_id,video_title,video_description,channel_title,num_tags,category_id,duration,licensed_content,made_for_kids,num_views,num_likes,num_comments
0,2024-04-24 19:09:47.072439+00:00,cen0rBKLuYE,2024-04-22T12:55:17Z,UCA3-nIYWu4PTWkb6NwhEpzg,Deadpool & Wolverine | Trailer,Found the guy who killed Bambi’s mom. #Deadpoo...,Ryan Reynolds,11,24,PT2M39S,True,False,26234383,1317506,55009
1,2024-04-24 19:09:47.072439+00:00,u5CSHhLkTdQ,2024-04-24T00:28:28Z,UC7yRILFFJ2QZCykymr8LPwA,DEADPOOL & WOLVERINE TRAILER BREAKDOWN! Easter...,Deadpool & Wolverine Trailer frame-by-frame br...,New Rockstars,3,24,PT33M37S,True,False,664298,34175,1583
2,2024-04-24 19:09:47.072439+00:00,UkCIzRKKEao,2024-04-23T18:01:53Z,UC7zsxKqd5MicTf4VhS9Y74g,Fitness Influencers Pretending to be Monkeys,💻 Thanks to Opera for sponsoring this video! G...,Kurtis Conner,2,23,PT21M23S,True,False,922724,81330,5152
3,2024-04-24 19:09:47.072439+00:00,4fn7c1s7J5s,2024-04-23T19:00:02Z,UCdLf5_x-4eic2Hw-uovjdKA,Rick Ross - Champagne Moments (Official Music ...,Official music video for Champagne Moments\nAv...,RickRossVEVO,14,10,PT3M34S,True,False,3106658,40453,5423
4,2024-04-24 19:09:47.072439+00:00,U6UpFiaD7Po,2024-04-23T04:46:26Z,UCWJ2lWNubArHWmf3FIHbfcQ,#7 LAKERS at #2 NUGGETS | FULL GAME 2 HIGHLIGH...,"Never miss a moment with the latest news, tren...",NBA,4,17,PT9M59S,True,False,2140464,24189,5888
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,2024-04-24 19:09:48.784389+00:00,tFPcx4X9-e8,2024-04-19T13:00:50Z,UC0AE_22J0kAo30yjasaeFqw,VILLAGER NEWS: BREAKING NEWS! 🎵🎵🎵Minecraft Ani...,Shockbyte - https://www.shockbyte.com/panel\nT...,Element Animation,0,20,PT7M17S,True,False,894091,101246,5408
196,2024-04-24 19:09:48.784389+00:00,xZP-qqwsGns,2024-04-19T04:00:54Z,UCiIETTq2It3OzW8_WYRPTyQ,Russ - That's My Girl (Official Audio),"Russ - ""That's My Girl” (Official Audio)\n\n\n...",Russ,11,10,PT2M15S,True,False,359769,18133,580
197,2024-04-24 19:09:48.784389+00:00,erLbbextvlY,2024-03-30T16:00:01Z,UCX6OQ3DkcsbYNE6H8uQQuVA,7 Days Stranded On An Island,I can’t believe we actually did this\nSend mon...,MrBeast,0,24,PT22M26S,True,False,129451892,4993538,139113
198,2024-04-24 19:09:48.784389+00:00,Ih3RO7G-Rkw,2024-04-20T18:34:44Z,UCChcWqwYXCEs657MQ00qVWA,Manchester City v Chelsea | Key Moments | Semi...,Watch the key moments between Manchester City ...,The Emirates FA Cup,3,17,PT3M18S,True,False,3534484,35155,2134


In [48]:
video_df.isna().sum()

request_datetime     0
video_id             0
publish_datetime     0
channel_id           0
video_title          0
video_description    0
channel_title        0
num_tags             0
category_id          0
duration             0
licensed_content     0
made_for_kids        0
num_views            0
num_likes            0
num_comments         0
dtype: int64

## Transform

In [49]:
# fix datatypes
video_df.dtypes

request_datetime     datetime64[ns, UTC]
video_id                          object
publish_datetime                  object
channel_id                        object
video_title                       object
video_description                 object
channel_title                     object
num_tags                           int64
category_id                       object
duration                          object
licensed_content                    bool
made_for_kids                       bool
num_views                         object
num_likes                         object
num_comments                      object
dtype: object

In [50]:
video_df.sample(1)

Unnamed: 0,request_datetime,video_id,publish_datetime,channel_id,video_title,video_description,channel_title,num_tags,category_id,duration,licensed_content,made_for_kids,num_views,num_likes,num_comments
71,2024-04-24 19:09:47.931428+00:00,9rxIfPvcbmI,2024-04-22T01:40:12Z,UC3cpN6gcJQqcCM6mxRUo_dA,The Weirdest & Coolest Cryptids So Far,Use code WENDIGOON50 to get 50% OFF your first...,Wendigoon,55,24,PT44M51S,True,False,593660,35601,2757


### Turn publish_datetime into datetime

In [51]:
# turn publish_datetime into datetime
video_df["publish_datetime"] = pd.to_datetime(video_df["publish_datetime"], format="ISO8601")

### Parse duration in minutes from duration

In [52]:
video_df["duration"]

0       PT2M39S
1      PT33M37S
2      PT21M23S
3       PT3M34S
4       PT9M59S
         ...   
195     PT7M17S
196     PT2M15S
197    PT22M26S
198     PT3M18S
199    PT16M53S
Name: duration, Length: 200, dtype: object

In [53]:
video_df["duration"] = pd.to_timedelta(video_df["duration"]).apply(lambda x: x.seconds)
video_df.rename(columns={"duration": "duration_seconds"}, inplace=True)

### Turn num_view, num_likes, and num_comments into ints

In [54]:
video_df[["num_views", "num_likes", "num_comments"]].isna().sum()

num_views       0
num_likes       0
num_comments    0
dtype: int64

In [55]:
video_df[["num_views", "num_likes", "num_comments"]] = video_df[["num_views", "num_likes", "num_comments"]].fillna(0)
video_df[["num_views", "num_likes", "num_comments"]].isna().sum()

num_views       0
num_likes       0
num_comments    0
dtype: int64

In [56]:
video_df[["num_views", "num_likes", "num_comments"]] = video_df[["num_views", "num_likes", "num_comments"]].astype(dtype="int")

In [57]:
video_df.dtypes

request_datetime     datetime64[ns, UTC]
video_id                          object
publish_datetime     datetime64[ns, UTC]
channel_id                        object
video_title                       object
video_description                 object
channel_title                     object
num_tags                           int64
category_id                       object
duration_seconds                   int64
licensed_content                    bool
made_for_kids                       bool
num_views                          int64
num_likes                          int64
num_comments                       int64
dtype: object

In [77]:
video_df["cate"].apply(lambda x: len(x)).value_counts()

video_id
11    200
Name: count, dtype: int64

## Load

In [None]:
"""CREATE TABLE IF NOT EXISTS video_fact
(
 datetime     timestamp NOT NULL,
 video_id     varchar(11) NOT NULL,
 num_views    int NOT NULL,
 num_likes    int NOT NULL,
 num_comments int NOT NULL,
 CONSTRAINT PK_1 PRIMARY KEY ( datetime, video_id )
);"""

In [None]:
"""CREATE TABLE IF NOT EXISTS video_dim
(
 video_id      varchar(11) NOT NULL,
 channel_id    varchar(24) NOT NULL,
 title         varchar(100) NOT NULL,
 description   varchar(5000) NOT NULL,
 num_tags      int NOT NULL,
 duration_sec  int NOT NULL,
 licensed      boolean NOT NULL,
 made_for_kids boolean NOT NULL,
 published_at  timestamp NOT NULL,
 category_id   int NOT NULL,
 CONSTRAINT PK_1 PRIMARY KEY ( video_id )
);"""

***
# Channel metadata from top 200 videos
## Extract
API limits 50 channels per call with no pages.

In [63]:
unique_channel_ids = video_df["channel_id"].unique()

In [59]:
channels_api_url = "https://www.googleapis.com/youtube/v3/channels"

In [60]:
def parse_channel_json(channel):
    channel_id = channel["id"]

    # snippet
    channel_created_datetime = channel["snippet"]["publishedAt"]
    channel_name = channel["snippet"]["title"]

    # statistics
    channel_total_views = channel["statistics"].get("viewCount", None)
    channel_num_subscribers = channel["statistics"].get("subscriberCount", None)
    channel_num_videos = channel["statistics"].get("videoCount", None)

    return [channel_id, 
            channel_created_datetime,
            channel_name, 
            channel_total_views, 
            channel_num_subscribers,
            channel_num_videos]    

In [64]:
len(unique_channel_ids) % 50

33

In [65]:
# https://stackoverflow.com/questions/312443/how-do-i-split-a-list-into-equally-sized-chunks
def chunks(lst, n):
    return [lst[i:i + n] for i in range(0, len(lst), n)]

channels_chunked = chunks(unique_channel_ids, 50)
[len(chunk) for chunk in channels_chunked] # chunks = # API calls

[50, 50, 50, 33]

In [66]:
channel_df = pd.DataFrame(columns=["request_datetime", 
                                "id", 
                                "created_datetime", 
                                "name"
                                "total_views", 
                                "num_subscribers",
                                "num_videos"])
channel_df

Unnamed: 0,request_datetime,id,created_datetime,total_views,num_subscribers,num_videos


In [67]:
for channels in channels_chunked:
    params = {
        "key": api_key,
        "part": "id, snippet, statistics",
        "id": ", ".join(channels),
        "maxResults": 50,
    }

    response = requests.get(channels_api_url, params=params)

    if response.status_code == 200:
        response_json = response.json()

        for channel in response_json["items"]:
            # add channel details and datetime of request to end of video dataframe
            channel_df.loc[len(channel_df)] = [script_timestamp] + parse_channel_json(channel)

    else:
        print(f"response.status_code = {response.status_code}")

In [68]:
channel_df.shape

(183, 6)

## Transform

In [69]:
# fix datatypes
channel_df.dtypes

request_datetime    datetime64[ns, UTC]
id                               object
created_datetime                 object
total_views                      object
num_subscribers                  object
num_videos                       object
dtype: object

### Turn created_datetime into datetime

In [70]:
channel_df["created_datetime"] = pd.to_datetime(channel_df["created_datetime"], format="ISO8601")

### Turn total_views, num_subscribers, num_videos into ints

In [71]:
channel_df[["total_views", "num_subscribers", "num_videos"]].isna().sum()

total_views        0
num_subscribers    0
num_videos         0
dtype: int64

In [72]:
channel_df[["total_views", "num_subscribers", "num_videos"]] = channel_df[["total_views", "num_subscribers", "num_videos"]].fillna(0)
channel_df[["total_views", "num_subscribers", "num_videos"]].isna().sum()

total_views        0
num_subscribers    0
num_videos         0
dtype: int64

In [73]:
channel_df[["total_views", "num_subscribers", "num_videos"]] = channel_df[["total_views", "num_subscribers", "num_videos"]].astype(dtype="int")

In [74]:
channel_df.dtypes

request_datetime    datetime64[ns, UTC]
id                               object
created_datetime    datetime64[ns, UTC]
total_views                       int64
num_subscribers                   int64
num_videos                        int64
dtype: object

## Load

In [None]:
"""CREATE TABLE IF NOT EXISTS channel_dim
(
 channel_id varchar(24) NOT NULL,
 name       varchar(36) NOT NULL,
 created_at timestamp NOT NULL,
 CONSTRAINT PK_1 PRIMARY KEY ( channel_id )
);"""

In [None]:

"""CREATE TABLE IF NOT EXISTS channel_fact
(
 datetime        timestamp NOT NULL,
 channel_id      varchar(24) NOT NULL,
 total_views     int NOT NULL,
 num_subscribers int NOT NULL,
 num_videos      int NOT NULL,
 CONSTRAINT PK_1 PRIMARY KEY ( datetime, channel_id )
);"""

# Video Categories

In [83]:
videoCategories_api_url = "https://www.googleapis.com/youtube/v3/videoCategories"

## Extract

In [80]:
def parse_videoCategories_json(category):
    id = category["id"]
    name = category["snippet"]["title"]

    return [id, name]

In [82]:
videoCategories_df = pd.DataFrame(columns=["id","name"])
videoCategories_df

Unnamed: 0,id,name


In [None]:
params = {
    "key": api_key,
    "part": "snippet",
    "regionCode": "US",
}

response = requests.get(videoCategories_api_url, params=params)

if response.status_code == 200:
    response_json = response.json()

    for category in response_json["items"]:
        # add channel details and datetime of request to end of video dataframe
        videoCategories_df.loc[len(videoCategories_df)] = parse_videoCategories_json(category)

else:
    print(f"response.status_code = {response.status_code}")

In [84]:
videoCategories_df

Unnamed: 0,id,name
0,1,Film & Animation
1,2,Autos & Vehicles
2,10,Music
3,15,Pets & Animals
4,17,Sports
5,18,Short Movies
6,19,Travel & Events
7,20,Gaming
8,21,Videoblogging
9,22,People & Blogs


In [87]:
# all categories accounted for
set(videoCategories_df["id"]).intersection(set(video_df["category_id"])) == set(video_df["category_id"])

True

In [89]:
videoCategories_df["name"].apply(lambda x: len(x)).max()

21

## Load

In [None]:
"""CREATE TABLE IF NOT EXISTS categories_dim
(
 category_id int NOT NULL,
 name        varchar(21) NOT NULL,
 CONSTRAINT PK_1 PRIMARY KEY ( category_id )
);"""