In [2]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import os
#import google_auth_oauthlib.flow
from googleapiclient.discovery import build
import googleapiclient.errors
from IPython.display import JSON

In [3]:
channel_id = 'UUwQnoax3HWID1WOzZ4mqLPQ'

In [4]:
import boto3
import requests

In [5]:
def get_parameter(param_name):
    ssm = boto3.client('ssm',region_name='us-east-2')
    response = ssm.get_parameter(Name=param_name, WithDecryption=True)
    return response['Parameter']['Value']

In [6]:
api_key = get_parameter('yt_api')

In [7]:
#Testing the YT API 
api_service_name = "youtube"
api_version = "v3"

# Get credentials and create an API client
youtube = build(
    api_service_name, api_version, developerKey=api_key)

request = youtube.channels().list(
    part="snippet,contentDetails,statistics",
    id =channel_id
)
response = request.execute()

JSON(response)

<IPython.core.display.JSON object>

In [8]:
def get_video_ids(youtube,channel_id):
    
    video_ids = []
    
    request = youtube.playlistItems().list(
        part="snippet,contentDetails",
        playlistId = channel_id,
        maxResults = 50
    )
    
    response = request.execute()
    
    for item in response['items']:
        video_ids.append(item['contentDetails']['videoId'])
        
    next_page_token = response.get('nextPageToken')
    while next_page_token is not None:
        request = youtube.playlistItems().list(
                    part = 'contentDetails',
                    playlistId = channel_id,
                    maxResults = 50,
                    pageToken = next_page_token)
        response = request.execute()
        
        for item in response['items']:
            video_ids.append(item['contentDetails']['videoId'])
            
        next_page_token = response.get('nextPageToken')
        
    return video_ids

In [9]:
video_ids = get_video_ids(youtube, channel_id)
len(video_ids)

497

In [10]:
#Extract Video information based on vid id
def get_video_details(youtube, video_ids):
    
    all_video_info = []

    for i in range(0, len(video_ids), 50):
        request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=','.join(video_ids[i:i+50])
        )

        response = request.execute()

        for video in response['items']:
            stats_to_keep = {'snippet': ['channelTitle','publishedAt','title','description','categoryId','tags'],
                             'statistics':['viewCount','likeCount','commentCount'],
                             'contentDetails':['duration','dimenstion','caption']}

            video_info = {}
            video_info['video_id'] = video['id']

            for k in stats_to_keep.keys():
                for v in stats_to_keep[k]:
                    try:
                        video_info[v] = video[k][v]
                    except:
                        video_info[v] = None

            all_video_info.append(video_info)

    return pd.DataFrame(all_video_info)

In [11]:
channel_df = get_video_details(youtube, video_ids)

In [12]:
channel_df.shape

(497, 13)

In [14]:
channel_df.head(15)

Unnamed: 0,video_id,channelTitle,publishedAt,title,description,categoryId,tags,viewCount,likeCount,commentCount,duration,dimenstion,caption
0,et4UBoECM8o,Cole Hastings,2024-11-24T17:00:37Z,The Population Collapse: Why No One Wants Kids...,Protect yourself from misinformation and impro...,22,"[cole hastings, population collapse, populatio...",90720,3137,2284,PT16M55S,,False
1,wuYXU7_6Eac,Cole Hastings,2024-11-17T17:00:56Z,Why Aren't Young Men Going To College?,"During this holiday season, get up to 50% off ...",22,"[cole hastings, gen z men, gen z men republica...",43818,1982,595,PT15M17S,,False
2,vMOnkfV_oJQ,Cole Hastings,2024-11-09T17:00:56Z,Following your dreams won't make you happy.,The first 500 people to use my link will get a...,22,"[cole hastings, follow your dreams, dream life...",20527,1372,145,PT18M16S,,False
3,TCyFXCi_klw,Cole Hastings,2024-10-06T16:00:34Z,Why Gen Z Hates Capitalism,Capitalism is often blamed for all modern issu...,22,"[cole hastings, capitalism, gen z capitalism, ...",60139,2735,1021,PT25M2S,,False
4,cbCcjT9mQcE,Cole Hastings,2024-09-28T16:00:37Z,Why Is Gen Z Obsessed With Nostalgia?,Thanks to Graza for sponsoring this video! Get...,22,"[cole hastings, nostalgia, why gen z loves nos...",138374,5723,1413,PT16M53S,,False
5,NBF0sHIkCtk,Cole Hastings,2024-09-22T16:00:21Z,5 Ways To Cure Brain Rot,Get $50 off the best video editing templates b...,22,"[cole hastings, brainrot, how to stop brainrot...",37651,2434,185,PT22M49S,,False
6,1LRbmrEOhfI,Cole Hastings,2024-09-15T16:00:01Z,The Trust Crisis: Why Everyone Feels Fake Now,Get a 30-day free Sunsama trial and you don’t ...,22,"[cole hastings, trust epidemic, trust crisis, ...",33248,1500,358,PT19M2S,,False
7,oK7LbxFqYiE,Cole Hastings,2024-09-07T16:00:47Z,Boomers were right. Life is simple.,Buy a Space One Pro and get its dedicated trav...,22,"[cole hastings, simple living, boomers, gen z,...",91650,4244,362,PT16M5S,,False
8,Se891RuhhL0,Cole Hastings,2024-09-05T16:00:23Z,Gen Z Is Dealing With The Meaning Crisis Throu...,Get 240+ app subscriptions and try setapp for ...,22,"[cole hastings, gen z, gen z memes, nihilism, ...",55381,1916,246,PT14M9S,,False
9,oK5BseekKLs,Cole Hastings,2024-08-25T16:00:48Z,The Death Of The Middle Class: Why Everyone Fe...,Visit my sponsor https://zocdoc.com/cole to ea...,22,"[cole hastings, why the middle class is dying,...",199713,7835,1959,PT14M56S,,False


### Topic Modelling 

In [45]:
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/annettetamakloe/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/annettetamakloe/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [49]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [50]:
def process_title(title):
    title = title.lower()
    title = re.sub(r'[^a-zA-Z0-9\s]', '', title)
    tokens = title.split()
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(tokens)

In [51]:
channel_df['preprocessed_titles'] = channel_df['title'].apply(process_title)

In [53]:
channel_df.head(15)

Unnamed: 0,video_id,channelTitle,publishedAt,title,description,categoryId,tags,viewCount,likeCount,commentCount,duration,dimenstion,caption,preprocessed_titles
0,et4UBoECM8o,Cole Hastings,2024-11-24T17:00:37Z,The Population Collapse: Why No One Wants Kids...,Protect yourself from misinformation and impro...,22,"[cole hastings, population collapse, populatio...",90720,3137,2284,PT16M55S,,False,population collapse one want kid anymore
1,wuYXU7_6Eac,Cole Hastings,2024-11-17T17:00:56Z,Why Aren't Young Men Going To College?,"During this holiday season, get up to 50% off ...",22,"[cole hastings, gen z men, gen z men republica...",43818,1982,595,PT15M17S,,False,arent young men going college
2,vMOnkfV_oJQ,Cole Hastings,2024-11-09T17:00:56Z,Following your dreams won't make you happy.,The first 500 people to use my link will get a...,22,"[cole hastings, follow your dreams, dream life...",20527,1372,145,PT18M16S,,False,following dream wont make happy
3,TCyFXCi_klw,Cole Hastings,2024-10-06T16:00:34Z,Why Gen Z Hates Capitalism,Capitalism is often blamed for all modern issu...,22,"[cole hastings, capitalism, gen z capitalism, ...",60139,2735,1021,PT25M2S,,False,gen z hate capitalism
4,cbCcjT9mQcE,Cole Hastings,2024-09-28T16:00:37Z,Why Is Gen Z Obsessed With Nostalgia?,Thanks to Graza for sponsoring this video! Get...,22,"[cole hastings, nostalgia, why gen z loves nos...",138374,5723,1413,PT16M53S,,False,gen z obsessed nostalgia
5,NBF0sHIkCtk,Cole Hastings,2024-09-22T16:00:21Z,5 Ways To Cure Brain Rot,Get $50 off the best video editing templates b...,22,"[cole hastings, brainrot, how to stop brainrot...",37651,2434,185,PT22M49S,,False,5 way cure brain rot
6,1LRbmrEOhfI,Cole Hastings,2024-09-15T16:00:01Z,The Trust Crisis: Why Everyone Feels Fake Now,Get a 30-day free Sunsama trial and you don’t ...,22,"[cole hastings, trust epidemic, trust crisis, ...",33248,1500,358,PT19M2S,,False,trust crisis everyone feel fake
7,oK7LbxFqYiE,Cole Hastings,2024-09-07T16:00:47Z,Boomers were right. Life is simple.,Buy a Space One Pro and get its dedicated trav...,22,"[cole hastings, simple living, boomers, gen z,...",91650,4244,362,PT16M5S,,False,boomer right life simple
8,Se891RuhhL0,Cole Hastings,2024-09-05T16:00:23Z,Gen Z Is Dealing With The Meaning Crisis Throu...,Get 240+ app subscriptions and try setapp for ...,22,"[cole hastings, gen z, gen z memes, nihilism, ...",55381,1916,246,PT14M9S,,False,gen z dealing meaning crisis meme
9,oK5BseekKLs,Cole Hastings,2024-08-25T16:00:48Z,The Death Of The Middle Class: Why Everyone Fe...,Visit my sponsor https://zocdoc.com/cole to ea...,22,"[cole hastings, why the middle class is dying,...",199713,7835,1959,PT14M56S,,False,death middle class everyone feel broke


### LDA

In [54]:
from gensim import corpora, models

In [55]:
channel_df.columns

Index(['video_id', 'channelTitle', 'publishedAt', 'title', 'description',
       'categoryId', 'tags', 'viewCount', 'likeCount', 'commentCount',
       'duration', 'dimenstion', 'caption', 'preprocessed_titles'],
      dtype='object')

In [57]:
dictionary = corpora.Dictionary(channel_df['preprocessed_titles'].str.split())
corpus = [dictionary.doc2bow(title.split()) for title in channel_df['preprocessed_titles']]

In [59]:
dictionary

<gensim.corpora.dictionary.Dictionary at 0x17ee4f590>

In [63]:
lda_model = models.LdaModel(corpus=corpus, id2word=dictionary,
                            passes=15, num_topics=10)

topics = lda_model.print_topics()

In [64]:
for topic in topics:
    print(topic)

(0, '0.023*"vegan" + 0.021*"weight" + 0.018*"body" + 0.018*"muscle" + 0.017*"day" + 0.017*"life" + 0.017*"workout" + 0.016*"lose" + 0.015*"eats" + 0.014*"build"')
(1, '0.049*"vegan" + 0.026*"muscle" + 0.021*"meal" + 0.020*"protein" + 0.019*"building" + 0.018*"day" + 0.016*"video" + 0.015*"high" + 0.013*"one" + 0.011*"every"')
(2, '0.045*"workout" + 0.020*"life" + 0.014*"full" + 0.013*"routine" + 0.013*"mass" + 0.011*"5" + 0.011*"year" + 0.010*"day" + 0.010*"body" + 0.009*"home"')
(3, '0.014*"day" + 0.014*"life" + 0.014*"episode" + 0.010*"routine" + 0.010*"suffering" + 0.010*"morning" + 0.010*"epidemic" + 0.010*"advice" + 0.010*"form" + 0.010*"carbs"')
(4, '0.090*"vegan" + 0.057*"day" + 0.036*"bodybuilder" + 0.029*"life" + 0.024*"full" + 0.023*"eating" + 0.019*"3" + 0.015*"eat" + 0.013*"calorie" + 0.013*"bodybuilding"')
(5, '0.020*"gen" + 0.020*"z" + 0.011*"muscle" + 0.010*"never" + 0.010*"first" + 0.010*"guide" + 0.010*"nothing" + 0.007*"making" + 0.007*"time" + 0.007*"like"')
(6, '0.0

In [65]:
def get_dominant_topic(doc):
    bow = dictionary.doc2bow(doc.split())
    topics = lda_model[bow]
    if topics:
        return topics[0][0]
    else:
        return None

In [66]:
channel_df['topic'] = channel_df['preprocessed_titles'].apply(get_dominant_topic)

In [67]:
channel_df

Unnamed: 0,video_id,channelTitle,publishedAt,title,description,categoryId,tags,viewCount,likeCount,commentCount,duration,dimenstion,caption,preprocessed_titles,topic
0,et4UBoECM8o,Cole Hastings,2024-11-24T17:00:37Z,The Population Collapse: Why No One Wants Kids...,Protect yourself from misinformation and impro...,22,"[cole hastings, population collapse, populatio...",90720,3137,2284,PT16M55S,,False,population collapse one want kid anymore,0
1,wuYXU7_6Eac,Cole Hastings,2024-11-17T17:00:56Z,Why Aren't Young Men Going To College?,"During this holiday season, get up to 50% off ...",22,"[cole hastings, gen z men, gen z men republica...",43818,1982,595,PT15M17S,,False,arent young men going college,0
2,vMOnkfV_oJQ,Cole Hastings,2024-11-09T17:00:56Z,Following your dreams won't make you happy.,The first 500 people to use my link will get a...,22,"[cole hastings, follow your dreams, dream life...",20527,1372,145,PT18M16S,,False,following dream wont make happy,0
3,TCyFXCi_klw,Cole Hastings,2024-10-06T16:00:34Z,Why Gen Z Hates Capitalism,Capitalism is often blamed for all modern issu...,22,"[cole hastings, capitalism, gen z capitalism, ...",60139,2735,1021,PT25M2S,,False,gen z hate capitalism,0
4,cbCcjT9mQcE,Cole Hastings,2024-09-28T16:00:37Z,Why Is Gen Z Obsessed With Nostalgia?,Thanks to Graza for sponsoring this video! Get...,22,"[cole hastings, nostalgia, why gen z loves nos...",138374,5723,1413,PT16M53S,,False,gen z obsessed nostalgia,0
5,NBF0sHIkCtk,Cole Hastings,2024-09-22T16:00:21Z,5 Ways To Cure Brain Rot,Get $50 off the best video editing templates b...,22,"[cole hastings, brainrot, how to stop brainrot...",37651,2434,185,PT22M49S,,False,5 way cure brain rot,0
6,1LRbmrEOhfI,Cole Hastings,2024-09-15T16:00:01Z,The Trust Crisis: Why Everyone Feels Fake Now,Get a 30-day free Sunsama trial and you don’t ...,22,"[cole hastings, trust epidemic, trust crisis, ...",33248,1500,358,PT19M2S,,False,trust crisis everyone feel fake,0
7,oK7LbxFqYiE,Cole Hastings,2024-09-07T16:00:47Z,Boomers were right. Life is simple.,Buy a Space One Pro and get its dedicated trav...,22,"[cole hastings, simple living, boomers, gen z,...",91650,4244,362,PT16M5S,,False,boomer right life simple,0
8,Se891RuhhL0,Cole Hastings,2024-09-05T16:00:23Z,Gen Z Is Dealing With The Meaning Crisis Throu...,Get 240+ app subscriptions and try setapp for ...,22,"[cole hastings, gen z, gen z memes, nihilism, ...",55381,1916,246,PT14M9S,,False,gen z dealing meaning crisis meme,0
9,oK5BseekKLs,Cole Hastings,2024-08-25T16:00:48Z,The Death Of The Middle Class: Why Everyone Fe...,Visit my sponsor https://zocdoc.com/cole to ea...,22,"[cole hastings, why the middle class is dying,...",199713,7835,1959,PT14M56S,,False,death middle class everyone feel broke,0


### BERT Topic

In [72]:
import sys
print(sys.path)

['/Users/annettetamakloe/Documents/Projects/YT_analysis/CH_Analysis', '/opt/anaconda3/envs/new_conda/lib/python312.zip', '/opt/anaconda3/envs/new_conda/lib/python3.12', '/opt/anaconda3/envs/new_conda/lib/python3.12/lib-dynload', '', '/opt/anaconda3/envs/new_conda/lib/python3.12/site-packages']


In [1]:
from bertopic import BERTopic
import boto3

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_parameter(param_name):
  ssm = boto3.client('ssm',region_name='us-east-2')
  response = ssm.get_parameter(Name=param_name, WithDecryption=True)
  return response['Parameter']['Value'] 

In [3]:
api_yt = get_parameter('yt_api')

SSLError: SSL validation failed for https://ssm.us-east-2.amazonaws.com/ hostname 'ssm.us-east-2.amazonaws.com' doesn't match either of 'sdptpsiproxyvip.charter.com', 'activate.spectrum.net', 'api.spectrum.net', 'apis.spectrum.net', 'apis.stage-spectrum.net', 'cdn.pi.spectrum.net', 'collector.pi.spectrum.net', 'id.spectrum.net', 'id.stage-spectrum.net', 'pi-lite.spectrum.net', 'sdpetpsi.charter.com', 'sdpetpsi-a.charter.com', 'sdpetpsi-b.charter.com', 'sdptpsi.charter.com', 'sdptpsi.g.charter.com', 'sdptpsi-east.charter.com', 'sdptpsi-west.charter.com', 'sdptpsi-west-a.charter.com', 'sdptpsi-west-b.charter.com', 'sdpwtpsi.charter.com', 'sdpwtpsi-a.charter.com', 'sdpwtpsi-b.charter.com', 'spectrum.net', 'tpsi.spectrum.net', 'tpsi-ort.spectrum.net', 'v-collector.dp.aws.charter.com', 'www.spectrum.com', 'www.spectrum.net', 'www.stage-spectrum.net', 'www.charter.com', 'si.spectrum.net', 'b.spectrum.net', 'h.spectrum.net', 'cs.spectrum.net', 'n.spectrum.net', 'ss.spectrum.net', 's.spectrum.net', 'st.spectrum.net', 'svc.spectrum.net', 'v.spectrum.net', 'pov.spectrum.net', 'stage-spectrum.net', 'pi-lite.shared.qa-spectrum.net', 'pi-lite.stage-spectrum.net', 'si.stage-spectrum.net', 'b.stage-spectrum.net', 'h.stage-spectrum.net', 'cs.stage-spectrum.net', 'n.stage-spectrum.net', 'ss.stage-spectrum.net', 's.stage-spectrum.net', 'st.stage-spectrum.net', 'svc.stage-spectrum.net', 'v.stage-spectrum.net'