### Aquiring Data

In [30]:
# Required modules to run script 
import pandas as pd
import numpy as np
import datetime as dt
from googleapiclient import discovery
from embetter.text import SentenceEncoder
import umap.umap_ as umap
import matplotlib.pyplot as plt
from bulk import BaseTextExplorer 

In [3]:
# Connecting to YouTube API 
api_key = 'AIzaSyAWAr08qr57GHZLlXIkEr0KGJ3VbpA4tBE' #linked to Dylan's google account
youtube = discovery.build('youtube', 'v3', developerKey = api_key)

In [4]:
# Creating function to get video specific metadata
def get_video_details(videoID):
    # Accesing video meta data
    request = youtube.videos().list(part = 'snippet, contentDetails, statistics',
                                     id = videoID)
    response = request.execute()

    # Creating list to collect instances
    data = {'title' : response['items'][0]['snippet']['title'],
            'description' : response['items'][0]['snippet']['description'],
            'times_viewed' : response['items'][0]['statistics']['viewCount'],
            'times_liked' : response['items'][0]['statistics']['likeCount'],
            'times_faved' : response['items'][0]['statistics']['favoriteCount'],
            'duration' : response['items'][0]['contentDetails']['duration']
           }
    
    return data

In [5]:
# Function to get specific video data
def get_video_data(videoID):
    try:
        request_video = youtube.videos().list(part = 'snippet, contentDetails, statistics',
                                             id = videoID).execute()
            
        title = request_video['items'][0]['snippet']['title']
        description = request_video['items'][0]['snippet']['description']
        views = request_video['items'][0]['statistics']['viewCount']
        likes = request_video['items'][0]['statistics']['likeCount']
        favs = request_video['items'][0]['statistics']['favoriteCount']
        tags = request_video['items'][0]['snippet']['tags']
        comment_count = request_video['items'][0]['statistics']['commentCount']
        duration = request_video['items'][0]['contentDetails']['duration']
            
        return tags, comment_count, title, description, views, likes, favs, duration
        
    except (IndexError, KeyError) as e:
        return "Error:" + str(e)

In [6]:
# Getting Top level comments from each video
def retrieve_comments(videoID):
    request_comments = youtube.commentThreads().list(part = 'snippet',
                                     videoId = videoID).execute()
    comments_retrieved = []
    for element in request_comments['items']:
        comment = element['snippet']['topLevelComment']['snippet']['textDisplay'], element['snippet']['topLevelComment']['snippet']['likeCount']
        comments_retrieved.append(comment)
    return comments_retrieved

In [7]:
# Compiling data
def access_all_videos(channel_id):
    request_channel = youtube.channels().list(part = 'snippet, contentDetails, statistics', 
                                  id = channel_id).execute()
    data = dict(channel_name = request_channel['items'][0]['snippet']['title'], 
                subscribers = request_channel['items'][0]['statistics']['subscriberCount'],
                views =  request_channel['items'][0]['statistics']['viewCount'],
                upload_key = request_channel['items'][0]['contentDetails']['relatedPlaylists']['uploads'])
    
    # Collecting videoIDs
    all_videos = []
    pageToken = ""
    while True:
        videos = youtube.playlistItems().list(part = 'contentDetails', 
                                              maxResults = 50, 
                                              pageToken = pageToken if pageToken != None else None, 
                                              playlistId = data['upload_key']).execute()
        v = videos.get('items', [])
        if v:
            all_videos.extend(v)
        pageToken = videos.get('nextPageToken')
        if not pageToken:
            break

    # Building dataset
    yt_df = pd.DataFrame(columns =['videoId', 'date published', 'tags', 'comment count',
                                  'title', 'description', 'views', 'likes', 'favs', 'duration'])
    yt_df['videoId'] = [x['contentDetails']['videoId'] for x in all_videos]
    yt_df['date published'] = [x['contentDetails']['videoPublishedAt'] for x in all_videos]
    columns_getvideodata = ['tags', 'comment count', 'title', 'description', 'views', 'likes', 'favs', 'duration']
    yt_df[columns_getvideodata] = yt_df['videoId'].apply(lambda videoID: pd.Series(get_video_data(videoID)))
    yt_df['Top level Comments, comment likes'] = yt_df['videoId'].apply(lambda videoID: retrieve_comments(videoID))
    return data, yt_df

data, yt_df = access_all_videos('UChgazymXzLErffnVsfchhsg')

In [8]:
yt_df

Unnamed: 0,videoId,date published,tags,comment count,title,description,views,likes,favs,duration,"Top level Comments, comment likes"
0,7_gkpAHjMiA,2024-11-28T04:36:58Z,"[Caravans by the campfire, should I buy a van,...",8,I am the Dishwasher! Or maybe not!! #everythin...,Who needs a dishwasher in their caravan? Stran...,718,22,0,PT36S,[(I have a wife and when we go away in the car...
1,VbgemI5hVHA,2024-11-27T04:00:16Z,"[Caravans by the campfire, should I buy a van,...",50,Zone RV Summit - Does it tick all the boxes?,In this episode we celebrate our channel with ...,5587,308,0,PT22M6S,[(&quot;Includes Paid Promotion&quot;? Does th...
2,PPO_lOCVZEE,2024-11-08T02:19:12Z,"[Caravans by the campfire, should I buy a van,...",0,Do I really need a Caravan?,Check out this episode as review a real option...,1168,48,0,PT45S,[]
3,5n0vODp0EXg,2024-11-07T04:00:03Z,"[Caravans by the campfire, should I buy a van,...",26,Upgrade to a Campervan! Do I really need that ...,What if you don't need your caravan anymore? O...,5296,253,0,PT26M12S,"[(Nice review, we tried a motorhome and went b..."
4,98y-tXExoh8,2024-10-29T06:24:39Z,"[Caravans by the campfire, should I buy a van,...",5,I don't feel Australian without one #everythin...,Sometimes you just have to have something to f...,1038,26,0,PT30S,"[(Sandy taupe 👌, 1), (I&#39;ve had my 76 since..."
...,...,...,...,...,...,...,...,...,...,...,...
205,OA4hfU2ePuc,2022-11-09T10:06:34Z,"[#shorts, campfire tips, campfire start, start...",5,Campfire Tips #2 Start'er Up,Tip number 2 for all those out there strugglin...,769,26,0,PT1M10S,"[(Love your humour, great tips! Keep the tips ..."
206,LXfejdIf2Xw,2022-11-04T10:33:34Z,"[Zone RV Caravans, Zone Caravan Review, Carava...",59,Zone Caravans - Are they really that good?,We are on a journey to find a new caravan - fi...,52280,513,0,PT34M4S,"[(Hey guys, love love watching your journeys a..."
207,xV0DI5DEWCY,2022-10-24T07:22:54Z,Error:'tags',,,,,,,,"[(Great tip that I have taken on board, we use..."
208,YKYQYjufu8w,2022-10-15T09:12:03Z,Error:'tags',,,,,,,,[(Splitter is a great idea and being all plug ...


### Analysis

In [42]:
all_comments = []
for x in yt_df['Top level Comments, comment likes']:
    for y in x:
        all_comments.append(y[0])

encoder = SentenceEncoder()
X = encoder.fit_transform(all_comments)

# Apply UMAP
umap_model = umap.UMAP(n_neighbors=15, min_dist = 0.1, n_components=2)
umap_df = pd.DataFrame(umap_model.fit_transform(X), columns=['x', 'y'])

# Visualize the results with BaseTextExplorer
output = BaseTextExplorer(umap_df, X=X, encoder=encoder)

In [44]:
output

HBox(children=(VBox(children=(Text(value='g', description='String:', placeholder='Type something'), HBox(child…

<bulk.BaseTextExplorer at 0x28a2ac66200>