In [144]:
# Set up credentials and APIs
#
# Secrets are read from a local `secrets.cfg` (INI format) so that no key is
# hardcoded in the notebook. Options read below:
#   [google]  api_key
#   [algolia] app_id, api_key, index_name

from configparser import ConfigParser

from algoliasearch.search_client import SearchClient
# `build` is called below to create the YouTube client; without this import the
# cell only works when a previous kernel session happened to define it.
from googleapiclient.discovery import build

parser = ConfigParser()

# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed, so fail here with a clear message instead of letting the
# first parser.get() raise a NoSectionError.
loaded_files = parser.read('secrets.cfg')
if not loaded_files:
    raise FileNotFoundError(
        "secrets.cfg was not found or could not be read from the working directory"
    )

# Google API
GOOGLE_API_KEY = parser.get('google', 'api_key')

# Algolia API
ALGOLIA_APP_ID = parser.get('algolia', 'app_id')
ALGOLIA_API_KEY = parser.get('algolia', 'api_key')
ALGOLIA_INDEX_NAME = parser.get('algolia', 'index_name')

# YouTube Data API v3 client shared by the cells that follow
service = build('youtube', 'v3', developerKey=GOOGLE_API_KEY)

In [138]:
# Get Channels
#
# Each entry in `channels` is looked up via the `forUsername` filter and the
# first item of the response is kept as that channel's resource.

channels = ['friendlyjordies']

channel_results = []
for channel in channels:
    channel_request = service.channels().list(
        forUsername=channel,
        part='snippet,statistics,contentDetails',
    )
    channel_response = channel_request.execute()
    channel_results.append(channel_response['items'][0])

In [119]:
# Get Video IDs
#
# For every channel in `channel_results`, find its most recent video and start
# a record for it. Fields set to None are placeholders that the caption/metadata
# cell fills in later.

search_results = []

for channel in channel_results:
    # type='video' restricts the search to video resources; without it the
    # newest result may be a playlist or channel, whose `id` has no 'videoId'.
    latest_items = service.search().list(
        part='snippet',
        channelId=channel['id'],
        maxResults=1,
        order='date',
        type='video',
    ).execute().get('items', [])

    # A channel with no videos yields no items; skip it rather than crash
    # and lose the remaining channels.
    if not latest_items:
        print('No videos found for channel ' + str(channel['id']) + '; skipping')
        continue

    video_id = latest_items[0]['id']['videoId']

    search_results.append({
        'videoID': video_id,
        'part': None,
        'publishedAt': None,
        'channelId': channel['id'],
        'channelThumbnail': channel['snippet']['thumbnails']['default']['url'],
        'channelTitle': channel['snippet']['title'],
        'title': None,
        'description': None,
        'thumbnail': None,
        'tags': None,
        'captions': None,
        'category': None
    })

In [120]:
# Fetch VideoCategory List
#
# Category resources for region 'au'; the per-video cell uses them to map a
# video's categoryId to the category title.

category_request = service.videoCategories().list(part='snippet', regionCode='au')
categories = category_request.execute()['items']

In [142]:
# For each video, fetch captions, metadata and thumbnail

from pytube import YouTube
import srt
import datetime

# Accumulates the records built by the per-video loop below
videos = []


def chunks(lst, n):
    """Yield consecutive slices of ``lst``, each holding at most ``n`` items.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    yield from (lst[offset:offset + n] for offset in range(0, len(lst), n))

# Build the records to index: one per chunk of up to 50 captions for every
# video in `search_results`. A video whose caption file is empty produces no
# records at all.
for video in search_results:
    # Fetch Video Resource
    videoResource = service.videos().list(
        part='id,snippet,statistics,topicDetails',
        id=video['videoID']
    ).execute()

    # Resolve the resource once instead of re-indexing the response on each use
    resource = videoResource['items'][0]
    resource_id = resource['id']
    details = resource['snippet']

    # Fetch Caption Tracks
    yt = YouTube('https://www.youtube.com/watch?v=' + resource_id)
    caption_tracks = yt.caption_tracks

    # Download caption tracks to local srt files (named after the video id)
    for track in caption_tracks:
        track.download(resource_id)

    # Create caption objects from the English track only.
    # The file is opened in a `with` block so the handle is always closed, and
    # decoded explicitly as UTF-8 instead of the platform default encoding.
    # NOTE(review): assumes pytube writes the .srt as UTF-8 and names it
    # "<id> (en).srt" — confirm against the installed pytube version.
    srt_path = resource_id + " (en).srt"
    try:
        with open(srt_path, "r", encoding="utf-8") as srt_file:
            raw_captions = list(srt.parse(srt_file.read()))
    except FileNotFoundError:
        # No English captions were downloaded for this video; skip it so the
        # remaining videos are still processed.
        print('No English captions for video ' + str(resource_id) + '; skipping')
        continue

    captions = [
        {
            'content': caption.content,
            'start': srt.timedelta_to_srt_timestamp(caption.start),
        }
        for caption in raw_captions
    ]

    # Category title for this video, falling back to the raw id when the
    # category list has no match. Computed once: it is the same for every chunk.
    category = next(
        (item['snippet']['title'] for item in categories if item['id'] == details['categoryId']),
        details['categoryId'],
    )

    # Create objs
    for part, chunk in enumerate(chunks(captions, 50)):
        obj = video.copy()

        obj['objectID'] = str(obj['channelId']) + '--' + str(obj['videoID']) + '--' + str(part)
        obj['part'] = part
        obj['publishedAt'] = details['publishedAt']
        obj['title'] = details['title']
        obj['description'] = details['description']
        obj['thumbnail'] = details['thumbnails']['high']['url']
        # NOTE(review): the snippet appears to omit 'tags' for untagged videos;
        # default to an empty list rather than raising KeyError.
        obj['tags'] = details.get('tags', [])
        obj['captions'] = chunk
        obj['category'] = category

        videos.append(obj)

In [None]:
# Close the Google API
#
# Closes the `service` client created by build(...) in the setup cell. Run this
# only after every cell that calls `service` has finished.
# NOTE(review): presumably the client cannot be reused after close(); re-run
# the setup cell to obtain a fresh one — confirm.

service.close()

In [149]:
# Upload to Algolia
#
# Every record in `videos` already carries its own 'objectID', so Algolia is
# told not to generate one.

save_options = {'autoGenerateObjectIDIfNotExist': False}

client = SearchClient.create(ALGOLIA_APP_ID, ALGOLIA_API_KEY)
index = client.init_index(ALGOLIA_INDEX_NAME)
res = index.save_objects(videos, save_options)

AlgoliaUnreachableHostException: Unreachable hosts