In [2]:
import json
from typing import Dict, List

# Load the previously generated keywords; the top-level keys of the JSON
# object are the category names.
# A context manager is used so the file handle is closed as soon as the
# JSON has been parsed instead of being left open until garbage collection.
with open("data/generated_keywords.json", "r", encoding="utf-8") as keywords_file:
    saved_keywords: Dict[str, List[str]] = json.load(keywords_file)

# Category names, in the order they appear in the keywords file.
CATEGORIES = list(saved_keywords.keys())

In [3]:
# Display the category names read from the keywords file.
CATEGORIES

['Food',
 'Travel',
 'Entertainment',
 'Gaming',
 'Lifestyle',
 'Education',
 'Sports',
 'Technology',
 'Pets & Animals',
 'Health & Wellness']

In [4]:
# a function to get all youtube search results for a given keyword
import os
from youtubesearchpython import VideosSearch
from typing import List


def get_youtube_search_results(keyword: str) -> List[str]:
    """Return the YouTube video ids found for ``keyword``, using an on-disk cache.

    If ``cache/youtube_search_results/<keyword>.json`` exists, its parsed
    content is returned and no search is performed. Otherwise a search is run
    through ``VideosSearch`` (limit 10, region "VN", language "vi"), the ``id``
    of each result is collected, the list is written to the cache file and
    then returned.

    Args:
        keyword: Search query; it is also used verbatim as the cache file name.

    Returns:
        The list of video ids for the keyword.
    """
    cache_path = f"cache/youtube_search_results/{keyword}.json"

    # Cache hit: reuse the stored ids instead of searching again.
    if os.path.exists(cache_path):
        with open(cache_path, "r", encoding="utf-8") as f:
            return json.load(f)

    videosSearch = VideosSearch(
        keyword,
        limit=10,
        region="VN",
        language="vi",
    )
    results = videosSearch.result()["result"]
    ids = [result["id"] for result in results]

    # Make sure the cache directory exists; without this the first uncached
    # keyword fails with FileNotFoundError on a fresh checkout.
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    # Explicit UTF-8 so the cache does not depend on the platform's default
    # encoding (ensure_ascii=False may emit non-ASCII characters).
    with open(cache_path, "w", encoding="utf-8") as f:
        json.dump(ids, f, indent=2, ensure_ascii=False)

    return ids


# a function to iterate through all keywords and get all youtube search results
# filter out duplicate results
from tqdm import tqdm
from typing import Dict, List


def get_all_youtube_search_results(
    keywords: Dict[str, List[str]]
) -> Dict[str, List[str]]:
    """Collect the de-duplicated search results for every keyword of every category.

    Args:
        keywords: Mapping of category name to the list of keywords to search.

    Returns:
        Mapping of category name to the video ids returned by
        ``get_youtube_search_results`` for that category's keywords. Within a
        category each id appears once, in first-seen order. Duplicates across
        different categories are not removed here.
    """
    all_youtube_search_results = {}
    for category in tqdm(keywords):
        category_ids: List[str] = []
        for keyword in tqdm(keywords[category]):
            category_ids.extend(get_youtube_search_results(keyword))
        # dict.fromkeys removes duplicates while keeping first-seen order.
        # list(set(...)) would reorder the ids differently on every run
        # (string hashing is randomized), making the output non-reproducible.
        all_youtube_search_results[category] = list(dict.fromkeys(category_ids))

    return all_youtube_search_results


# Run the search for every saved keyword; maps each category to its video ids.
all_youtube_search_results = get_all_youtube_search_results(saved_keywords)

100%|██████████| 124/124 [00:00<00:00, 55619.05it/s]
100%|██████████| 106/106 [00:00<00:00, 52953.34it/s]
100%|██████████| 131/131 [00:00<00:00, 52483.89it/s]
100%|██████████| 113/113 [00:00<00:00, 62527.22it/s]
100%|██████████| 109/109 [00:00<00:00, 54995.69it/s]
100%|██████████| 146/146 [00:00<00:00, 54437.58it/s]
100%|██████████| 136/136 [00:00<00:00, 57363.77it/s]
100%|██████████| 126/126 [00:00<00:00, 40781.10it/s]
100%|██████████| 179/179 [00:00<00:00, 40436.28it/s]
100%|██████████| 125/125 [00:00<00:00, 43792.85it/s]
100%|██████████| 10/10 [00:00<00:00, 211.37it/s]


In [5]:
# Deduplicate video ids across all categories.
# If a video appears in several categories, it is kept only in the first
# category (in iteration order) that contains it.

unique_search_results = {}
seen_video_ids = set()  # every id already assigned to some category
for category, video_ids in all_youtube_search_results.items():
    unique_search_results[category] = []
    for video_id in video_ids:
        # One set lookup replaces scanning every category list built so far.
        if video_id in seen_video_ids:
            continue
        seen_video_ids.add(video_id)
        unique_search_results[category].append(video_id)

In [9]:
# Total number of video ids kept across all categories.
sum(len(video_ids) for video_ids in unique_search_results.values())

7943

In [10]:
# Persist the de-duplicated search results, grouped by category, as UTF-8 JSON.
with open("data/youtube_search_results.json", "w", encoding="utf-8") as results_file:
    json.dump(
        unique_search_results,
        results_file,
        indent=2,
        ensure_ascii=False,
    )