In [1]:
import json
from typing import Dict, List

# Load the category -> keywords mapping. The context manager closes the file
# handle deterministically instead of leaving it to garbage collection.
with open("generated_keywords.json", "r", encoding="utf-8") as keywords_file:
    saved_keywords: Dict[str, List[str]] = json.load(keywords_file)

# Category names are the top-level keys of the keywords file.
CATEGORIES = list(saved_keywords.keys())

In [2]:
# Display the category names (the top-level keys of the loaded keywords mapping).
CATEGORIES

['Food',
 'Travel',
 'Entertainment',
 'Gaming',
 'Lifestyle',
 'Education',
 'Sports',
 'Technology',
 'Pets & Animals',
 'Health & Wellness']

In [5]:
# a function to get the ids of the top YouTube search results (limit=10) for a given keyword;
# results are cached on disk as one JSON file per keyword
import os
from youtubesearchpython import VideosSearch
from typing import List


def get_youtube_search_results(
    keyword: str, cache_dir: str = "cache/youtube_search_results"
) -> List[str]:
    """Return the ids of the YouTube search results for ``keyword``.

    Results are cached as one JSON file per keyword inside ``cache_dir``. A
    keyword that already has a cache file is answered from disk without
    running a new search.

    Args:
        keyword: Search phrase to look up.
        cache_dir: Directory holding the per-keyword cache files. It is
            created on demand before the first write.

    Returns:
        The ``id`` field of every search result, in the order the search
        returned them. The search is requested with ``limit=10``,
        ``region="VN"`` and ``language="vi"``.
    """
    # Path separators inside a keyword would otherwise point into a
    # (usually missing) sub-directory, so escape them in the file name.
    # Keywords without separators keep their original cache file name.
    safe_name = keyword
    for separator in {"/", os.sep, os.altsep} - {None}:
        safe_name = safe_name.replace(separator, f"%{ord(separator):02X}")
    cache_path = os.path.join(cache_dir, f"{safe_name}.json")

    if os.path.exists(cache_path):
        with open(cache_path, "r", encoding="utf-8") as f:
            return json.load(f)

    videosSearch = VideosSearch(
        keyword,
        limit=10,
        region="VN",
        language="vi",
    )
    results = videosSearch.result()["result"]
    ids = [result["id"] for result in results]

    # A fresh checkout has no cache directory yet; create it before writing.
    os.makedirs(cache_dir, exist_ok=True)
    with open(cache_path, "w", encoding="utf-8") as f:
        json.dump(ids, f, indent=2, ensure_ascii=False)

    return ids


# a function to iterate through the keywords of every category and collect their YouTube search results;
# duplicate ids are removed within each category (not across categories)
from tqdm import tqdm
from typing import Dict, List


def get_all_youtube_search_results(
    keywords: Dict[str, List[str]]
) -> Dict[str, List[str]]:
    """Collect the de-duplicated search result ids for every category.

    Args:
        keywords: Mapping of category name to the list of search keywords
            belonging to that category.

    Returns:
        Mapping of category name to the ids returned by
        ``get_youtube_search_results`` for all of its keywords. Duplicates
        are removed within a category (not across categories) and the
        first-seen order is kept, so repeated runs produce identical output.
    """
    all_youtube_search_results = {}
    for category in tqdm(keywords):
        category_ids = []
        for keyword in tqdm(keywords[category]):
            category_ids += get_youtube_search_results(keyword)
        # dict.fromkeys drops duplicates while keeping insertion order;
        # list(set(...)) would yield an arbitrary order that can change
        # between interpreter runs.
        all_youtube_search_results[category] = list(dict.fromkeys(category_ids))

    return all_youtube_search_results


# Gather the search result ids for every saved keyword, grouped by category.
all_youtube_search_results = get_all_youtube_search_results(saved_keywords)

# Persist the mapping as pretty-printed UTF-8 JSON.
serialized_results = json.dumps(
    all_youtube_search_results, indent=2, ensure_ascii=False
)
with open("all_youtube_search_results.json", mode="w", encoding="utf-8") as results_file:
    results_file.write(serialized_results)

100%|██████████| 124/124 [01:09<00:00,  1.78it/s]
100%|██████████| 106/106 [01:09<00:00,  1.52it/s]
100%|██████████| 131/131 [01:29<00:00,  1.47it/s]
100%|██████████| 113/113 [01:19<00:00,  1.42it/s]
100%|██████████| 109/109 [01:03<00:00,  1.73it/s]
100%|██████████| 146/146 [01:37<00:00,  1.50it/s]
100%|██████████| 136/136 [01:20<00:00,  1.68it/s]
100%|██████████| 126/126 [01:25<00:00,  1.47it/s]
100%|██████████| 179/179 [01:49<00:00,  1.64it/s]
100%|██████████| 125/125 [01:13<00:00,  1.70it/s]
100%|██████████| 10/10 [13:38<00:00, 81.85s/it]
