In [1]:
import requests
import pandas as pd
import time
import polars as pl
from api import API_KEY

In [3]:
# API_KEY is already bound by `from api import API_KEY` in the imports cell, so
# re-assigning it to itself did nothing. Instead, fail fast here when the key is
# empty/None so the problem shows up before any HTTP request is attempted.
if not API_KEY:
    raise ValueError("API_KEY is empty; define it in api.py before running this notebook.")

In [None]:
# Collect YouTube search results for each keyword, enrich every hit with its
# statistics from the videos endpoint, and persist the combined table to CSV.
#
# Quota note: per the YouTube Data API quota documentation a search.list call
# costs 100 units, so 2000 results/keyword (40 pages of 50) across 5 keywords
# may exceed a default daily allocation. When that happens the API answers with
# an HTTP error, which is reported below instead of being silently ignored.
keywords = ["data science", "machine learning", "python tutorial", "artificial intelligence", "data analytics"]
max_results_per_keyword = 2000

search_url = "https://www.googleapis.com/youtube/v3/search"
stats_url = "https://www.googleapis.com/youtube/v3/videos"
request_timeout = 30  # seconds; without a timeout a stalled connection hangs the cell


def _fetch_json(url, params):
    """GET ``url`` with ``params`` and return the decoded JSON body.

    Returns None (after printing a short diagnostic) when the request cannot be
    completed, the server answers with a non-2xx status, or the body is not
    valid JSON, so the caller can stop paging and keep what was collected.
    """
    try:
        response = requests.get(url, params=params, timeout=request_timeout)
    except requests.RequestException as exc:
        # Print only the exception type: its message embeds the full request
        # URL, which includes the API key as a query parameter.
        print(f"  Request to {url} failed: {type(exc).__name__}")
        return None

    if not response.ok:
        print(f"  Request to {url} failed: HTTP {response.status_code} {response.reason}")
        return None

    try:
        return response.json()
    except ValueError:
        print(f"  Request to {url} returned a non-JSON body")
        return None


all_video_data = []

for keyword in keywords:
    print(f"Searching for: {keyword}")
    next_page_token = None
    total_collected = 0

    while total_collected < max_results_per_keyword:
        params = {
            "part": "snippet",
            "q": keyword,
            "type": "video",
            "maxResults": 50,
            "key": API_KEY}
        if next_page_token:
            params["pageToken"] = next_page_token

        data = _fetch_json(search_url, params)
        if data is None:
            # Error already reported; move on to the next keyword.
            break

        # Guard the lookup so an item without a videoId cannot raise KeyError.
        video_ids = [
            item["id"]["videoId"]
            for item in data.get("items", [])
            if "videoId" in item.get("id", {})
        ]
        if not video_ids:
            # Nothing to enrich: calling the videos endpoint with an empty id
            # list is pointless, and the collected count could never advance.
            break
        total_collected += len(video_ids)

        # Fetch stats for the whole page of ids in a single request.
        stats_params = {
            "part": "snippet,statistics",
            "id": ",".join(video_ids),
            "key": API_KEY
        }
        stats_data = _fetch_json(stats_url, stats_params)
        if stats_data is None:
            break

        for item in stats_data.get("items", []):
            snippet = item["snippet"]
            stats = item.get("statistics", {})

            # Individual counters can be absent from the response, so each one
            # falls back to 0 rather than raising.
            all_video_data.append({
                "keyword": keyword,
                "video_id": item["id"],
                "title": snippet["title"],
                "channel": snippet["channelTitle"],
                "published": snippet["publishedAt"],
                "views": int(stats.get("viewCount", 0)),
                "likes": int(stats.get("likeCount", 0)),
                "comments": int(stats.get("commentCount", 0))})

        next_page_token = data.get("nextPageToken")
        if not next_page_token:
            break

        # Brief pause between pages to avoid hammering the API. This throttles
        # the request rate only; it does not reduce the quota cost per call.
        time.sleep(1)

print(f"Total videos collected: {len(all_video_data)}")
df = pd.DataFrame(all_video_data)
df.to_csv("youtube_videos.csv", index=False)
df.head()

Searching for: data science
Searching for: machine learning
Searching for: python tutorial
Searching for: artificial intelligence
Searching for: data analytics
Searching for: ai
Searching for: ml
Total videos collected: 659


Unnamed: 0,keyword,video_id,title,channel,published,views,likes,comments
0,data science,RBSUwFGa6Fk,What is Data Science?,IBM Technology,2022-06-13T12:00:14Z,771704,16497,300
1,data science,ua-CiDNNj30,Learn Data Science Tutorial - Full Course for ...,freeCodeCamp.org,2019-05-30T12:48:19Z,4001271,83497,1292
2,data science,FsSrzmRawUg,Intro to Data Science: What is Data Science?,Steve Brunton,2019-06-06T05:19:06Z,121919,1744,42
3,data science,9R3X0JoCLyU,The Complete Data Science Roadmap,Programming with Mosh,2024-08-01T13:00:08Z,412551,13602,348
4,data science,X3paOmcrTjQ,Data Science In 5 Minutes | Data Science For B...,Simplilearn,2018-12-04T14:30:01Z,4413159,62024,1101


In [None]:
# Quick size check of the collected frame: displays (row count, column count).
df.shape

In [None]:
# Number of collected rows per search keyword, most frequent first.
df.loc[:, "keyword"].value_counts()