In [1]:
import os
import time
import json
import itertools
import pandas as pd
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from datetime import datetime, timezone

In [2]:
def load_api_keys(env_var="YOUTUBE_API_KEYS"):
    """Read YouTube Data API keys from an environment variable.

    The variable holds one or more keys separated by commas, for example
    ``YOUTUBE_API_KEYS="key-for-project-1,key-for-project-2"``. Keeping the
    keys out of the notebook means they are not saved or shared with it.

    Args:
        env_var: Name of the environment variable to read.

    Returns:
        list[str]: Keys in the order given, with surrounding whitespace,
        empty entries and repeated keys removed. Empty when the variable is
        unset or blank.
    """
    raw_value = os.environ.get(env_var, "")
    stripped = (part.strip() for part in raw_value.split(","))
    # dict.fromkeys() drops duplicates while keeping first-seen order.
    return list(dict.fromkeys(key for key in stripped if key))


API_KEYS = load_api_keys()
if not API_KEYS:
    print("⚠️ No API keys found. Set YOUTUBE_API_KEYS (comma-separated) and re-run this cell.")

# --- Output location and run limits ---
OUTPUT_FILE = "youtube_influencers_finally.csv"  # CSV that collected rows are written to
MAX_INFLUENCERS = 5_000  # collection stops once this many channels are stored
SAVE_EVERY = 100  # save progress every N channels
MAX_RESULTS_PER_QUERY = 50  # passed as maxResults to each channel search request

# --- Search terms ---
# Queries run in the order listed; the term that surfaced a channel is stored
# in that row's "query" field.
SEARCH_QUERIES = [
    "tech influencer",
    "fashion influencer",
    "fitness influencer",
    "travel influencer",
    "education influencer",
    "gaming influencer",
    "music creator",
    "beauty vlogger",
    "food blogger",
    "pet influencer",
    "finance creator",
    "Sustainability Influencers",
    "lifestyle influencer",
    "parenting influencer",
    "Motivational Influencers",
    "motivational speaker",
    "DIY influencer",
    "photography vlogger",
]


In [3]:
# Fail early with a clear message: next() on itertools.cycle() of an empty
# list raises a bare StopIteration, which hides the real problem.
if not API_KEYS:
    raise RuntimeError("API_KEYS is empty; configure at least one YouTube Data API key.")

# Endless round-robin iterator over the configured keys.
api_cycle = itertools.cycle(API_KEYS)
current_key = next(api_cycle)  # key currently in use by the client


def create_youtube_client(api_key):
    """Build a YouTube Data API v3 client that uses ``api_key`` as its developer key.

    Discovery-document caching is disabled via ``cache_discovery=False``.
    """
    client_options = {"developerKey": api_key, "cache_discovery": False}
    return build("youtube", "v3", **client_options)


youtube = create_youtube_client(current_key)
# Print only the last four characters: cell output is saved with the notebook,
# so printing the whole key would leak it to anyone the notebook is shared with.
print(f" Using API key: ****{current_key[-4:]}")


def switch_api_key():
    """Advance to the next key in ``api_cycle`` and rebuild the global client.

    Rebinds the module-level ``current_key`` and ``youtube``.

    Returns:
        bool: True when a different key is now active; False when the cycle
        handed back the key that was already in use (for example when only
        one key is configured), so a caller can tell the switch changed nothing.
    """
    global youtube, current_key
    previous_key = current_key
    current_key = next(api_cycle)
    # Only the tail of the key is printed so the secret never lands in saved output.
    print(f" Switching API key to: ****{current_key[-4:]}")
    youtube = create_youtube_client(current_key)
    return current_key != previous_key

 Using API key: AIzaSy*******************


In [4]:
9

9

In [None]:
MIN_SUBSCRIBERS = 5000     # channels below this subscriber count are skipped
RECENT_VIDEO_SAMPLE = 10   # number of latest videos averaged per channel
MAX_PAGE_RETRIES = 5       # consecutive non-quota failures tolerated per query


def is_quota_error(err):
    """Return True when the error text reports ``quotaExceeded``."""
    return "quotaExceeded" in str(err)


def account_age_in_days(published_at):
    """Return whole days elapsed since ``published_at``, or None when it is empty.

    Fractional seconds and the trailing ``Z`` are stripped before parsing and
    the timestamp is treated as UTC. Raises ValueError when the remaining text
    does not match ``%Y-%m-%dT%H:%M:%S``.
    """
    if not published_at:
        return None
    cleaned = published_at.split(".")[0].replace("Z", "")
    published_dt = datetime.strptime(cleaned, "%Y-%m-%dT%H:%M:%S").replace(tzinfo=timezone.utc)
    return (datetime.now(timezone.utc) - published_dt).days


def build_channel_record(ch, query):
    """Flatten one ``channels.list`` item into the row stored for that channel.

    Engagement fields start at zero and are filled in later by the main loop.
    Missing statistics count as 0; an unparseable publish date is reported and
    yields an account age of 0.
    """
    snippet = ch.get("snippet", {})
    statistics = ch.get("statistics", {})
    published_at = snippet.get("publishedAt")

    try:
        age_days = account_age_in_days(published_at)
    except (ValueError, TypeError, AttributeError) as e:
        print(f"⚠️ Error parsing date for {ch['id']}: {published_at} → {e}")
        age_days = None

    return {
        "channelId": ch["id"],
        "title": snippet.get("title", ""),
        "description": snippet.get("description", ""),
        "publishedAt": snippet.get("publishedAt", ""),
        "subscriberCount": int(statistics.get("subscriberCount", 0)),
        "viewCount": int(statistics.get("viewCount", 0)),
        "videoCount": int(statistics.get("videoCount", 0)),
        "accountAgeDays": age_days or 0,
        "avgLikes": 0,
        "avgComments": 0,
        "EngagementRate": 0.0,
        "PostFrequency": 0.0,
        "query": query,
    }


def fetch_recent_video_averages(client, channel_id, sample_size=RECENT_VIDEO_SAMPLE):
    """Return ``(avg_likes, avg_comments)`` over a channel's latest videos.

    Searches the channel's videos ordered by date, then reads their
    statistics. Missing ``likeCount``/``commentCount`` values count as 0 and
    both averages are truncated to whole numbers. Returns ``(0, 0)`` when the
    search yields no videos. An HttpError from either request propagates.
    """
    # NOTE(review): search.list is documented as far more quota-expensive than
    # videos.list; one such call per channel is likely what drains the daily
    # quota. Confirm against the quota cost table before scaling this up.
    videos_res = (
        client.search()
        .list(
            part="id,snippet",
            channelId=channel_id,
            maxResults=sample_size,
            order="date",
            type="video",
        )
        .execute()
    )
    video_ids = [
        v["id"]["videoId"]
        for v in videos_res.get("items", [])
        if v.get("id", {}).get("videoId")
    ]
    if not video_ids:
        return 0, 0

    video_stats = (
        client.videos()
        .list(part="statistics", id=",".join(video_ids))
        .execute()
    )
    like_counts = []
    comment_counts = []
    for v in video_stats.get("items", []):
        stats_v = v.get("statistics", {})
        like_counts.append(int(stats_v.get("likeCount", 0)))
        comment_counts.append(int(stats_v.get("commentCount", 0)))

    avg_likes = sum(like_counts) // len(like_counts) if like_counts else 0
    avg_comments = sum(comment_counts) // len(comment_counts) if comment_counts else 0
    return avg_likes, avg_comments


all_data = []
collected_ids = set()
consecutive_quota_errors = 0  # reset whenever a whole page is processed successfully
quota_exhausted = False       # set once every configured key has reported quotaExceeded

for query in SEARCH_QUERIES:
    # Stop iterating queries entirely once the target is reached or no key has quota left.
    if quota_exhausted or len(all_data) >= MAX_INFLUENCERS:
        break

    print(f"\n🔍 Searching for: {query}")
    next_page_token = None
    page_failures = 0  # consecutive non-quota failures for this query

    while len(all_data) < MAX_INFLUENCERS:
        try:
            res = (
                youtube.search()
                .list(
                    part="snippet",
                    type="channel",
                    q=query,
                    maxResults=MAX_RESULTS_PER_QUERY,
                    pageToken=next_page_token,
                )
                .execute()
            )

            channel_ids = [
                c["id"]["channelId"]
                for c in res.get("items", [])
                if c.get("id", {}).get("channelId")
            ]

            # --- Get stats for each channel ---
            # Skipped for an empty page so no request is sent with an empty id filter.
            stats = {}
            if channel_ids:
                stats = (
                    youtube.channels()
                    .list(part="snippet,statistics", id=",".join(channel_ids))
                    .execute()
                )

            for ch in stats.get("items", []):
                # Enforce the cap per channel so a page cannot overshoot it.
                if len(all_data) >= MAX_INFLUENCERS:
                    break

                ch_id = ch["id"]
                if ch_id in collected_ids:
                    continue

                data = build_channel_record(ch, query)

                # --- Skip small channels ---
                if data["subscriberCount"] < MIN_SUBSCRIBERS:
                    continue

                # --- Fetch recent videos for avg likes/comments ---
                try:
                    data["avgLikes"], data["avgComments"] = fetch_recent_video_averages(
                        youtube, ch_id
                    )
                    time.sleep(0.5)
                except HttpError as e:
                    # A quota error must reach the outer handler so the key is
                    # switched and this page retried; swallowing it here would
                    # silently record zeros for every remaining channel.
                    if is_quota_error(e):
                        raise
                    print(f"⚠️ Error fetching video stats for {ch_id}: {e}")
                    data["avgLikes"] = 0
                    data["avgComments"] = 0

                # --- Post Frequency = total posts / account age ---
                if data["accountAgeDays"] > 0:
                    data["PostFrequency"] = round(data["videoCount"] / data["accountAgeDays"], 4)
                else:
                    data["PostFrequency"] = 0.00

                # --- Engagement Rate ---
                if data["subscriberCount"] > 0:
                    data["EngagementRate"] = round(
                        (data["avgLikes"] + data["avgComments"]) / data["subscriberCount"], 4
                    )
                else:
                    data["EngagementRate"] = 0.0

                # --- Append and save ---
                all_data.append(data)
                collected_ids.add(ch_id)

                if len(all_data) % SAVE_EVERY == 0:
                    pd.DataFrame(all_data).to_csv(OUTPUT_FILE, index=False)
                    print(f"💾 Saved {len(all_data)} influencers so far...")

            # The page completed, so the failure counters start over.
            page_failures = 0
            consecutive_quota_errors = 0

            next_page_token = res.get("nextPageToken")
            if not next_page_token:
                break

        except HttpError as e:
            if is_quota_error(e):
                consecutive_quota_errors += 1
                # Once every key has failed in a row, cycling again would only
                # spin forever on exhausted keys.
                if consecutive_quota_errors >= max(len(API_KEYS), 1):
                    print("🚫 Quota exceeded on every configured API key. Stopping collection.")
                    if all_data:
                        pd.DataFrame(all_data).to_csv(OUTPUT_FILE, index=False)
                    quota_exhausted = True
                    break
                print("🚫 Quota exceeded. Switching API key...")
                switch_api_key()
                continue

            # Any other API error: retry the same page a bounded number of times.
            page_failures += 1
            if page_failures > MAX_PAGE_RETRIES:
                print(f"⚠️ Giving up on '{query}' after {MAX_PAGE_RETRIES} retries: {e}")
                break
            if "userRateLimitExceeded" in str(e):
                print("⏳ Temporary rate limit. Waiting 10s...")
                time.sleep(10)
            else:
                print(f"⚠️ API error: {e}")
                time.sleep(5)



🔍 Searching for: tech influencer
🚫 Quota exceeded. Switching API key...
 Switching API key to: AIzaSy*******************
🚫 Quota exceeded. Switching API key...
 Switching API key to: AIzaSy*******************
🚫 Quota exceeded. Switching API key...
 Switching API key to: AIzaSy*******************
🚫 Quota exceeded. Switching API key...
 Switching API key to: AIzaSy*******************
🚫 Quota exceeded. Switching API key...
 Switching API key to: AIzaSy*******************
🚫 Quota exceeded. Switching API key...
 Switching API key to: AIzaSy*******************
🚫 Quota exceeded. Switching API key...
 Switching API key to: AIzaSy*******************
🚫 Quota exceeded. Switching API key...
 Switching API key to: AIzaSy*******************
🚫 Quota exceeded. Switching API key...
 Switching API key to: AIzaSy*******************
🚫 Quota exceeded. Switching API key...
 Switchin

In [None]:
# Final save. The write is skipped when nothing was collected so that an
# unsuccessful run (for example quota exhausted on the first request) cannot
# replace a CSV saved by an earlier run with an empty file.
if all_data:
    pd.DataFrame(all_data).to_csv(OUTPUT_FILE, index=False)
    print(f"\n✅ Finished! Collected {len(all_data)} influencer channels.")
    print(f"📊 Data saved to {OUTPUT_FILE}")
else:
    print(f"\n⚠️ No influencer channels were collected; {OUTPUT_FILE} was left untouched.")