In [18]:
import os
import csv
import re
import time
import random
from collections import defaultdict
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd

In [19]:

# --- CONFIG: API Keys ---
API_KEYS = [
            "AIzaSyDPSqJudfBLzj6Z-izKUN6DmdaSPC8WFpg",
            "AIzaSyDF_0DC2m-n1oQlhvAsJWsa2p2LbpVhto4",
            "AIzaSyBY-hES9Xy0gynLTZpNeAcp14Xz_ldkQ2Q",
            "AIzaSyDB5__J5edGc6TRGDmaKZDG_KGahXXLS74",
            
            "AIzaSyB2KEWJrRO9hqHBQGYh5eMJR4-MQ4DB1Mo",
            "AIzaSyAHeARtuLLIL-sInS07z_h7BwGlg9JrLWA",
            "AIzaSyChagAJk_qPvVkz7DgtiHGallt-HoIOX2w",
            "AIzaSyAF81-PjYUxVvkYBUPGHtl_eRQrsc-Muow",
            "AIzaSyDW-AFnoOAAS6NR-0Ii8_LINfbs8iCb13U",
           
            "AIzaSyAOD_0FBQ9UIHb9SSJc276yUXjnvqR4U9Y",
            "AIzaSyALhuxQd7E5whXErpSRzKiSo0AkUzHgZ0I", 
            "AIzaSyAUx3VdZFy9BL3zZ9IW7xesmm1Uk50B9m8", 
            "AIzaSyCQ1nsL1E7R3amaBf8nycoXIbazDfHc2h8",

            "AIzaSyC-uc79BhqxpYv9lYn-gDv5Ksgsk6HAVkA",
            "AIzaSyCpbaDn_1QrbpabT78XjYSDqqrBj9Mul3M",
            "AIzaSyDZaHVcLMAt4ybeKmcg6kc1QWr7UImXUvk",
            "AIzaSyBqPEXiaitV7JS4_bNur13Elds0QpcTkCw",
            "AIzaSyAV-GK-LzA6tam7lHhLJ-_ysL4r4JKCWgA",
            "AIzaSyAp-dIFALPjFBuknzbFQ5ijfDwy2idDxTc",
            "AIzaSyDmHSXtCumihEQL2yr2qUxf3WQGzUA7Enc",
           
            "AIzaSyDFbglyzqzbrOdPsYXkL8X9rfFCdwiseH0",
            "AIzaSyBwRfVuq1pDI7dhOgKVSBL6xX0U1NepQ8s",
            "AIzaSyA8mbtCqaJLZn94cKlSM7IafHnWRh7JrNg",
            "AIzaSyDxIIhH7h9tiS9Oe6lk2c1mZnCCVmaDviI",
            "AIzaSyA1mGZ4N7RTABnIOEXdTdfGpz8xnHgwBNA",

            "AIzaSyAdDUtZA1Jj_GIvX3EEos9j9eGSpXkODQs",
            "AIzaSyAdfUdlPhINTAsf1CZ-NyBA2RGq3XIrZeg",

            "AIzaSyC7Ohnil-cT9vKYY27hEF8xCJ4Yyov8nJI",
            "AIzaSyA27JoITYWc2ooi3UGFlF-bmh2SIDh7Qv8"
           ]
key_index = 0

def get_youtube_service():
    global key_index
    key = API_KEYS[key_index % len(API_KEYS)]
    key_index += 1
    return build("youtube", "v3", developerKey=key)

youtube = get_youtube_service()

In [20]:
# --- UTILS ---
def safe_filename(name):
    return re.sub(r"[^\w]+", "_", name).strip("_").lower()

def normalize_text(text):
    return re.sub(r'[^a-z0-9@]', '', text.lower())

def extract_mentions_tags(text):
    mentions = re.findall(r'@([\w\d_]+)', text)
    hashtags = re.findall(r'#(\w+)', text)
    return [normalize_text(m) for m in mentions], hashtags

def execute_with_backoff(request, max_retries=5):
    for n in range(max_retries):
        try:
            return request.execute()
        except HttpError as e:
            if e.resp.status in [403, 500, 503]:
                sleep_time = (2 ** n) + random.uniform(0, 1)
                print(f"[Backoff] Sleeping for {sleep_time:.2f}s due to error: {e}")
                time.sleep(sleep_time)
                global youtube
                youtube = get_youtube_service()
            else:
                raise
    raise Exception("Max retries exceeded")

In [21]:
# --- RESOLVE CHANNEL HANDLE ---
def resolve_channel_handle(handle):
    try:
        response = execute_with_backoff(
            youtube.search().list(q=f"@{handle}", type="channel", part="snippet", maxResults=1)
        )
        if response['items']:
            return response['items'][0]['snippet']['channelId']
    except Exception as e:
        print(f"[WARN] Failed to resolve @{handle}: {e}")
    return None

In [22]:
# --- GET VIDEO DETAILS ---
def get_video_details(video_id):
    try:
        response = execute_with_backoff(
            youtube.videos().list(part="snippet,statistics", id=video_id)
        )
        if not response or not response['items']:
            return None
        item = response['items'][0]
        snippet = item['snippet']
        stats = item.get("statistics", {})
        desc = snippet.get("description", "")
        mentions, hashtags = extract_mentions_tags(desc)
        return {
            "video_id": video_id,
            "title": snippet.get("title"),
            "channel": snippet.get("channelTitle"),
            "channelId": snippet.get("channelId"),
            "published": snippet.get("publishedAt"),
            "description": desc,
            "tags": snippet.get("tags", []),
            "mentions": mentions,
            "hashtags": hashtags,
            "viewCount": int(stats.get("viewCount", 0))
        }
    except Exception as e:
        print(f"[ERROR] Video fetch failed: {e}")
        return None

In [23]:
# --- GET COMMENTS & REPLIES ---
def get_comments_and_replies(video_id, max_results=100):
    comments, replies = [], []
    try:
        response = execute_with_backoff(
            youtube.commentThreads().list(part="snippet,replies", videoId=video_id, maxResults=max_results, textFormat="plainText")
        )
        for item in response.get("items", []):
            top = item['snippet']['topLevelComment']['snippet']
            comment_id = item['snippet']['topLevelComment']['id']
            comments.append({
                "comment_id": comment_id,
                "video_id": video_id,
                "author": top['authorDisplayName'],
                "published_at": top['publishedAt'],
                "text": top['textDisplay']
            })
            for reply in item.get("replies", {}).get("comments", []):
                replies.append({
                    "video_id": video_id,
                    "in_reply_to": comment_id,
                    "parent_author": top['authorDisplayName'],
                    "author": reply['snippet']['authorDisplayName'],
                    "published_at": reply['snippet']['publishedAt'],
                    "text": reply['snippet']['textDisplay']
                })
    except Exception as e:
        print(f"[WARN] Comments fetch failed: {e}")
    return comments, replies

In [24]:
# --- MAIN FUNCTION ---
def search_and_save(query, max_pages=5):
    query_safe = safe_filename(query)
    os.makedirs(query_safe, exist_ok=True)

    if os.path.exists(os.path.join(query_safe, "edges.csv")):
        print(f"[SKIP] Already done: {query}")
        return

    seen_video_ids = set()
    edge_counter = defaultdict(lambda: {"mentions": 0, "views": 0, "video_ids": set()})
    handle_to_channelId = {}

    with open(os.path.join(query_safe, "videos.csv"), "w", newline="", encoding="utf-8") as vf, \
         open(os.path.join(query_safe, "comments.csv"), "w", newline="", encoding="utf-8") as cf, \
         open(os.path.join(query_safe, "replies.csv"), "w", newline="", encoding="utf-8") as rf:

        vw = csv.DictWriter(vf, fieldnames=["video_id", "title", "channel", "channelId", "published", "description", "tags", "mentions", "hashtags", "views"])
        cw = csv.DictWriter(cf, fieldnames=["comment_id", "video_id", "author", "published_at", "text"])
        rw = csv.DictWriter(rf, fieldnames=["video_id", "in_reply_to", "parent_author", "author", "published_at", "text"])
        vw.writeheader(); cw.writeheader(); rw.writeheader()

        next_page_token = None
        for page in range(max_pages):
            print(f"[INFO] Query: {query} — Page {page + 1}/{max_pages}")
            response = execute_with_backoff(
                youtube.search().list(q=query, type="video", part="id", maxResults=50, pageToken=next_page_token)
            )
            if not response: break

            for item in response.get("items", []):
                video_id = item['id']['videoId']
                if video_id in seen_video_ids:
                    continue
                seen_video_ids.add(video_id)

                video_data = get_video_details(video_id)
                if not video_data:
                    continue

                vw.writerow({
                    "video_id": video_id,
                    **{k: video_data[k] for k in ["title", "channel", "channelId", "published", "description"]},
                    "tags": ", ".join(video_data["tags"]),
                    "mentions": ", ".join(video_data["mentions"]),
                    "hashtags": ", ".join(video_data["hashtags"]),
                    "views": video_data["viewCount"]
                })

                for mention in video_data["mentions"]:
                    if mention not in handle_to_channelId:
                        channel_id = resolve_channel_handle(mention)
                        if channel_id:
                            handle_to_channelId[mention] = channel_id
                        else:
                            continue
                    edge_key = (video_data["channelId"], handle_to_channelId[mention])
                    edge_counter[edge_key]["mentions"] += 1
                    edge_counter[edge_key]["views"] += video_data["viewCount"]
                    edge_counter[edge_key]["video_ids"].add(video_id)

                comments, replies = get_comments_and_replies(video_id)
                for c in comments: cw.writerow(c)
                for r in replies: rw.writerow(r)

            next_page_token = response.get("nextPageToken")
            if not next_page_token:
                break

    with open(os.path.join(query_safe, "edges.csv"), "w", newline="", encoding="utf-8") as ef:
        ew = csv.DictWriter(ef, fieldnames=["source_channelId", "target_channelId", "mention_count", "total_views", "video_ids"])
        ew.writeheader()
        for (src, tgt), stats in edge_counter.items():
            ew.writerow({
                "source_channelId": src,
                "target_channelId": tgt,
                "mention_count": stats["mentions"],
                "total_views": stats["views"],
                "video_ids": ";".join(stats["video_ids"])
            })

    print(f"[✔] Done: {query} — Data saved in '{query_safe}'")

In [25]:
# --- MAIN FUNCTION ---
def search_and_save(query, max_pages=5):
    query_safe = safe_filename(query)
    os.makedirs(query_safe, exist_ok=True)

    if os.path.exists(os.path.join(query_safe, "edges.csv")):
        print(f"[SKIP] Already done: {query}")
        return

    seen_video_ids = set()
    edge_counter = defaultdict(lambda: {"mentions": 0, "views": 0, "video_ids": set()})
    handle_to_channelId = {}

    def crawl_channel_mentions(channel_id, current_depth=1, max_depth=1):
        if current_depth > max_depth:
            return
        try:
            response = execute_with_backoff(
                youtube.search().list(part="id", type="video", channelId=channel_id, maxResults=5)
            )
            for item in response.get("items", []):
                vid = item["id"]["videoId"]
                vdata = get_video_details(vid)
                if not vdata:
                    continue
                for m in vdata["mentions"]:
                    if m not in handle_to_channelId:
                        resolved = resolve_channel_handle(m)
                        if resolved:
                            handle_to_channelId[m] = resolved
                        else:
                            continue
                    edge_key = (channel_id, handle_to_channelId[m])
                    edge_counter[edge_key]["mentions"] += 1
                    edge_counter[edge_key]["video_ids"].add(vid)
        except Exception as e:
            print(f"[Recurse WARN] {e}")

    with open(os.path.join(query_safe, "videos.csv"), "w", newline="", encoding="utf-8") as vf,          open(os.path.join(query_safe, "comments.csv"), "w", newline="", encoding="utf-8") as cf,          open(os.path.join(query_safe, "replies.csv"), "w", newline="", encoding="utf-8") as rf:

        vw = csv.DictWriter(vf, fieldnames=["video_id", "title", "channel", "channelId", "published", "description", "tags", "mentions", "hashtags", "views"])
        cw = csv.DictWriter(cf, fieldnames=["comment_id", "video_id", "author", "published_at", "text"])
        rw = csv.DictWriter(rf, fieldnames=["video_id", "in_reply_to", "parent_author", "author", "published_at", "text"])
        vw.writeheader(); cw.writeheader(); rw.writeheader()

        next_page_token = None
        for page in range(max_pages):
            print(f"[INFO] Query: {query} — Page {page + 1}/{max_pages}")
            response = execute_with_backoff(
                youtube.search().list(q=query, type="video", part="id", maxResults=50, pageToken=next_page_token)
            )
            if not response: break

            for item in response.get("items", []):
                video_id = item['id']['videoId']
                if video_id in seen_video_ids:
                    continue
                seen_video_ids.add(video_id)

                video_data = get_video_details(video_id)
                if not video_data:
                    continue

                vw.writerow({
                    "video_id": video_id,
                    **{k: video_data[k] for k in ["title", "channel", "channelId", "published", "description"]},
                    "tags": ", ".join(video_data["tags"]),
                    "mentions": ", ".join(video_data["mentions"]),
                    "hashtags": ", ".join(video_data["hashtags"]),
                    "views": video_data["viewCount"]
                })

                # Layer 1
                for mention in video_data["mentions"]:
                    if mention not in handle_to_channelId:
                        channel_id = resolve_channel_handle(mention)
                        if channel_id:
                            handle_to_channelId[mention] = channel_id
                        else:
                            continue
                    edge_key = (video_data["channelId"], handle_to_channelId[mention])
                    edge_counter[edge_key]["mentions"] += 1
                    edge_counter[edge_key]["views"] += video_data["viewCount"]
                    edge_counter[edge_key]["video_ids"].add(video_id)

                # Layer 4: Recursively analyze mentioned channels
                for m in video_data["mentions"]:
                    ch_id = handle_to_channelId.get(m)
                    if ch_id:
                        crawl_channel_mentions(ch_id)

                comments, replies = get_comments_and_replies(video_id)
                for c in comments: cw.writerow(c)
                for r in replies: rw.writerow(r)

                # Layer 2: Commenter → Mentioned Handle
                for comment in comments:
                    c_mentions, _ = extract_mentions_tags(comment["text"])
                    for m in c_mentions:
                        if m not in handle_to_channelId:
                            resolved = resolve_channel_handle(m)
                            if resolved:
                                handle_to_channelId[m] = resolved
                            else:
                                continue
                        edge_key = (comment["author"], handle_to_channelId[m])
                        edge_counter[edge_key]["mentions"] += 1
                        edge_counter[edge_key]["video_ids"].add(video_id)

                # Layer 3: Replier → Parent Commenter
                for reply in replies:
                    if reply["author"] != reply["parent_author"]:
                        edge_key = (reply["author"], reply["parent_author"])
                        edge_counter[edge_key]["mentions"] += 1
                        edge_counter[edge_key]["video_ids"].add(reply["video_id"])

            next_page_token = response.get("nextPageToken")
            if not next_page_token:
                break

    with open(os.path.join(query_safe, "edges.csv"), "w", newline="", encoding="utf-8") as ef:
        ew = csv.DictWriter(ef, fieldnames=["source_channelId", "target_channelId", "mention_count", "total_views", "video_ids"])
        ew.writeheader()
        for (src, tgt), stats in edge_counter.items():
            ew.writerow({
                "source_channelId": src,
                "target_channelId": tgt,
                "mention_count": stats["mentions"],
                "total_views": stats.get("views", 0),
                "video_ids": ";".join(stats["video_ids"])
            })

    print(f"[✔] Done: {query} — Data saved in '{query_safe}'")

In [26]:
# --- QUERIES ---
queries = [
    # "Sabrina Carpenter music video", 
    # "Sabrina Carpenter personal life",
    # "Sabrina Carpenter reactions",
    # "Sabrina Carpenter interviews",
    
    # "Sabrina Carpenter acting career",
    # "Sabrina Carpenter live performance",
    # " Sabrina Carpenter controversy"
]

for q in queries:
    search_and_save(q)

[INFO] Query: sabrina carpenter vlog — Page 1/5
[Backoff] Sleeping for 1.07s due to error: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/search?q=sabrina+carpenter+vlog&type=video&part=id&maxResults=50&key=AIzaSyDPSqJudfBLzj6Z-izKUN6DmdaSPC8WFpg&alt=json returned "The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.". Details: "[{'message': 'The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.', 'domain': 'youtube.quota', 'reason': 'quotaExceeded'}]">
[Backoff] Sleeping for 2.43s due to error: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/search?q=sabrina+carpenter+vlog&type=video&part=id&maxResults=50&key=AIzaSyDPSqJudfBLzj6Z-izKUN6DmdaSPC8WFpg&alt=json returned "The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.". Details: "[{'message': 

Exception: Max retries exceeded

In [84]:
# === CONFIGURATION ===
DAILY_QUOTA_PER_KEY = 9500
QUOTA_UNIT_PER_REQUEST = 1
MAX_DAILY_QUOTA = DAILY_QUOTA_PER_KEY * len(API_KEYS)

# === YOUTUBE SERVICE ROTATOR ===
api_key_index = 0

def get_youtube_service():
    global api_key_index
    api_key = API_KEYS[api_key_index]
    api_key_index = (api_key_index + 1) % len(API_KEYS)
    return build("youtube", "v3", developerKey=api_key)

def execute_with_backoff(request):
    retries = 5
    for i in range(retries):
        try:
            return request.execute()
        except HttpError as e:
            if e.resp.status in [403, 500, 503]:
                wait = (2 ** i) + random.uniform(0, 1)
                print(f"Retrying in {wait:.2f} seconds due to error: {e}")
                time.sleep(wait)
            else:
                raise

In [85]:
def fetch_channel_names(channel_ids, cache_file="channel_names.csv"):
    import os

    # Load existing cache
    if os.path.exists(cache_file):
        existing_df = pd.read_csv(cache_file)
        existing = dict(zip(existing_df["channel_id"], existing_df["channel_name"]))
    else:
        existing_df = pd.DataFrame(columns=["channel_id", "channel_name"])
        existing = {}

    # Filter out only valid channel IDs (i.e., those starting with 'UC')
    valid_channel_ids = [cid for cid in channel_ids if isinstance(cid, str) and cid.startswith("UC")]
    to_fetch = list(set(valid_channel_ids) - set(existing.keys()))

    print(f"Found {len(existing)} already fetched. Fetching {len(to_fetch)} new valid channel IDs...")

    new_data = []
    quota_used = 0

    for i in range(0, len(to_fetch), 50):
        if quota_used >= MAX_DAILY_QUOTA:
            print("Reached daily quota limit.")
            break

        batch = to_fetch[i:i + 50]
        youtube = get_youtube_service()

        try:
            response = execute_with_backoff(
                youtube.channels().list(part="snippet", id=",".join(batch))
            )
            for item in response.get("items", []):
                cid = item["id"]
                title = item["snippet"]["title"]
                existing[cid] = title
                new_data.append((cid, title))

        except HttpError as e:
            error_reason = None
            try:
                error_reason = e.error_details[0]['reason']
            except:
                pass
            print(f"Error: {e}")
            if "quotaExceeded" in str(e) or error_reason == "quotaExceeded":
                print("Quota exceeded. Stopping fetch.")
                break
            continue

        # Write batch immediately to cache to avoid loss
        if new_data:
            batch_df = pd.DataFrame(new_data, columns=["channel_id", "channel_name"])
            existing_df = pd.concat([existing_df, batch_df], ignore_index=True).drop_duplicates(subset="channel_id")
            existing_df.to_csv(cache_file, index=False)
            print(f"Saved batch of {len(new_data)}. Total cached: {len(existing_df)}")
            new_data = []

        quota_used += len(batch)
        time.sleep(0.1)  # Optional to reduce rate

    print("Channel name fetching completed.")
    return existing_df

In [86]:
def add_channel_names_to_csv(input_file, output_file, columns_to_map, name_map_df):
    df = pd.read_csv(input_file)
    map_dict = dict(zip(name_map_df["channel_id"], name_map_df["channel_name"]))

    for col in columns_to_map:
        name_col = f"{col}_name"
        df[name_col] = df[col].map(map_dict).fillna("Unknown")

    df.to_csv(output_file, index=False)
    print(f"Saved updated file: {output_file}")


In [105]:
def main():
    base_path = "sabrina_carpenter_controversy"
    # "sabrina_carpenter_vlog"

    # Load all channel/account IDs from CSVs
    edge_df = pd.read_csv(os.path.join(base_path, "edges.csv"))

    ids = set()
    ids.update(edge_df["source_channelId"].dropna().unique())
    ids.update(edge_df["target_channelId"].dropna().unique())
    
    # Fetch and cache channel names
    cache_file = os.path.join(base_path, "channel_names.csv")
    name_df = fetch_channel_names(list(ids), cache_file=cache_file)

    # Add names to each file
    add_channel_names_to_csv(
        os.path.join(base_path, "edges.csv"),
        os.path.join(base_path, "edges_named.csv"),
        ["source_channelId", "target_channelId"],
        name_df,
    )

if __name__ == "__main__":
    main()

Found 0 already fetched. Fetching 61 new valid channel IDs...
[Backoff] Sleeping for 1.20s due to error: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/channels?part=snippet&id=UCVjj_osiKAP_qTtJAr6IrvA%2CUCJJhdAgEbBGZHUa5n0tBP3g%2CUCT7xAt9yyxfBnks4k9XyAkQ%2CUC0LfERYIbu4Qc0ssLiDWdNw%2CUCzBm4uZ0RKT1u8CSdSwzG-w%2CUCIPM9B55iGbfpbxQh4Sya-g%2CUCe7KQjGEUloxCv5yB4fSIOA%2CUCWljxewHlJE3M7U_6_zFNyA%2CUCVVuSskTN_iHl_A-Tb8yA1w%2CUC_6hQy4elsyHhCOskZo0U5g%2CUCfXFq0qbfJj5q35l6HE-7sg%2CUCR1D15p_vdP3HkrH8wgjQRw%2CUCE-nOavlAVYoXGKS6zxvodA%2CUCPKWE1H6xhxwPlqUlKgHb_w%2CUCICMWH9LAtAh8sd9FoOfEfQ%2CUCYRE3PMYatMVXmqmulF0sgA%2CUCNGIytL4kzna9zJzAHaoZ2g%2CUCyYhxPxa-bYr-IIAGIV33PQ%2CUCO11MeHgFejntXiaqFLyo_w%2CUChfrWzVNU6qN0690-7yLoiw%2CUCpbWpuupuBZS0wtvLPOcuWQ%2CUCjYlRtRXsSQN_wYokukB-Ew%2CUCaaItXO5SYv_MTuExUlh-tA%2CUCQLvcN41js-UutOFkUFHLmA%2CUC39eHhTEqiBlqRbmIIYjm-g%2CUC_cvTMeip9po2hZdF3aBXrA%2CUCEMVnIDQua4ZJlUsQ6G775w%2CUC4oVjJptcsvtgIMrdIUQYlQ%2CUC-6-LnSINysVagonyKlQBcg%2CUCflpy-GNxlahb

Exception: Max retries exceeded

In [None]:
import os
import pandas as pd

# List of all relevant folders
folders = [
    "sabrina_carpenter_personal_life",
    "sabrina_carpenter_music_video",
    "sabrina_carpenter_reactions",
    "sabrina_carpenter_interviews",
    "sabrina_carpenter_acting_career",
    "sabrina_carpenter_live_performance",
    "sabrina_carpenter_controversy"
]

# Map of input filenames (to look for in each folder) and the corresponding combined output filename
file_map = {
    "comments.csv": "comments_combined.csv",
    "replies.csv": "replies_combined.csv",
    "videos.csv": "videos_combined.csv",
    "edges_named.csv": "edges_combined.csv"
}

def combine_files():
    for input_file, output_file in file_map.items():
        combined_df = pd.DataFrame()
        for folder in folders:
            file_path = os.path.join(folder, input_file)
            if os.path.exists(file_path):
                print(f"Loading {file_path}...")
                df = pd.read_csv(file_path)
                df["source_folder"] = folder  # Optional: track origin
                combined_df = pd.concat([combined_df, df], ignore_index=True)
            else:
                print(f"File {input_file} not found in {folder}")
        combined_df.to_csv(output_file, index=False)
        print(f"Combined file saved as: {output_file}")
        print(f"Columns in {output_file}:")
        print(list(combined_df.columns))
        print("-" * 60)

if __name__ == "__main__":
    combine_files()

Loading sabrina_carpenter_personal_life\comments.csv...
Loading sabrina_carpenter_music_video\comments.csv...
Loading sabrina_carpenter_reactions\comments.csv...
Loading sabrina_carpenter_interviews\comments.csv...
Loading sabrina_carpenter_acting_career\comments.csv...
Loading sabrina_carpenter_live_performance\comments.csv...
Loading sabrina_carpenter_controversy\comments.csv...
✅ Combined file saved as: comments_combined.csv
📊 Columns in comments_combined.csv:
['comment_id', 'video_id', 'author', 'published_at', 'text', 'source_folder']
------------------------------------------------------------
Loading sabrina_carpenter_personal_life\replies.csv...
Loading sabrina_carpenter_music_video\replies.csv...
Loading sabrina_carpenter_reactions\replies.csv...
Loading sabrina_carpenter_interviews\replies.csv...
Loading sabrina_carpenter_acting_career\replies.csv...
Loading sabrina_carpenter_live_performance\replies.csv...
Loading sabrina_carpenter_controversy\replies.csv...
✅ Combined file 