# Downloading and Preparing YouTube Transcripts  
Transcripts were extracted directly from selected YouTube videos using the `youtube_transcript_api` library, then saved together with the video titles to a CSV file for further cleaning and analysis.  


### Importing Libraries

In [None]:
from youtube_transcript_api import YouTubeTranscriptApi
import pandas as pd
import requests
import os
import openai
import time
from tqdm import tqdm

### Download YouTube Transcripts and Titles to CSV


In [29]:
# YouTube Data API key, read from the YOUTUBE_API_KEY environment variable so
# the secret is never stored in the notebook.
# NOTE: a literal key used to be hard-coded on this line; it should be treated
# as compromised and rotated.
API_KEY = os.environ.get("YOUTUBE_API_KEY", "")
if not API_KEY:
    # Best-effort: the run continues, but title lookups are expected to be
    # rejected by the API without a valid key.
    print(" YOUTUBE_API_KEY is not set; video titles may come back as 'Unknown Title'.")

# List of YouTube Video IDs
VIDEO_IDS = [
    "DLgNW9lCAaU", "r4K8V8btCtY", "kRhg1dPS4Gw", "Cj8wtEFNVog", "_CLZf58vzbc",
    "dbVki3gPYZs", "C9sH8AD4jys", "6zEf9o-tTpc", "_tSI_JV5lZY", "ReDGjoFTn58",
    "yRy2FOpCvws",
]

# Output file (CSV with one row per downloaded transcript)
OUTPUT_FILE = "YouTube_Transcripts.csv"

# Function to get video title
def get_video_title(video_id):
    """Return the title of a YouTube video, or "Unknown Title" if unavailable.

    Queries the YouTube Data API v3 ``videos`` endpoint with ``part=snippet``,
    authenticating with the module-level ``API_KEY``.

    Args:
        video_id: ID of the video to look up.

    Returns:
        The ``snippet.title`` of the first item in the response, or the string
        "Unknown Title" when the request fails or times out, the status is not
        200, the body is not valid JSON, or no item/title is present.
    """
    url = "https://www.googleapis.com/youtube/v3/videos"
    # Let requests build and URL-encode the query string.
    params = {"part": "snippet", "id": video_id, "key": API_KEY}

    try:
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get(url, params=params, timeout=10)
    except requests.RequestException:
        # A failed title lookup must not prevent the transcript download.
        return "Unknown Title"

    if response.status_code != 200:
        return "Unknown Title"

    try:
        data = response.json()
        return data["items"][0]["snippet"]["title"]
    except (ValueError, KeyError, IndexError, TypeError):
        # Invalid JSON, empty "items" list, or an unexpected response shape.
        return "Unknown Title"

# Load previously saved transcripts (if any) along with the IDs they cover
csv_present = os.path.exists(OUTPUT_FILE)
existing_df = pd.read_csv(OUTPUT_FILE) if csv_present else pd.DataFrame()
existing_video_ids = existing_df["Video_ID"].tolist() if csv_present else []

# Collects one record (ID, title, full text) per newly downloaded transcript
transcripts_data = []

# Download the title and transcript of every video that is not in the CSV yet
for video_id in VIDEO_IDS:
    already_saved = video_id in existing_video_ids
    if already_saved:
        print(f" Transcript for {video_id} already exists. Skipping...")
        continue

    try:
        video_title = get_video_title(video_id)
        segments = YouTubeTranscriptApi.get_transcript(video_id)
        # Flatten the per-segment entries into one space-separated string
        transcript_text = " ".join(segment["text"] for segment in segments)

        record = {
            "Video_ID": video_id,
            "Video_Title": video_title,
            "Transcript": transcript_text,
        }
        transcripts_data.append(record)

        print(f"✅ Downloaded transcript for {video_title} ({video_id})")
    except Exception as err:
        # Any failure for this video is reported and the loop moves on
        print(f" Could not fetch transcript for {video_id}: {err}")

# Merge the new records with any existing rows and rewrite the CSV file
new_df = pd.DataFrame(transcripts_data)
if new_df.empty:
    print(" No new transcripts were added.")
else:
    final_df = (
        new_df
        if existing_df.empty
        else pd.concat([existing_df, new_df], ignore_index=True)
    )
    final_df.to_csv(OUTPUT_FILE, index=False, encoding="utf-8")
    print(f"📂 New transcripts added to {OUTPUT_FILE}")


✅ Downloaded transcript for Why the Saudi Arabian Grand Prix is so controversial (DLgNW9lCAaU)
✅ Downloaded transcript for Saudi Arabia Exploits F-1 Racing For Sportswashing (r4K8V8btCtY)
✅ Downloaded transcript for Formula 1: Lewis Hamilton speaks out about LGBTQ+ rights in Saudi Arabia (kRhg1dPS4Gw)
✅ Downloaded transcript for Stefano Domenicali responds to criticisms of F1 hosting races in Qatar & Saudi Arabia (Cj8wtEFNVog)
✅ Downloaded transcript for Bahrain's Grand Prix Sparks Human Rights Protests (_CLZf58vzbc)
✅ Downloaded transcript for Sir Lewis Hamilton confronts human rights issues as Formula One returns (dbVki3gPYZs)
✅ Downloaded transcript for Bernie Ecclestone: F1 and human rights concerns (C9sH8AD4jys)
✅ Downloaded transcript for Formula 1's Europe Problem (6zEf9o-tTpc)
✅ Downloaded transcript for I FLEW to a F1 race in the MIDDLE EAST... (_tSI_JV5lZY)
✅ Downloaded transcript for How the Abu Dhabi F1 Track was Built (ReDGjoFTn58)
✅ Downloaded transcript for David Beckham

### Set up API key, file paths, and expected columns


In [None]:
# OpenAI API key: taken from the OPENAI_API_KEY environment variable so the
# secret does not have to be pasted into the notebook. The masked placeholder
# is kept as the default and must be replaced if the variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "************")

# Input and output file paths
INPUT_FILE = "YouTube_Transcripts.csv"
# NOTE: this rebinds OUTPUT_FILE, which the download cell also assigns
# ("YouTube_Transcripts.csv"); re-run this cell before the processing cells if
# the download cell was executed afterwards.
OUTPUT_FILE = "Processed_YouTube_Transcripts.csv"

# Expected columns in the input
REQUIRED_COLUMNS = ["Video_ID", "Video_Title", "Transcript"]


### Check input file and set up output CSV


In [None]:
# Fail fast when the transcripts CSV is missing
input_present = os.path.exists(INPUT_FILE)
if not input_present:
    raise FileNotFoundError(f" Input file '{INPUT_FILE}' not found!")

# Read the transcripts into a DataFrame
df = pd.read_csv(INPUT_FILE)

# Report the first required column that the CSV lacks
missing_columns = [name for name in REQUIRED_COLUMNS if name not in df.columns]
if missing_columns:
    raise KeyError(f" Missing required column: {missing_columns[0]} in CSV file!")

# Create a header-only output CSV on first run; result rows are appended later
output_columns = [
    "Video_ID", "Video_Title", "Transcript",
    "Summary", "Topic", "Sentiment_Score", "Misinformation_Flag",
]
if not os.path.exists(OUTPUT_FILE):
    header_only = pd.DataFrame(columns=output_columns)
    header_only.to_csv(OUTPUT_FILE, index=False)


### Define function to process transcript with GPT-4o


In [None]:
# Labels the model is asked to emit, one per line, in its reply.
_RESPONSE_FIELDS = ("Summary", "Topic", "Sentiment_Score", "Misinformation_Flag")


def _parse_sentiment(value):
    """Map a raw sentiment string to -1, 0 or 1; anything unrecognized becomes 0."""
    tokens = value.strip("<>*` ").split()
    if not tokens:
        return 0
    try:
        # int() accepts "+1" and "-1"; trailing punctuation is dropped first.
        score = int(tokens[0].rstrip(".,;:)"))
    except ValueError:
        return 0
    return score if score in (-1, 0, 1) else 0


def _parse_analysis_response(response_text):
    """Split the model reply into a dict of the labelled fields that were found."""
    fields = {}
    current = None
    for raw_line in response_text.split("\n"):
        line = raw_line.strip()
        # Labels are matched on a copy without markdown bold markers, in case
        # the reply wraps them (e.g. "**Topic:** ...").
        plain = line.replace("**", "").strip()
        label = next(
            (name for name in _RESPONSE_FIELDS if plain.startswith(name + ":")),
            None,
        )
        if label is not None:
            # Drop only the leading "Label:" so later occurrences stay intact.
            fields[label] = plain[len(label) + 1:].strip()
            current = label
        elif line and current is not None and (current == "Summary" or not fields[current]):
            # Continuation line: a multi-line summary, or a value placed on
            # the line after its label.
            fields[current] = f"{fields[current]} {line}".strip()
    return fields


def process_transcript(transcript):
    """Summarize, classify and score one transcript with GPT-4o.

    Args:
        transcript: Transcript text. Missing (NaN/None) or whitespace-only
            values are treated as "no content" and are not sent to the API.

    Returns:
        A dict with the keys "Summary", "Topic", "Sentiment_Score" (-1, 0 or 1)
        and "Misinformation_Flag". Placeholder values are returned when the
        transcript is empty or when the API call or response handling fails.
    """
    # pd.isna is checked first because NaN is a float; str() keeps any other
    # non-string value from crashing on .strip().
    if pd.isna(transcript) or str(transcript).strip() == "":
        return {
            "Summary": "No meaningful content",
            "Topic": "Uncategorized",
            "Sentiment_Score": 0,
            "Misinformation_Flag": "Unknown"
        }

    prompt = f"""
    You are analyzing a transcript for a research study on sportswashing and misinformation in global sports. Your tasks:

    1. Summarize the transcript (≤ 600 words)
    2. Classify it into one of:
       - Sportswashing
       - Human Rights Issues
       - Geopolitical Influence
       - Financial Ethics in Sports
       - Misinformation in Sports
       - Other
    3. Assign a sentiment score:
       - -1 = Negative (critical of Gulf investments, ethics, rights)
       -  0 = Neutral (balanced or factual)
       -  1 = Positive (supportive of Gulf events/investments)
    4. Detect misinformation (Yes/No)

    Response format:
    Summary: <...>
    Topic: <...>
    Sentiment_Score: <-1/0/1>
    Misinformation_Flag: <Yes/No>

    Now process this transcript:
    {transcript}
    """

    client = openai.OpenAI(api_key=OPENAI_API_KEY)

    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are analyzing YouTube transcripts for a sportswashing research study."},
                {"role": "user", "content": prompt}
            ],
            timeout=20
        )

        # content may be None; treat that as an empty reply.
        response_text = (response.choices[0].message.content or "").strip()
        fields = _parse_analysis_response(response_text)

        # Fields missing from the reply keep the same placeholders as before.
        summary = fields.get("Summary", "Error in processing")
        topic = fields.get("Topic", "Error in classification")
        sentiment_score = _parse_sentiment(fields.get("Sentiment_Score", ""))
        misinformation_flag = fields.get("Misinformation_Flag", "Unknown")

        if not topic or topic.lower() in ["", "none", "error in classification"]:
            print(" Empty topic detected, retrying classification...")
            topic = retry_topic_classification(transcript)

        return {
            "Summary": summary,
            "Topic": topic,
            "Sentiment_Score": sentiment_score,
            "Misinformation_Flag": misinformation_flag
        }

    except Exception as e:
        # Best-effort: report the failure and return placeholders so the
        # caller can still record a row for this transcript.
        print(f"Error extracting GPT-4o response: {e}")
        return {
            "Summary": "Error in processing",
            "Topic": "Error in classification",
            "Sentiment_Score": 0,
            "Misinformation_Flag": "Unknown"
        }


### Add fallback if GPT doesn't return a topic


In [None]:
def retry_topic_classification(transcript):
    """Ask GPT-4o for the topic only and return one of the six known categories.

    Args:
        transcript: Transcript text to classify.

    Returns:
        One of "Sportswashing", "Human Rights Issues", "Geopolitical Influence",
        "Financial Ethics in Sports", "Misinformation in Sports" or "Other".
        "Other" is returned when the request fails or when the reply does not
        name any of the categories.
    """
    allowed_topics = (
        "Sportswashing",
        "Human Rights Issues",
        "Geopolitical Influence",
        "Financial Ethics in Sports",
        "Misinformation in Sports",
        "Other",
    )

    retry_prompt = f"""
    Classify this transcript into one of the following categories:
    - Sportswashing
    - Human Rights Issues
    - Geopolitical Influence
    - Financial Ethics in Sports
    - Misinformation in Sports
    - Other

    Response format:
    Topic: <One of the six categories>

    Transcript:
    {transcript}
    """

    client = openai.OpenAI(api_key=OPENAI_API_KEY)
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are analyzing transcript topics for a research study."},
                {"role": "user", "content": retry_prompt}
            ],
            timeout=10
        )

        # content may be None; treat that as an empty reply.
        reply = (response.choices[0].message.content or "").replace("Topic:", "").strip()
    except Exception as e:
        print(f" Topic retry failed: {e}")
        return "Other"

    # Normalize the reply to a canonical category so the Topic column only
    # ever holds one of the six expected values.
    cleaned = reply.strip("*<>.\"'` \n").casefold()
    for category in allowed_topics:
        if cleaned == category.casefold():
            return category

    # The reply may embed the category in a sentence; "Other" is excluded from
    # this substring search because it is the fallback anyway.
    for category in allowed_topics[:-1]:
        if category.casefold() in cleaned:
            return category

    return "Other"


### Loop through transcripts and save results to CSV


In [8]:
# Analyse every transcript and append each result to OUTPUT_FILE as soon as it
# is available, so completed rows are already on disk if the run is interrupted.
failed_count = 0
for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing Transcripts"):
    transcript = row["Transcript"]

    try:
        processed_result = process_transcript(transcript)
        # Key order must match the header written when OUTPUT_FILE was
        # created, because rows are appended with header=False.
        new_row = {
            "Video_ID": row["Video_ID"],
            "Video_Title": row["Video_Title"],
            "Transcript": transcript,
            "Summary": processed_result["Summary"],
            "Topic": processed_result["Topic"],
            "Sentiment_Score": processed_result["Sentiment_Score"],
            "Misinformation_Flag": processed_result["Misinformation_Flag"]
        }

        pd.DataFrame([new_row]).to_csv(OUTPUT_FILE, mode='a', header=False, index=False, encoding="utf-8")

    except Exception as e:
        failed_count += 1
        # str() because a missing transcript is NaN (a float) and slicing it
        # would raise TypeError inside this handler.
        print(f"Failed to process transcript: {str(transcript)[:50]}... | Error: {e}", flush=True)

if failed_count:
    print(f" {failed_count} transcript(s) failed and were not written to {OUTPUT_FILE}")
print(f"✅ Processing complete! New CSV file saved as: {OUTPUT_FILE}")


Processing Transcripts:  54%|█████████▋        | 87/162 [11:39<09:56,  7.95s/it]

⚠️ Empty topic detected, retrying classification...


Processing Transcripts:  59%|██████████▋       | 96/162 [13:10<10:14,  9.32s/it]

⚠️ Empty topic detected, retrying classification...


Processing Transcripts:  68%|███████████▌     | 110/162 [15:14<07:30,  8.67s/it]

⚠️ Empty topic detected, retrying classification...


Processing Transcripts:  75%|████████████▋    | 121/162 [16:38<04:59,  7.32s/it]

⚠️ Empty topic detected, retrying classification...


Processing Transcripts:  81%|█████████████▋   | 131/162 [17:49<03:54,  7.55s/it]

⚠️ Empty topic detected, retrying classification...


Processing Transcripts:  86%|██████████████▌  | 139/162 [18:40<02:55,  7.62s/it]

⚠️ Empty topic detected, retrying classification...


Processing Transcripts: 100%|█████████████████| 162/162 [21:01<00:00,  7.78s/it]

✅ Processing complete! New CSV file saved as: Processed_YouTube_Transcripts.csv



