## Thesis Relevant Comments Step 2

After the first filtering step, all comments marked as **“not relevant”** were rechecked to ensure no potentially useful comments were missed.

This time, each comment was reclassified into one of three categories:

- **YES** – Clearly relevant to sportswashing  
- **MAYBE** – Ambiguous but possibly relevant  
- **DEFINITELY NOT** – Still not relevant  

This extra step helps catch borderline or nuanced comments that might still offer value for understanding how sportswashing narratives are discussed, especially when initial filters were too strict or missed context.

Below, each batch produced by the initial stage is reprocessed, and the resulting Yes and No counts are reported.

### Importing Libraries

In [None]:
import json
import pandas as pd
import openai
import os
import time
from tqdm import tqdm
import math
import torch
import re
import numpy as np
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
from transformers import AutoTokenizer

## Creating JSONL files for the "No" responses and resubmitting

In [None]:
# Setting OpenAI API Key
# Prefer the OPENAI_API_KEY environment variable so a real key never has to be
# written into the notebook; the redacted placeholder is only a fallback.
openai.api_key = os.environ.get("OPENAI_API_KEY", "**************")
client = openai.OpenAI(api_key=openai.api_key)

# Loading in the CSV File
input_csv = "YouTube_Comments_No.csv"
df = pd.read_csv(input_csv)

# Ensuring the columns used to build the batch requests exist:
# "Rewritten Comment" is the text that gets classified and "Comment_ID"
# becomes the custom_id of each request.
for required_column in ("Rewritten Comment", "Comment_ID"):
    if required_column not in df.columns:
        raise KeyError(f"Column '{required_column}' not found in CSV.")

# Splitting into 2 parts; with an odd row count the first part gets the extra row
split_idx = math.ceil(len(df) / 2)
df_batches = [df.iloc[:split_idx], df.iloc[split_idx:]]

# Output JSONL file names (one per part, in the same order as df_batches)
batch_files = ["batch_check_no_comments_1.jsonl", "batch_check_no_comments_2.jsonl"]


### Define sportswashing prompt & create JSONL batch files (YES/MAYBE/DEFINITELY NOT)

In [15]:
# Defining the prompt function
def generate_prompt(comment):
    """Build the classification prompt sent to the model for one comment.

    The prompt contains the sportswashing definition, the three-way
    classification rules (YES / MAYBE / DEFINITELY NOT), the comment wrapped
    in double quotes, and the response instructions.

    Args:
        comment: The rewritten comment text to classify. It is interpolated
            with normal f-string formatting.

    Returns:
        The full prompt as a single string.
    """
    # Everything that precedes the quoted comment.
    definition_and_rules = """
**Definition of Sportswashing:**  
- When sports are used to improve a country’s reputation while hiding **human rights abuses, corruption, or political issues**.
- Example: **Gulf states** (Saudi Arabia, Qatar, UAE) investing in sports, hosting events (FIFA, F1), or owning clubs (Man City, PSG, Newcastle).  

**Classification Rules:**  
- `YES`: Mentions sportswashing, Gulf investments, corruption, political influence, financial takeovers, or criticism/support of Gulf involvement.  
- `MAYBE`: Unclear connection but mentions Middle Eastern entities, Gulf countries, or potential geopolitical influence.  
- `DEFINITELY NOT`: Clearly about **match performance, goals, players, or unrelated topics** (e.g., "That was a great goal!", "This team played well").  

**YouTube Comment (Rewritten for Clarity):**  
"""
    # Everything that follows the quoted comment.
    closing_instructions = """

**Instructions:**  
- Respond **ONLY** with `YES`, `MAYBE`, or `DEFINITELY NOT`, nothing else.  
- If uncertain, **lean toward `YES`** if the comment references any political, financial, or ethical aspect of sports.  
"""
    return f'{definition_and_rules}"{comment}"  {closing_instructions}'

# Creating JSONL batch files in OpenAI's batch format
# Each line of a batch file is one self-contained chat-completion request whose
# custom_id is the comment's Comment_ID.

# The system message is identical for every request, so it is built once.
system_message = {
    "role": "system",
    "content": "You are verifying if YouTube comments are actually unrelated to sportswashing and Gulf influence in sports.",
}

for batch_df, batch_file in zip(df_batches, batch_files):
    with open(batch_file, "w", encoding="utf-8") as f:
        for _, row in tqdm(batch_df.iterrows(), total=len(batch_df), desc=f"Writing {batch_file}", unit="comment"):
            task = {
                "custom_id": str(row["Comment_ID"]),
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": "gpt-4o-mini",
                    "messages": [
                        system_message,
                        {"role": "user", "content": generate_prompt(row["Rewritten Comment"])},
                    ],
                    "temperature": 0,
                    # The prompt asks for YES / MAYBE / DEFINITELY NOT only, so the reply is capped short
                    "max_tokens": 5,
                },
            }
            f.write(json.dumps(task) + "\n")
    print(f"Batch JSONL file '{batch_file}' created successfully!")


Batch JSONL file 'batch_check_no_comments_1.jsonl' created successfully!
Batch JSONL file 'batch_check_no_comments_2.jsonl' created successfully!


### Submit first batch now, wait 5 hours, then submit second batch (OpenAI Batch API)

In [6]:
def submit_batch_file(batch_path, label):
    """Upload one JSONL request file and create a batch job for it.

    Errors are reported with a printed message rather than raised, so a
    failure on one batch does not stop the rest of the cell.

    Args:
        batch_path: Path of the JSONL file to upload.
        label: Word used in the progress messages, e.g. "first" or "second".

    Returns:
        The object returned by ``client.batches.create``, or ``None`` when the
        file does not exist or the upload/submission failed.
    """
    if not os.path.exists(batch_path):
        # Previously a missing file was skipped without any message.
        print(f"Batch file not found, skipping {label} batch: {batch_path}")
        return None

    try:
        print(f"\nSubmitting {label} batch file: {batch_path}")
        with open(batch_path, "rb") as f:
            uploaded_file = client.files.create(file=f, purpose="batch")
        job = client.batches.create(
            input_file_id=uploaded_file.id,
            endpoint="/v1/chat/completions",
            completion_window="24h"
        )
        print(f"{label.capitalize()} batch submitted successfully! Job ID: {job.id}")
        return job
    except openai.OpenAIError as e:
        print(f"OpenAI API error submitting {label} batch: {e}")
    except Exception as e:
        print(f"General error submitting {label} batch: {e}")
    return None


# Submitting the first batch immediately
first_batch_file = batch_files[0]
first_batch_job = submit_batch_file(first_batch_file, "first")

# Waiting 5 hours before submitting the second batch
time.sleep(5 * 60 * 60)  # 5 hours in seconds

# Submitting the second batch; its job is kept in a separate variable so the
# first job's handle is not overwritten
second_batch_file = batch_files[1]
second_batch_job = submit_batch_file(second_batch_file, "second")


Writing batch_check_no_comments_1.jsonl: 100%|█| 42130/42130 [00:05<00:00, 72]
Batch JSONL file 'batch_check_no_comments_1.jsonl' created successfully!
Writing batch_check_no_comments_2.jsonl: 100%|█| 42129/42129 [00:04<00:00, 94]
Batch JSONL file 'batch_check_no_comments_2.jsonl' created successfully!

Submitting first batch file: batch_check_no_comments_1.jsonl
First batch submitted successfully! Job ID: batch_67d754de87e88190bbb088b9e4bbc0d8

Submitting second batch file: batch_check_no_comments_2.jsonl
Second batch submitted successfully! Job ID: batch_67d79b46760c8190831ff32f9d5baf16


### Count of Yes/No comments from Attempt 2

In [62]:
# Load the responses file and tally the rows answered 'YES'
df = pd.read_csv("responses_yes.csv")

# Exact, case-sensitive match on the string 'YES'
is_yes = df["response"].eq("YES")
yes_count = is_yes.sum()

print(f"Number of 'YES' responses: {yes_count}")

Number of 'YES' responses: 9051


### Split 'Maybe' and 'Definitely Not' Comments for Further Filtering


In [11]:
# Read the combined MAYBE / DEFINITELY NOT results
input_csv = "final_maybe_definitely_not.csv"
df = pd.read_csv(input_csv)

# Lower-case the response once, then select each category case-insensitively
response_lower = df["response"].str.lower()
df_maybe = df.loc[response_lower == "maybe"].copy()
df_definitely_not = df.loc[response_lower == "definitely not"].copy()

# Write each category to its own CSV (row index not included)
for subset, out_path in (
    (df_maybe, "comments_maybe.csv"),
    (df_definitely_not, "comments_definitely_not.csv"),
):
    subset.to_csv(out_path, index=False)

print("CSV files created: comments_maybe.csv & comments_definitely_not.csv")

CSV files created: comments_maybe.csv & comments_definitely_not.csv


### Compare 'Maybe' Comments to Transcripts Using SBERT

This step checks whether “maybe” comments are actually relevant by comparing them to their associated video transcripts.

- Loads the `comments_maybe.csv` and transcript file.
- Encodes both the comment and transcript sentences using **SBERT**.
- Finds the **top 5 most similar transcript sentences** for each comment based on cosine similarity.
- Sends this along with the comment to **GPT-4o**, asking whether the comment is relevant based on transcript context and predefined criteria.
- Outputs a `batch_maybe_transcript_comparison.jsonl` file, ready to be submitted to the OpenAI batch endpoint.


### Load comments & transcripts, initialize SBERT, and validate schema

In [21]:
# Loading Data
input_csv = "comments_maybe.csv"
df = pd.read_csv(input_csv)
print(f"Loaded comments file: {input_csv} ({len(df)} rows)")

transcript_csv = "Processed_YouTube_Transcripts.csv"
df_transcripts = pd.read_csv(transcript_csv)
print(f"Loaded transcripts file: {transcript_csv} ({len(df_transcripts)} rows)")

# Ensuring the Required Columns Exist
# Validation runs before the models are loaded so a schema problem fails fast,
# and the error names the columns that are actually missing.
required_columns = ["custom_id", "Rewritten Comment", "Video_ID"]
missing_comment_columns = [col for col in required_columns if col not in df.columns]
if missing_comment_columns:
    raise KeyError(f" Missing required columns in comments CSV: {missing_comment_columns}")

required_transcript_columns = ["Video_ID", "Transcript"]
missing_transcript_columns = [col for col in required_transcript_columns if col not in df_transcripts.columns]
if missing_transcript_columns:
    raise KeyError(f" Required columns {missing_transcript_columns} missing from transcript CSV.")

print("Column validation completed successfully.")

# Loading the SBERT Model
print("Loading SBERT model...")
model = SentenceTransformer('all-MiniLM-L6-v2')
# NOTE(review): this tokenizer is not used in the cells visible here - confirm it is needed
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
print("SBERT model loaded successfully.")


Loaded comments file: comments_maybe.csv (21031 rows)
Loaded transcripts file: Processed_YouTube_Transcripts.csv (162 rows)
Loading SBERT model...
SBERT model loaded successfully.
Column validation completed successfully.


### Compute comment & transcript embeddings, then define top-5 transcript similarity helper

In [23]:
# Precompute SBERT Embeddings for All Comments
print("Encoding all comments in batch...")
# Empty CSV cells are read by pandas as NaN (a float); encode them as empty
# strings so the encoder only ever receives text.
comment_texts = df["Rewritten Comment"].fillna("").astype(str).tolist()
comment_embeddings = model.encode(comment_texts, convert_to_tensor=True)
df["comment_embedding"] = list(comment_embeddings)  # Storing one embedding per row in the DataFrame
print("Comment embeddings computed successfully.")

# Precompute Transcript Embeddings
# Maps Video_ID -> (list of transcript sentences, embeddings of those sentences).
transcript_embeddings = {}
# Split after ., ! or ? followed by whitespace; compiled once for all transcripts.
sentence_boundary = re.compile(r'(?<=[.!?])\s+')
print("Precomputing transcript embeddings...")
for _, row in tqdm(df_transcripts.iterrows(), total=len(df_transcripts), desc="Processing Transcripts"):
    video_id = row["Video_ID"]
    transcript_text = row["Transcript"]

    # A missing transcript is NaN, which cannot be split.
    if not isinstance(transcript_text, str):
        continue

    # re.split always returns at least one element (e.g. [""] for an empty
    # string), so blank pieces are dropped explicitly.
    sentences = [s for s in sentence_boundary.split(transcript_text) if s.strip()]
    if not sentences:
        # Videos without usable sentences get no entry; get_top_similar_sentences
        # reports videos that are absent from this dict as "No transcript found".
        continue

    transcript_embeddings[video_id] = (sentences, model.encode(sentences, convert_to_tensor=True))
print("Transcript embeddings computed successfully.")

# Function to Get Top 5 Similar Sentences
def get_top_similar_sentences(comment_embedding, video_id, top_k=5, embeddings=None):
    """Return the transcript sentences most similar to a comment.

    Args:
        comment_embedding: Embedding of the comment, passed as the first
            argument to ``util.pytorch_cos_sim``.
        video_id: Key used to look up the video's transcript.
        top_k: Number of sentences to return (default 5). The result always has
            exactly this many entries; shorter transcripts are padded with "N/A".
        embeddings: Optional mapping of video id to
            ``(sentences, sentence_embeddings)``. Defaults to the module-level
            ``transcript_embeddings``.

    Returns:
        A tuple ``(top_sentences, max_similarity)``: ``top_sentences`` is a list
        of ``top_k`` strings ordered from most to least similar, and
        ``max_similarity`` is the highest cosine similarity as a float. When the
        video has no transcript entry (or no sentences), the list is filled with
        "No transcript found" and the similarity is 0.0.
    """
    if embeddings is None:
        embeddings = transcript_embeddings

    if video_id not in embeddings:
        return ["No transcript found"] * top_k, 0.0

    transcript_sentences, transcript_embedding = embeddings[video_id]
    if not transcript_sentences:
        # torch.max / torch.topk would fail on an empty similarity vector.
        return ["No transcript found"] * top_k, 0.0

    # Row 0 holds the similarity of the single comment to every sentence.
    similarities = util.pytorch_cos_sim(comment_embedding, transcript_embedding)[0]
    max_similarity = torch.max(similarities).item()

    # topk cannot request more items than exist.
    k = min(top_k, len(transcript_sentences))
    top_indices = torch.topk(similarities, k=k).indices.tolist()
    top_sentences = [transcript_sentences[i] for i in top_indices]

    # Pad so callers always receive exactly top_k entries.
    top_sentences.extend(["N/A"] * (top_k - len(top_sentences)))

    return top_sentences, max_similarity


Encoding all comments in batch...
Precomputing transcript embeddings...
Processing Transcripts: 100%|█████████| 162/162 [00:47<00:00, 3.38it/s]


### Generate JSONL batch for comment–transcript relevance classification

In [25]:
# Build the batch file that asks the model to judge each "maybe" comment
output_jsonl = "batch_maybe_transcript_comparison.jsonl"
print("Processing comments...")

# Parts of the request that are the same for every comment
system_message = {
    "role": "system",
    "content": "You are analyzing YouTube comments by comparing them to their video transcripts. Your task is to determine whether the transcript provides meaningful context for the comment.",
}
task_instructions = """

### Task:
Determine whether this comment is relevant to discussions about sportswashing, human rights, financial ethics, corruption, or geopolitical motives.
Additionally, if the comment makes a **positive or negative statement, opinion, or fact** about the **Middle East**, then it is considered relevant.

### Instructions:
- Respond **"YES"** if the comment relates to **any** of the above topics or expresses a **positive/negative statement about the Middle East**.
- Respond **"NO"** if the comment is **only about match performance, players, goals, or unrelated topics**.

### **Final Response Format:**  
Respond **ONLY** with "YES" or "NO", nothing else.
"""

with open(output_jsonl, "w", encoding="utf-8") as f:
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing comments"):
        comment_id = str(row["custom_id"])
        rewritten_comment = row["Rewritten Comment"]

        # Only the sentences are needed here; the similarity score is discarded
        top_sentences, _ = get_top_similar_sentences(row["comment_embedding"], row["Video_ID"])
        sentence_bullets = "\n".join(f"- {sentence}" for sentence in top_sentences)

        comment_header = f"""
**Comment:** "{rewritten_comment}"

**Top 5 Transcript Sentences:**
"""
        user_message = {
            "role": "user",
            "content": comment_header + sentence_bullets + task_instructions,
        }

        # One chat-completion request per line, keyed by the comment's custom_id
        task = {
            "custom_id": comment_id,
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": "gpt-4o",
                "messages": [system_message, user_message],
                "temperature": 0.0,
                "max_tokens": 5,
            },
        }
        f.write(json.dumps(task) + "\n")

print(f"JSONL file created: {output_jsonl}")


Processing comments...
Processing comments: 100%|█████████| 21031/21031 [00:10<00:00, 2075.06it/s]
JSONL file created: batch_maybe_transcript_comparison.json


### Upload JSONL batch to OpenAI and submit job

In [27]:
# Prefer the OPENAI_API_KEY environment variable so a real key never has to be
# written into the notebook; the redacted placeholder is only a fallback.
openai.api_key = os.environ.get("OPENAI_API_KEY", "******************")
client = openai.OpenAI(api_key=openai.api_key)

# Path to Batch JSONL File
batch_file = "batch_maybe_transcript_comparison.jsonl"

# Uploading & Submitting the Batch
if not os.path.exists(batch_file):
    raise FileNotFoundError(f" {batch_file} not found. Check the file path.")

try:
    print(f"\n Uploading batch file: {batch_file}")

    # Uploading the JSONL batch file
    # The upload is a single call, so the bar jumps from 0 to 100% once it returns.
    file_size = os.path.getsize(batch_file)
    with tqdm(total=file_size, unit="B", unit_scale=True, desc=f" Uploading {batch_file}") as pbar:
        with open(batch_file, "rb") as f:
            batch_file_upload = client.files.create(file=f, purpose="batch")
            pbar.update(file_size)  # Update progress bar when upload completes

    print(f" {batch_file} uploaded successfully. File ID: {batch_file_upload.id}")

    # Submitting Batch Job to OpenAI using the uploaded file's ID
    batch_job = client.batches.create(
        input_file_id=batch_file_upload.id,
        endpoint="/v1/chat/completions",
        completion_window="24h"
    )

    print(f" Batch submitted successfully! Job ID: {batch_job.id}")

except Exception as e:
    # Best-effort cell: report the failure instead of raising
    print(f" Error submitting batch: {e}")


Uploading batch file: batch_maybe_transcript_comparison.jsonl
Uploading batch_maybe_transcript_comparison.jsonl: 100%|██████████| 114M/114M [0:00:00]
batch_maybe_transcript_comparison.jsonl uploaded successfully. File ID: file-8VLhAMWoB3M6LDL9EvUY79
Batch submitted successfully! Job ID: batch_67d9fa2dacd881909410aeb7f8ddf162


### Download batch output files from OpenAI and save as JSONL

In [19]:
# Prefer the OPENAI_API_KEY environment variable so a real key never has to be
# written into the notebook; the redacted placeholder is only a fallback.
# The module-level openai.files call below relies on openai.api_key being set here.
openai.api_key = os.environ.get("OPENAI_API_KEY", "******************")

# List of output file IDs from completed batches
output_file_ids = [
    "file-FANQa3jqFsxWTQPsQYiHXN",
]

# Download each output file and store it as <file_id>.jsonl
for file_id in output_file_ids:
    file_response = openai.files.content(file_id)

    # Saving the file locally in binary mode so the bytes are written unchanged
    output_filename = f"{file_id}.jsonl"
    with open(output_filename, "wb") as f:
        for chunk in file_response.iter_bytes():
            f.write(chunk)

    print(f" File downloaded: {output_filename}")

File downloaded: file-FANQa3jqFsxWTQPsQYiHXN.jsonl


### Count of Yes/No Responses

In [45]:
def extract_response_text(record):
    """Return the model's reply from one batch output record, or "" if there is none.

    Each level is read with ``or`` fallbacks because ``dict.get(key, default)``
    only applies the default when the key is absent, not when its value is
    ``None`` (presumably the case for failed requests - confirm against the
    Batch API output format).

    Args:
        record: One parsed line of a batch output JSONL file.

    Returns:
        The stripped text found at response.body.choices[0].message.content,
        or an empty string when any part of that path is missing or null.
    """
    response = record.get("response") or {}
    body = response.get("body") or {}
    choices = body.get("choices") or [{}]
    message = choices[0].get("message") or {}
    content = message.get("content") or ""
    return content.strip()


# List of JSONL files
input_files = [
    "file-FANQa3jqFsxWTQPsQYiHXN.jsonl"
]

# Reading and combining all files into one DataFrame
data = []
for file_name in input_files:
    with open(file_name, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file)
            if not line.strip():
                continue
            obj = json.loads(line)
            data.append({
                "custom_id": obj.get("custom_id", ""),
                "response": extract_response_text(obj),
            })

# Explicit columns keep the lookups below valid even when no rows were read
df = pd.DataFrame(data, columns=["custom_id", "response"])

# Count YES vs NO, plus replies that came back empty
print(f"Number of 'YES' responses: {(df['response'] == 'YES').sum()}")
print(f"Number of 'NO' responses: {(df['response'] == 'NO').sum()}")
print(f"Number of blank responses: {(df['response'] == '').sum()}")

Number of 'YES' responses: 17824
Number of blank responses: 3207
