# Imports

In [13]:
import sys
sys.path.insert(0, '..')
import os
import time
from dotenv import load_dotenv

import math
import pandas as pd
import numpy as np
import requests
from tqdm import tqdm
from pydantic import BaseModel
from pydantic import BaseModel, Field, ValidationError
from typing import Optional, List, Callable, Tuple
from langchain_openai import ChatOpenAI


from config import stances
from DB_connection import DB_connection
from llm_labeling import VideoClassification, BatchClassification, build_prompt, query_llm, get_langchain_llm


# Load environment variables (python-dotenv) before the API key is read below.
load_dotenv()


# OpenRouter API key taken from the environment; os.getenv returns None when it is unset.
OPEN_ROUTER_API_KEY = os.getenv("OPEN_ROUTER_API_KEY")
# Shared database handle used by the helper functions defined further down.
db_connection = DB_connection()

# 1. Functions

## 1.1. Create Tables

In [2]:
def create_compare_video_annotations_table():
    """Create and fill ``compare_video_annotations`` when it cannot be queried.

    The table is probed with a ``SELECT``. If that probe raises, the table is
    built from two sources and saved with ``db_connection.save_df``:

    * 380 rows sampled (``random_state=42``) from the ``annoted_videos`` table,
      with an empty ``true_label``;
    * every row of ``../data/annotated_videos.csv``, renamed to the same schema
      (``stance`` becomes ``true_label``).

    Returns:
        None. Nothing is done when the probe succeeds.
    """
    try:
        query = """
                SELECT * FROM compare_video_annotations
                """
        db_connection.select(query)
        return
    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit are
        # not mistaken for "table is missing".
        # NOTE(review): the exact exception DB_connection.select raises for a
        # missing table is not visible here - narrow further once confirmed.
        pass

    query = """
            SELECT video_youtube_id,title,description,channel_title,transcript FROM annoted_videos 
            """
    videos = db_connection.select(query)
    videos = videos.sample(n=380, random_state=42).reset_index(drop=True)
    videos['true_label'] = None
    videos = videos.rename(columns={"channel_title": "channel_name"})

    annotated_videos = pd.read_csv('../data/annotated_videos.csv')
    annotated_videos = annotated_videos.rename(columns={
        "video_id": "video_youtube_id",
        "stance": "true_label",
        "transcript_text": "transcript",
    })
    # errors="ignore": the column is dropped by the final reindex anyway, so a
    # CSV without it should not abort the build.
    annotated_videos = annotated_videos.drop(columns="stance_code", errors="ignore")

    # concat already aligns on the union of columns; the final reindex pins the
    # schema and its order (columns absent from both inputs become NaN).
    all_videos = pd.concat([videos, annotated_videos], ignore_index=True)
    column_order = ["video_youtube_id", "title", "description", "channel_name", "transcript", "true_label"]
    all_videos = all_videos.reindex(columns=column_order)

    db_connection.save_df(all_videos, 'compare_video_annotations')
 

def create_compare_llms_tables():
    """Create the ``llm_comparison`` results table when it cannot be queried.

    The table is probed with a ``SELECT``; if the probe raises, the table is
    created with one row per labelling run in mind (model names, batch size,
    accuracy/precision/recall, running time and cost columns).

    Returns:
        None.
    """
    try:
        query = """
                SELECT * FROM llm_comparison
                """
        db_connection.select(query)
    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # propagate instead of triggering a CREATE TABLE.
        # NOTE(review): the concrete exception type raised for a missing table
        # is not visible here - narrow further once confirmed.
        query = """ CREATE TABLE llm_comparison(
                llm_name TEXT,
                llm_full_name TEXT,
                batch_size INT,
                accuracy FLOAT,
                precision FLOAT,
                recall FLOAT,
                running_time FLOAT,
                cost FLOAT
                )"""
        db_connection.create_table(query)

    
# Make sure both tables exist; each helper only builds its table when the probing SELECT fails.
create_compare_video_annotations_table()
create_compare_llms_tables()

    


## 1.2. Open router costs

In [3]:
def get_usage_openrouter():
    """Return the ``total_usage`` value reported by OpenRouter's credits endpoint.

    The request is authenticated with ``OPEN_ROUTER_API_KEY`` and times out
    after 30 seconds.

    Returns:
        The ``data.total_usage`` field of the JSON response
        (presumably cumulative spend in account credits - TODO confirm unit).

    Raises:
        requests.HTTPError: when the API answers with a 4xx/5xx status.
    """
    resp = requests.get(
        "https://openrouter.ai/api/v1/credits",
        headers={"Authorization": f"Bearer {OPEN_ROUTER_API_KEY}"},
        timeout=30,
    )
    # Fail with the real HTTP error (bad key, rate limit, outage) instead of a
    # confusing KeyError / JSON decoding error on the lines below.
    resp.raise_for_status()
    return resp.json()["data"]["total_usage"]


## 1.3. LLM annotation

In [16]:
def ensure_unique_column(base_name: str) -> str:
    """Add a fresh TEXT column to ``compare_video_annotations`` and return its name.

    ``base_name`` is used as-is when no column with that name exists yet;
    otherwise the first free ``{base_name}_v1``, ``{base_name}_v2``, ... is taken.
    """
    # Names already present on the table, read from SQLite's table_info pragma.
    taken = set(db_connection.select("PRAGMA table_info(compare_video_annotations);")["name"])

    candidate = base_name
    version = 0
    while candidate in taken:
        version += 1
        candidate = f"{base_name}_v{version}"

    # The identifier is double-quoted inside the DDL statement.
    db_connection.execute_query(f'ALTER TABLE compare_video_annotations ADD COLUMN "{candidate}" TEXT')
    return candidate

def annotate_by_batch(llm_structured, column_name, batch_size=10):
    """Label every row of ``compare_video_annotations`` with an LLM, batch by batch.

    For each slice of ``batch_size`` rows a prompt is built and sent through
    ``query_llm``; each returned item's ``label`` is written to ``column_name``
    for the row whose ``video_youtube_id`` equals the item's ``video_id``.
    Updates are committed once per batch.

    Args:
        llm_structured: model object handed to ``query_llm``.
        column_name: existing column of ``compare_video_annotations`` to fill.
        batch_size: number of videos per prompt.
    """
    videos = db_connection.select("SELECT * FROM compare_video_annotations")

    # The identifier is double-quoted, matching how ensure_unique_column
    # creates it, so names that are not bare SQL identifiers still work.
    update_query = f"""
                UPDATE compare_video_annotations
                SET "{column_name}" = ?
                WHERE video_youtube_id = ?;
            """

    # Start at row 0: starting the range at ``batch_size`` left the first
    # batch of videos without any label.
    for start in tqdm(range(0, len(videos), batch_size)):
        batch = videos.iloc[start:start + batch_size]
        try:
            prompt = build_prompt(batch, True)
            response = query_llm(prompt, llm_structured)
        except Exception:
            # Retry once with the alternate prompt variant; a second failure
            # propagates. NOTE(review): the meaning of build_prompt's second
            # argument is not visible here - confirm.
            prompt = build_prompt(batch, False)
            response = query_llm(prompt, llm_structured)

        cursor = db_connection.connection.cursor()
        cursor.executemany(
            update_query,
            [(item.label, item.video_id) for item in response.items],
        )
        db_connection.connection.commit()


def get_accuracy(column_name):
    """Return the share of hand-labelled videos whose prediction matches ``true_label``.

    Only rows with a non-NULL ``true_label`` whose prediction is not the
    string ``'Undefined'`` are scored. Rows whose prediction is NULL pass that
    filter (SQLite ``IS NOT`` treats NULL as distinct from ``'Undefined'``)
    and therefore count as incorrect.

    Args:
        column_name: prediction column of ``compare_video_annotations``.

    Returns:
        Accuracy in [0, 1], or NaN when no row qualifies.
    """
    # Double-quoted identifier, matching how ensure_unique_column creates it.
    query = f""" SELECT * 
                 FROM compare_video_annotations 
                 WHERE true_label IS NOT NULL 
                 AND "{column_name}" IS NOT 'Undefined';"""
    df = db_connection.select(query)

    total = len(df)
    if total == 0:
        # Nothing to score: return NaN explicitly rather than through 0/0.
        return float("nan")

    correct = (df[column_name] == df["true_label"]).sum()
    return correct / total

    

def label_with_one_llm(full_model_name, column_name, batch_size=10):
    """Run one labelling pass with an OpenRouter model and log its metrics.

    A fresh column named ``{column_name}_batch_size_{batch_size}`` (made unique
    by ``ensure_unique_column``) receives the labels. Wall-clock time and the
    difference in OpenRouter usage across the run are recorded together with
    the accuracy in ``llm_comparison``; the stored ``llm_name`` is the column
    name that was actually used.
    """
    target_column = ensure_unique_column(column_name + f"_batch_size_{batch_size}")

    usage_before = get_usage_openrouter()

    started_at = time.time()
    structured_llm = get_langchain_llm("openrouter", full_model_name).with_structured_output(BatchClassification)
    annotate_by_batch(structured_llm, target_column, batch_size)
    running_time = time.time() - started_at

    # Usage difference across the run is what gets stored as its cost.
    cost = get_usage_openrouter() - usage_before

    accuracy = get_accuracy(target_column)
    print(f"Accuracy for {full_model_name}: {accuracy:.2f}")

    insert_sql = """
            INSERT INTO llm_comparison (llm_name, llm_full_name, batch_size, accuracy, running_time, cost)
            VALUES (?, ?, ?, ?, ?, ?)
            """
    row = (target_column, full_model_name, batch_size, accuracy, running_time, cost)
    db_connection.connection.cursor().execute(insert_sql, row)
    db_connection.connection.commit()



#label_with_one_llm("meta-llama/llama-4-maverick:free","llama_maverick",10)

   


# 2. Analysis 

## 2.1. Compute LLM consistency

### 2.1.1. Pairwise Cohen's Kappa 

In [14]:
import itertools
import json
import pandas as pd
from sklearn.metrics import cohen_kappa_score
from sklearn.utils.multiclass import type_of_target

print("importation done")

# Prediction columns to compare against each other. The names follow the
# "<base>", "<base>_v1", ... scheme produced by ensure_unique_column
# (presumably five repeated runs of the same model and batch size - confirm).
annotators = [
    "mistralai_small_batch_size_10",
    "mistralai_small_batch_size_10_v1",
    "mistralai_small_batch_size_10_v2",
    "mistralai_small_batch_size_10_v3",
    "mistralai_small_batch_size_10_v4",
]

# Load the whole annotation table once; the agreement computations below read from it.
query = "SELECT * FROM compare_video_annotations"
annotations_df = db_connection.select(query)

def _parse_cell(v):
    """Return a single, stable categorical label as a string."""
    if pd.isna(v):
        return pd.NA

    # Already list/tuple/set -> stringify deterministically
    if isinstance(v, (list, tuple, set)):
        # If it’s a single-element list like ['A'], collapse to 'A'
        if len(v) == 1:
            return str(next(iter(v)))
        # Otherwise treat as a (multi)label string, still valid for kappa as a class
        return str(tuple(v))

    s = str(v).strip()
    if s == "":
        return pd.NA

    # Try to parse JSON-like strings
    if (s.startswith("[") and s.endswith("]")) or (s.startswith("{") and s.endswith("}")):
        try:
            parsed = json.loads(s)
            if isinstance(parsed, (list, tuple, set)):
                if len(parsed) == 1:
                    return str(parsed[0])
                return str(tuple(parsed))
            # Dicts -> stable string
            return str(parsed)
        except Exception:
            # fall back to raw string
            pass

    return s

def safe_kappa_pair(df, a, b, weights=None):
    """Cohen's kappa between columns ``a`` and ``b`` of ``df`` after cleaning.

    Blank strings are treated as missing, each cell is normalised with
    ``_parse_cell``, rows where either rater is missing are dropped and both
    columns are cast to ``str``.

    Returns:
        ``(kappa, cleaned)`` - the score and the two-column frame it was computed on.

    Raises:
        ValueError: when either cleaned column is not a binary/multiclass target.
    """
    raters = [a, b]

    # Blank -> missing, then collapse every cell to a single label.
    cleaned = df[raters].replace({"": pd.NA})
    for rater in raters:
        cleaned[rater] = cleaned[rater].map(_parse_cell)

    # Keep only the items both raters labelled, as plain strings.
    cleaned = cleaned.dropna(subset=raters)
    for rater in raters:
        cleaned[rater] = cleaned[rater].astype(str)

    # Sanity check the targets before scoring.
    ta = type_of_target(cleaned[a])
    tb = type_of_target(cleaned[b])
    accepted = {"binary", "multiclass"}
    if not (ta in accepted and tb in accepted):
        raise ValueError(
            f"type_of_target -> {a}: {ta}, {b}: {tb}. "
            "Expect 'binary' or 'multiclass' after cleaning."
        )

    score = cohen_kappa_score(cleaned[a], cleaned[b], weights=weights)
    return score, cleaned

# If your labels are ordinal (e.g., 1–5 Likert), pick weights='linear' or 'quadratic'
WEIGHTS = None  # change to 'quadratic' for ordinal scales

pairs = itertools.combinations(annotators, 2)

# One record per annotator pair that could be scored; failures are reported
# and skipped so a single bad column does not abort the comparison.
results = []
for a, b in pairs:
    try:
        kappa, sub = safe_kappa_pair(annotations_df, a, b, weights=WEIGHTS)
        n = len(sub)
        results.append({"pair": (a, b), "kappa": kappa, "n": n})
        print(f"{a} vs {b}: κ = {kappa:.3f} (n={n})")
    except Exception as e:
        print(f"{a} vs {b}: FAILED -> {e}")

# Convert to DataFrame for convenience
kappa_df = pd.DataFrame(results)

if kappa_df.empty or kappa_df["n"].sum() == 0:
    # No pair could be scored: an empty frame has no "kappa" column (KeyError)
    # and np.average cannot normalise weights that sum to zero.
    mean_kappa = float("nan")
    weighted_mean_kappa = float("nan")
else:
    # Simple unweighted mean
    mean_kappa = kappa_df["kappa"].mean()

    # Weighted mean (by number of overlapping items)
    weighted_mean_kappa = np.average(kappa_df["kappa"], weights=kappa_df["n"])

print("\n--- Summary ---")
print(f"Unweighted average κ: {mean_kappa:.3f}")
print(f"Weighted average κ (by n): {weighted_mean_kappa:.3f}")


importation done
mistralai_small_batch_size_10 vs mistralai_small_batch_size_10_v1: κ = 0.887 (n=486)
mistralai_small_batch_size_10 vs mistralai_small_batch_size_10_v2: κ = 0.899 (n=486)
mistralai_small_batch_size_10 vs mistralai_small_batch_size_10_v3: κ = 0.875 (n=487)
mistralai_small_batch_size_10 vs mistralai_small_batch_size_10_v4: κ = 0.840 (n=486)
mistralai_small_batch_size_10_v1 vs mistralai_small_batch_size_10_v2: κ = 0.884 (n=485)
mistralai_small_batch_size_10_v1 vs mistralai_small_batch_size_10_v3: κ = 0.863 (n=487)
mistralai_small_batch_size_10_v1 vs mistralai_small_batch_size_10_v4: κ = 0.879 (n=486)
mistralai_small_batch_size_10_v2 vs mistralai_small_batch_size_10_v3: κ = 0.853 (n=486)
mistralai_small_batch_size_10_v2 vs mistralai_small_batch_size_10_v4: κ = 0.877 (n=485)
mistralai_small_batch_size_10_v3 vs mistralai_small_batch_size_10_v4: κ = 0.856 (n=488)

--- Summary ---
Unweighted average κ: 0.871
Weighted average κ (by n): 0.871


### 2.1.2 Fleiss' kappa

In [15]:


# === 1) Helpers to normalize cells (same idea we used for Cohen) ===
def _parse_cell(v):
    if pd.isna(v):
        return pd.NA
    if isinstance(v, (list, tuple, set)):
        if len(v) == 1:
            return str(next(iter(v)))
        return str(tuple(v))
    s = str(v).strip()
    if s == "":
        return pd.NA
    if (s.startswith("[") and s.endswith("]")) or (s.startswith("{") and s.endswith("}")):
        try:
            parsed = json.loads(s)
            if isinstance(parsed, (list, tuple, set)):
                if len(parsed) == 1:
                    return str(parsed[0])
                return str(tuple(parsed))
            return str(parsed)
        except Exception:
            pass
    return s

def _clean_panel(df, cols):
    """Return the ``cols`` of ``df`` normalised for Fleiss' kappa.

    Blank strings become missing, every cell goes through ``_parse_cell`` and
    is stored with the pandas ``"string"`` dtype. Items that any rater left
    unlabelled are removed so each remaining row carries the same number of
    ratings.
    """
    panel = df[cols].replace({"": pd.NA})
    for rater in cols:
        panel[rater] = panel[rater].map(_parse_cell).astype("string")
    # Fleiss requires complete rows: keep only items all raters labelled.
    return panel.dropna(subset=cols)

# === 2) Build N x K matrix of category counts (each row sums to n_raters) ===
def _category_matrix(df, cols):
    # All categories across annotators
    cats = pd.Index(
        sorted(pd.unique(pd.concat([df[c] for c in cols]).dropna()))
    )
    # Count per item
    row_list = []
    for _, row in df[cols].iterrows():
        counts = pd.value_counts(row.values, sort=False)
        counts = counts.reindex(cats, fill_value=0)
        row_list.append(counts.values)
    M = np.vstack(row_list)  # shape: (N_items, K_categories)
    return M, cats

# === 3) Fleiss' kappa ===
def fleiss_kappa_from_df(df, annotator_cols):
    """Compute Fleiss' kappa for the rater columns ``annotator_cols`` of ``df``.

    Rows are cleaned with ``_clean_panel`` (items with any missing rating are
    dropped) and turned into an items x categories count matrix with
    ``_category_matrix``.

    Returns:
        dict with ``kappa``, mean observed agreement ``P_bar``, chance
        agreement ``P_e``, ``n_items``, ``n_raters``, ``n_categories`` and the
        list of ``categories``. ``kappa`` is NaN when ``P_e`` equals 1.

    Raises:
        ValueError: when no complete row remains, when rows carry different
            numbers of ratings, or when there are fewer than two ratings per item.
    """
    sub = _clean_panel(df, annotator_cols)
    if sub.empty:
        raise ValueError("No rows left after dropping missing labels across annotators.")
    M, cats = _category_matrix(sub, annotator_cols)

    N, K = M.shape        # N items, K categories
    ratings_per_item = M.sum(axis=1)
    if not np.all(ratings_per_item == ratings_per_item[0]):
        raise ValueError("Fleiss’ kappa requires the same number of ratings per item.")
    n = int(ratings_per_item[0])
    if n < 2:
        # n * (n - 1) would be zero below and agreement would silently become
        # 0/0; a single rater has nobody to agree with.
        raise ValueError("Fleiss’ kappa requires at least two ratings per item.")

    # Per-item agreement: share of rater pairs that agree on the item.
    P_i = (M * (M - 1)).sum(axis=1) / (n * (n - 1))
    P_bar = P_i.mean()

    # Category proportions across all ratings give the chance agreement.
    p_j = M.sum(axis=0) / (N * n)
    P_e = (p_j ** 2).sum()

    kappa = (P_bar - P_e) / (1 - P_e) if 1 - P_e != 0 else np.nan

    return {
        "kappa": float(kappa),
        "P_bar": float(P_bar),
        "P_e": float(P_e),
        "n_items": int(N),
        "n_raters": int(n),
        "n_categories": int(K),
        "categories": list(cats.astype(str)),
    }

# === 4) Use with your data ===
# Rater columns of annotations_df that are scored jointly.
annotators = [
    "mistralai_small_batch_size_10",
    "mistralai_small_batch_size_10_v1",
    "mistralai_small_batch_size_10_v2",
    "mistralai_small_batch_size_10_v3",
    "mistralai_small_batch_size_10_v4",
]

# Only items labelled by every listed rater contribute (see _clean_panel).
result = fleiss_kappa_from_df(annotations_df, annotators)
# One-line report: kappa, observed vs. chance agreement, and the panel dimensions.
print(
    f"Fleiss' κ = {result['kappa']:.3f} | "
    f"P̄={result['P_bar']:.3f}, P_e={result['P_e']:.3f} | "
    f"N={result['n_items']}, raters={result['n_raters']}, K={result['n_categories']}"
)


Fleiss' κ = 0.872 | P̄=0.907, P_e=0.271 | N=482, raters=5, K=5


  counts = pd.value_counts(row.values, sort=False)
  counts = pd.value_counts(row.values, sort=False)
  counts = pd.value_counts(row.values, sort=False)
  counts = pd.value_counts(row.values, sort=False)
  counts = pd.value_counts(row.values, sort=False)
  counts = pd.value_counts(row.values, sort=False)
  counts = pd.value_counts(row.values, sort=False)
  counts = pd.value_counts(row.values, sort=False)
  counts = pd.value_counts(row.values, sort=False)
  counts = pd.value_counts(row.values, sort=False)
  counts = pd.value_counts(row.values, sort=False)
  counts = pd.value_counts(row.values, sort=False)
  counts = pd.value_counts(row.values, sort=False)
  counts = pd.value_counts(row.values, sort=False)
  counts = pd.value_counts(row.values, sort=False)
  counts = pd.value_counts(row.values, sort=False)
  counts = pd.value_counts(row.values, sort=False)
  counts = pd.value_counts(row.values, sort=False)
  counts = pd.value_counts(row.values, sort=False)
  counts = pd.value_counts(row.