Referenced papers:
- https://arxiv.org/pdf/2203.11171 (original self consistency paper)
- https://arxiv.org/pdf/2311.17311 (universal self consistency)

General Idea:
- Sample several responses from the decoder instead of relying on a single greedy decode
- Compare the sampled responses to find the one most consistent with the rest
- Return that most consistent response

Authors of the original paper use PaLM-540B

In [None]:
!pip install datasets
!pip install transformers
!pip install --upgrade transformers
!pip install einops
!pip install evaluate bert_score rouge-score
!pip install -U datasets huggingface_hub fsspec
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install scikit-learn
!pip install together

In [None]:
# Set the Together API key inside the notebook's own Python process.
# A `%%bash` cell runs in a separate shell subprocess, so a variable exported
# there is gone when the cell finishes and never reaches the kernel that later
# calls Together().
import getpass
import os

if not os.environ.get("TOGETHER_API_KEY"):
    # The key is shared out of band (check Slack); prompt for it rather than
    # hard-coding a secret in the notebook.
    os.environ["TOGETHER_API_KEY"] = getpass.getpass("TOGETHER_API_KEY: ")

In [None]:
from datasets import load_dataset
import json

In [None]:
# Load the train and test splits of "abisee/cnn_dailymail" (config "1.0.0").
train_dataset = load_dataset("abisee/cnn_dailymail", "1.0.0", split="train")
test_dataset = load_dataset("abisee/cnn_dailymail", "1.0.0", split="test")

In [None]:
from torch.utils.data import DataLoader

# Wrap each split in a DataLoader (one example per batch, no shuffling).
# Other cells reach individual records through `<loader>.dataset[...]`.
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [None]:
# OPENROUTER
import requests

# API_KEY = CHECK SLACK
# REFERER = "https://colab.research.google.com"

# MODEL = "moonshotai/kimi-k2:free"
# MAX_TOKENS = 400

# # OpenRouter API endpoint
# url = "https://openrouter.ai/api/v1/chat/completions"

# # Headers required by OpenRouter
# headers = {
#     "Authorization": f"Bearer {API_KEY}",
#     "HTTP-Referer": REFERER,
#     "Content-Type": "application/json"
# }

# TOGETHER.AI
from together import Together
# No key is passed explicitly; presumably the client picks up TOGETHER_API_KEY
# from the environment — TODO confirm it is actually set in this kernel.
client = Together()

# Model identifier passed as `model=` to the chat-completion calls in this file.
MODEL = "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free"

In [None]:
import evaluate

# Load the ROUGE and BERTScore metrics once; evaluate_summary() reuses these globals.
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

In [None]:
def evaluate_summary(gen: str, ref: str, source: str = None):
  """
  Score a generated summary against a reference summary.

  Takes:
    - gen: the model's generated summary
    - ref: the human (reference) summary
    - source: (optional) the source text used to generate the summary
  Returns: a dict of metric_name -> score containing
    - "bertscore_f1": BERTScore F1 of gen against ref
    - "compression": (words in gen) / (words in source); only present when
      source contains at least one word
    - every key returned by rouge.compute, merged in unchanged
  """
  results = {}

  # ROUGE on the single (gen, ref) pair, with stemming enabled.
  scores = rouge.compute(predictions=[gen], references=[ref], use_stemmer=True)

  # BERTScore returns per-example lists; only one pair was passed, so take index 0.
  bs = bertscore.compute(predictions=[gen], references=[ref], lang="en")
  results["bertscore_f1"] = bs["f1"][0]

  # Compression ratio (optional). A whitespace-only source is truthy but has
  # no words, so check the word list itself to avoid dividing by zero.
  source_words = source.split() if source else []
  if source_words:
    results["compression"] = len(gen.split()) / len(source_words)

  # Return the ROUGE results alongside the BERTScore F1 (and compression).
  results.update(**scores)
  return results

Prompts from https://arxiv.org/pdf/2407.21443

In [None]:
# Column names of the dataset in use:
# (1) the text to summarize
# (2) the reference summary supplied by the dataset
# Ex. For dailymail, the text to summarize is stored under "article" and the reference summary under "highlights".
text_to_summarize_label = "article"
summarization_label = "highlights"

In [None]:
# Zero-shot instruction prefix that goes in front of every text to summarize.
# Few-shot examples are switched off for now. To bring them back, start `cot`
# with one "Response: <reference summary>\n" line per example, e.g.:
# for i in train_dataloader.dataset[len(train_dataloader.dataset)-3:len(train_dataloader.dataset)][summarization_label]:
#   cot += "Response: " + i + "\n"
cot = "\n".join([
    "You are an expert summarizer.",
    "Summarize the following text step by step.",
    "First, list the key points of the text. Then provide a concise summary.",
    "Start your response with 'Response: ' (without quotes).",
])
def append_prompt(cot_to_append, prompt):
  """Return the instruction prefix and the prompt, each terminated by a newline."""
  pieces = [cot_to_append, "\n", prompt, "\n"]
  return "".join(pieces)

In [None]:
# Adjective pool that the tiers in EXTRACTION_SETS are sliced from.
EXTRACTION_TYPES_TEXT_SUMM  = [ "insightful", "clear", "concise", "accurate",
                            "coherent", "persuasive", "thorough", "engaging",
                             "relevant", "structured", "nuanced", "well-supported",
                             "articulate", "convincing", "sophisticated", "balanced"]
# Decide the target score of each dimension for each adjective here.
# Ex. If you want a concise response, you might want to set "conciseness": MAX.
# If you have something a bit more ambiguous, you might want to give the dimensions different priorities (ex. "conciseness": MEDIUM, "detailedness": LOW, "clarity": HIGH)
NONE = 0
LOW = 0.25
MEDIUM = 0.50
HIGH = 0.75
MAX = 1

# Earlier, finer-grained target table covering all 16 adjectives (currently disabled):
# EXTRACTION_TYPES_TEXT_SUMM_SCORES = {
#     "insightful": {"conciseness": LOW, "detailedness": HIGH, "clarity": MEDIUM},
#     "clear": {"conciseness": HIGH, "detailedness": LOW, "clarity": MAX},
#     "concise": {"conciseness": MAX, "detailedness": NONE, "clarity": NONE},
#     "accurate": {"conciseness": LOW, "detailedness": HIGH, "clarity": MEDIUM},
#     "coherent": {"conciseness": MEDIUM, "detailedness": MEDIUM, "clarity": HIGH},
#     "persuasive": {"conciseness": LOW, "detailedness": HIGH, "clarity": MEDIUM},
#     "thorough": {"conciseness": NONE, "detailedness": MAX, "clarity": MEDIUM},
#     "engaging": {"conciseness": LOW, "detailedness": HIGH, "clarity": MEDIUM},
#     "relevant": {"conciseness": HIGH, "detailedness": MEDIUM, "clarity": MEDIUM},
#     "structured": {"conciseness": MEDIUM, "detailedness": LOW, "clarity": HIGH},
#     "nuanced": {"conciseness": LOW, "detailedness": HIGH, "clarity": MEDIUM},
#     "well-supported": {"conciseness": LOW, "detailedness": MAX, "clarity": MEDIUM},
#     "articulate": {"conciseness": MEDIUM, "detailedness": MEDIUM, "clarity": HIGH},
#     "convincing": {"conciseness": MEDIUM, "detailedness": HIGH, "clarity": MEDIUM},
#     "sophisticated": {"conciseness": LOW, "detailedness": HIGH, "clarity": MEDIUM},
#     "balanced": {"conciseness": MEDIUM, "detailedness": HIGH, "clarity": MEDIUM}
# }

# Active target table: three adjectives, each maximising exactly one dimension.
EXTRACTION_TYPES_TEXT_SUMM_SCORES = {"detailed": {"conciseness": NONE, "detailedness": MAX, "clarity": NONE},
                                     "concise": {"conciseness": MAX, "detailedness": NONE, "clarity": NONE},
                                     "clear": {"conciseness": NONE, "detailedness": NONE, "clarity": MAX}}

# Tiers keyed by the requested number of adjectives.
# NOTE(review): EXTRACTION_TYPES_TEXT_SUMM has only 16 entries, so the 16, 24 and 32
# tiers all evaluate to the same full list — extend the pool or drop those tiers.
EXTRACTION_SETS = {
    8:  EXTRACTION_TYPES_TEXT_SUMM[:8],
    16: EXTRACTION_TYPES_TEXT_SUMM[:16],
    24: EXTRACTION_TYPES_TEXT_SUMM[:24],
    32: EXTRACTION_TYPES_TEXT_SUMM[:32],
}

# Example usage: print every tier (runs whenever this cell is executed).
for size, extraction_type in EXTRACTION_SETS.items():
    print(f"{size}-word set:", extraction_type)

In [None]:
import math
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy

nlp = spacy.load("en_core_web_sm")

# -----------------------------
# Conciseness Scorer (softer penalties)
# -----------------------------
def conciseness_score(cand: str, pool: list) -> float:
    """
    Heuristic spaCy-based score for `cand`, clamped to [0, 1].

    NOTE(review): another `conciseness_score` is defined further down in this
    file; once that definition runs, it replaces this one.

    Args:
        cand: candidate text to score.
        pool: not used by this implementation.

    Returns:
        0.0 for empty or whitespace-only input; otherwise the weighted sum
        computed below, rounded to 4 decimals. If anything in the spaCy path
        raises, falls back to min(1.0, word_count / 80).
    """
    if not cand or not cand.strip():
        return 0.0
    try:
        doc = nlp(cand)
        total_tokens = len([t for t in doc if not t.is_space])
        # Share of non-stopword NOUN/VERB/ADJ/ADV tokens among all non-space tokens.
        content_words = [t for t in doc if t.pos_ in {'NOUN','VERB','ADJ','ADV'} and not t.is_stop]
        density = len(content_words) / max(1, total_tokens)

        baseline = 0.4  # average expected density
        # Decreases as density grows; reaches 0 once density >= baseline * 1.3.
        density_score = 1.0 - min(1.0, density / (baseline * 1.3))  # softer scaling

        # Tokens are compared one at a time, so the two-word entries "kind of" and
        # "sort of" presumably never match a single token — NOTE(review): confirm.
        fillers = [t for t in doc if t.lower_ in {"basically","actually","kind of","sort of","really","very"}]
        filler_penalty = min(0.2, len(fillers) / max(1, total_tokens))  # reduced weight

        redundancy_penalty = 0.0
        # Only inputs longer than 20 characters are checked for repetition.
        if len(cand) > 20:
            words = [t.lower_ for t in doc if not t.is_stop and t.is_alpha]
            unique_ratio = len(set(words)) / max(1, len(words))
            # Non-zero only when fewer than 30% of the alphabetic non-stopwords are unique.
            redundancy_penalty = max(0.0, 0.3 - unique_ratio)

        # NOTE(review): the two "penalty" terms are ADDED to the score here, and
        # density_score falls as content density rises — confirm this direction
        # is what "conciseness" is meant to reward.
        final_score = (
            0.5 * density_score +
            0.2 * filler_penalty +
            0.3 * redundancy_penalty
        )
        return round(max(0.0, min(1.0, final_score)), 4)
    except Exception:
        # Best-effort fallback when the spaCy path fails: length-based proxy.
        return min(1.0, len(cand.split()) / 80)

# -----------------------------
# Clarity Scorer (less dominant)
# -----------------------------
def clarity_score(cand: str, pool: list) -> float:
    """
    Heuristic score combining TF-IDF similarity to `pool` with sentence-length
    regularity and a small length term.

    NOTE(review): another `clarity_score` is defined further down in this file;
    once that definition runs, it replaces this one.

    Args:
        cand: candidate text to score.
        pool: list of other texts; `cand` is compared against each of them.

    Returns:
        0.0 for empty or whitespace-only input; otherwise the weighted sum
        below, capped at 1.0 and rounded to 4 decimals. Any exception in the
        scoring path yields the constant 0.5.
    """
    if not cand or not cand.strip():
        return 0.0
    try:
        # Row 0 of the TF-IDF matrix is the candidate; the remaining rows are the pool.
        docs = [cand] + pool
        tfidf = TfidfVectorizer(stop_words="english")
        matrix = tfidf.fit_transform(docs)
        # NOTE(review): with an empty pool `matrix[1:]` has no rows; presumably this
        # raises and lands in the 0.5 fallback below — TODO confirm.
        sims = cosine_similarity(matrix[0:1], matrix[1:]).flatten()
        lexical_coherence = float(np.mean(sims)) if len(sims) > 0 else 0.0

        doc = nlp(cand)
        num_tokens = len([t for t in doc if not t.is_space])
        num_sentences = len(list(doc.sents))
        sent_len_var = 0.0
        if num_sentences > 1:
            # Variance of sentence lengths relative to their mean (epsilon avoids /0).
            lengths = [len([t for t in sent if not t.is_space]) for sent in doc.sents]
            sent_len_var = np.var(lengths) / (np.mean(lengths) + 1e-6)

        # Despite the name, this is derived only from sentence-length variance; capped at 0.2.
        grammar_penalty = min(0.2, sent_len_var / 10.0)
        final_score = (
            0.6 * lexical_coherence +
            0.2 * (1 - grammar_penalty) +
            0.2 * (1.0 if num_tokens < 80 else 0.9)
        )
        # Hook for scaling clarity down so it doesn't dominate; the multiplier is
        # currently 1, so no scaling is actually applied.
        return round(min(1.0, final_score * 1), 4)
    except Exception:
        # Neutral fallback when TF-IDF or spaCy processing fails.
        return 0.5

# -----------------------------
# Detail Scorer (boosted + softer normalization)
# -----------------------------
def detail_score(cand: str, pool: list) -> float:
    """
    Heuristic spaCy-based score built from content-word density, entity/number
    counts, sentence structure and length.

    NOTE(review): another `detail_score` is defined further down in this file;
    once that definition runs, it replaces this one.

    Args:
        cand: candidate text to score.
        pool: not used by this implementation.

    Returns:
        0.0 for empty or whitespace-only input; otherwise the weighted sum
        below, rounded to 4 decimals and capped at 1.0. If the spaCy path
        raises, falls back to min(1.0, word_count / 80).
    """
    if not cand or not cand.strip():
        return 0.0
    try:
        doc = nlp(cand)
        # Share of non-stopword NOUN/VERB/ADJ/ADV tokens among all non-space tokens.
        content_words = [t for t in doc if t.pos_ in {'NOUN','VERB','ADJ','ADV'} and not t.is_stop]
        total_tokens = len([t for t in doc if not t.is_space])
        density = len(content_words) / max(1, total_tokens)

        # Named entities plus number-like tokens, normalised by sqrt(token count).
        # This ratio is not capped here, so it can exceed 1 before the final clamp.
        entities = len(doc.ents)
        numbers = len([t for t in doc if t.like_num])
        detail_indicators = (entities + numbers) / max(1, math.sqrt(total_tokens))  # softer denom

        sentences = list(doc.sents)
        structure_score = 0.0
        if sentences:
            # sentences * mean sentence length == total non-space tokens across
            # sentences; saturates at 80 tokens.
            avg_sent_complexity = np.mean([len([t for t in sent if not t.is_space]) for sent in sentences])
            structure_score = min(1.0, (len(sentences) * avg_sent_complexity) / 80)

        # Logarithmic length bonus, capped at 0.3.
        length_contrib = min(0.3, math.log1p(len(doc)) / 20)

        final_score = (
            0.4 * density +
            0.3 * detail_indicators +
            0.2 * structure_score +
            0.1 * length_contrib
        )
        # Amplification hook: the multiplier is currently 1.0, i.e. a no-op.
        return min(1.0, round(final_score * 1.0, 4))  # amplify detail
    except Exception:
        # Best-effort fallback when the spaCy path fails: length-based proxy.
        return min(1.0, len(cand.split()) / 80)

# -----------------------------
# Utility for normalization
# -----------------------------
def _rescale_scores(scores):
    """Min-max rescale the dict's values to [0, 1]; identical values all map to 0.5."""
    values = np.array(list(scores.values()))
    lowest = values.min()
    highest = values.max()
    if highest - lowest == 0:
        # No spread between the values: return a flat, neutral profile.
        return {name: 0.5 for name in scores}
    rescaled = {}
    for name, value in scores.items():
        rescaled[name] = (value - lowest) / (highest - lowest)
    return rescaled

# -----------------------------
# Selection Function (fixed weights)
# -----------------------------
def select_best_extraction_type(answer, reference_summaries=None):
    """
    Pick the adjective in EXTRACTION_TYPES_TEXT_SUMM_SCORES whose target
    profile is closest to the measured profile of `answer`.

    The three heuristic scores are min-max rescaled per answer, then compared
    with each adjective's targets using a Gaussian similarity weighted by the
    fixed per-dimension weights below.

    Args:
        answer: text to profile.
        reference_summaries: optional list of texts handed to the scorers as
            their pool; None is treated as an empty list.

    Returns:
        (best_adjective, {adjective: similarity}) — ties go to the adjective
        that appears first in EXTRACTION_TYPES_TEXT_SUMM_SCORES.
    """
    pool = reference_summaries or []
    scores = {
        "detailedness": detail_score(answer, pool),
        "clarity": clarity_score(answer, pool),
        "conciseness": conciseness_score(answer, pool),
    }

    # Normalize per answer so one metric doesn't dominate.
    scores = _rescale_scores(scores)

    DIM_WEIGHTS = {"detailedness": 0.6, "clarity": 1.0, "conciseness": 1.0}

    extraction_type_scores = {}
    for adj_type, target_scores in EXTRACTION_TYPES_TEXT_SUMM_SCORES.items():
        total_similarity = 0
        total_weight = 0

        for dimension, target_value in target_scores.items():
            if dimension not in scores:
                continue
            deviation = abs(target_value - scores[dimension])
            weight = DIM_WEIGHTS.get(dimension, 1.0)
            # Gaussian similarity: 1.0 at zero deviation, about 0.135 at deviation 1.
            similarity = math.exp(-2.0 * deviation**2)

            total_similarity += weight * similarity
            total_weight += weight

        extraction_type_scores[adj_type] = (
            total_similarity / total_weight if total_weight > 0 else 0
        )

    best_type = max(extraction_type_scores.items(), key=lambda x: x[1])
    return best_type[0], extraction_type_scores

In [None]:
## IMPROVED SCORING FUNCTIONS
import math
import gzip
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy
import numpy as np
import json

nlp = spacy.load("en_core_web_sm")

# Word-boundary regexes for filler phrases; matched against lower-cased text.
_FILLER_PATTERNS = [
    r'\bas a matter of fact\b',
    r'\bin order to\b',
    r'\bas well as\b',
    r'\bat the end of the day\b',
    r'\bin the event that\b',
    r'\bthe fact that\b',
    r'\bjust to be clear\b',
    r'\bkind of\b',
    r'\bsort of\b',
    r'\bvery important\b',
    r'\breally\b',
    r'\bactually\b',
    r'\bperhaps\b',
    r'\bpretty much\b',
    r'\bessentially\b',
]

def _count_fillers(text: str) -> int:
    """Count every occurrence of each filler phrase in `text`, ignoring case."""
    haystack = text.lower()
    total = 0
    for pattern in _FILLER_PATTERNS:
        total += len(re.findall(pattern, haystack))
    return total

def _compressed_size(text: str) -> int:
    """Return the byte length of the gzip-compressed UTF-8 encoding of `text`."""
    raw_bytes = text.encode('utf-8')
    packed = gzip.compress(raw_bytes)
    return len(packed)

def conciseness_score(cand: str, pool: list) -> float:
    """
    Compression-based score in [0, 1].

    The candidate's gzip bytes-per-word is compared with a baseline (the mean
    bytes-per-word of the non-empty pool entries, or 10.0 when there are
    none). The score falls as that ratio rises, and is reduced by up to 30%
    according to how many filler phrases the text contains.

    Args:
        cand: candidate text to score.
        pool: optional texts used to derive the baseline.

    Returns:
        0.0 for empty or whitespace-only input, otherwise the score rounded to
        4 decimals and clamped to [0, 1].
    """
    if not cand or not cand.strip():
        return 0.0

    word_count = len(cand.split())
    if word_count == 0:
        return 0.0

    bytes_per_word = _compressed_size(cand) / word_count

    # Default baseline; replaced by pool statistics when usable entries exist.
    baseline = 10.0
    if pool and len(pool) > 0:
        usable_pool = [str(p) for p in pool if p and str(p).strip()]
        if usable_pool:
            pool_ratios = [_compressed_size(p) / max(1, len(p.split())) for p in usable_pool]
            baseline = np.mean(pool_ratios) if pool_ratios else 10.0

    density_score = 1.0 - min(1.0, bytes_per_word / baseline)

    # Filler penalty: filler phrases per five words, saturating at 1.0.
    filler_penalty = min(1.0, _count_fillers(cand) / max(1, word_count / 5))

    score = density_score * (1 - 0.3 * filler_penalty)
    return max(0.0, min(1.0, round(score, 4)))


def clarity_score(cand: str, pool: list) -> float:
    """
    Score `cand` by its mean TF-IDF cosine similarity to the pool entries,
    lightly discounted for length.

    Args:
        cand: candidate text to score.
        pool: reference texts; empty or whitespace-only entries are ignored.

    Returns:
        0.0 for empty or whitespace-only `cand`. When no usable pool entry
        exists, or when vectorisation fails, the result of
        `_readability_score(cand)`. Otherwise the similarity-based score,
        rounded to 4 decimals and clamped to [0, 1].
    """
    if not cand or not cand.strip():
        return 0.0

    references = [str(p) for p in pool if p and str(p).strip()]
    if not references:
        # Nothing to compare against: fall back to readability-based scoring.
        return _readability_score(cand)

    try:
        # Unigrams and bigrams, capped at 1000 features; the candidate is the last row.
        vectorizer = TfidfVectorizer(
            stop_words="english",
            min_df=1,
            max_features=1000,
            ngram_range=(1, 2),
        )
        matrix = vectorizer.fit_transform(references + [cand])
        if matrix.shape[0] <= 1:
            return _readability_score(cand)

        cand_row = matrix[-1]
        ref_rows = matrix[:-1]

        # One similarity per reference, then the mean over all references.
        per_reference = [
            cosine_similarity(cand_row, ref_rows[idx:idx+1])[0, 0]
            for idx in range(ref_rows.shape[0])
        ]
        mean_similarity = np.mean(per_reference) if per_reference else 0.0

        # Gentle logarithmic length discount that affects only 30% of the score.
        word_count = len(cand.split())
        length_penalty = 1 / (1 + 0.05 * math.log1p(word_count))

        combined = mean_similarity * (0.7 + 0.3 * length_penalty)
        return max(0.0, min(1.0, round(combined, 4)))

    except Exception as e:
        print(f"Error in clarity_score: {e}")
        return _readability_score(cand)

def _readability_score(text: str) -> float:
    """
    Pool-free readability fallback: 60% sentence-length score (best at an
    average of 15 words per sentence) plus 40% vocabulary diversity, rounded
    to 4 decimals. Returns 0.0 when the text has no sentence content.
    """
    if not text.strip():
        return 0.0

    # Split on runs of sentence-ending punctuation and drop empty fragments.
    fragments = [piece.strip() for piece in re.split(r'[.!?]+', text)]
    sentences = [fragment for fragment in fragments if fragment]
    if not sentences:
        return 0.0

    # Sentence-length score: 1.0 at a mean of 15 words, decaying with distance from it.
    words_per_sentence = [len(sentence.split()) for sentence in sentences]
    mean_length = np.mean(words_per_sentence)
    length_score = 1.0 / (1.0 + abs(mean_length - 15) / 10)

    # Vocabulary diversity: unique lower-cased whitespace tokens over all tokens.
    tokens = text.lower().split()
    distinct = len(set(tokens))
    diversity_score = min(1.0, distinct / max(1, len(tokens)))

    return round(0.6 * length_score + 0.4 * diversity_score, 4)

def detail_score(cand: str, pool: list) -> float:
    """
    spaCy-based detail score in [0, 1].

    Weighted sum of content-word density (0.4), named entities plus number-like
    tokens per token (0.3), a structure term that saturates at 100 tokens
    (0.2), and a capped logarithmic length bonus (0.1).

    Args:
        cand: candidate text to score.
        pool: not used by this implementation.

    Returns:
        0.0 for empty or whitespace-only input; otherwise the score rounded to
        4 decimals and clamped to [0, 1]. If the spaCy path raises, the error
        is printed and min(1.0, word_count / 100) is returned.
    """
    if not cand or not cand.strip():
        return 0.0

    try:
        parsed = nlp(cand)

        # Content-word density over non-space tokens.
        content_pos = {'NOUN', 'VERB', 'ADJ', 'ADV'}
        n_content = sum(1 for tok in parsed if tok.pos_ in content_pos and not tok.is_stop)
        n_tokens = sum(1 for tok in parsed if not tok.is_space)
        density = n_content / max(1, n_tokens)

        # Named entities and number-like tokens as indicators of specific detail.
        n_entities = len(parsed.ents)
        n_numbers = sum(1 for tok in parsed if tok.like_num)
        detail_indicators = (n_entities + n_numbers) / max(1, n_tokens)

        # Structural term: sentence count times mean sentence length, capped at 1.0.
        sentence_spans = list(parsed.sents)
        structure_score = 0.0
        if len(sentence_spans) > 0:
            mean_sentence_tokens = np.mean(
                [sum(1 for tok in span if not tok.is_space) for span in sentence_spans]
            )
            structure_score = min(1.0, (len(sentence_spans) * mean_sentence_tokens) / 100)

        # Length bonus: logarithmic and capped so long texts cannot dominate.
        length_contrib = min(0.3, math.log1p(len(parsed)) / 50)

        combined = (
            0.4 * density
            + 0.3 * detail_indicators
            + 0.2 * structure_score
            + 0.1 * length_contrib
        )
        return max(0.0, min(1.0, round(combined, 4)))

    except Exception as e:
        print(f"Error in detail_score: {e}")
        # Simple length-based fallback.
        return min(1.0, len(cand.split()) / 100)

def score_answer(answer, reference_summaries=None, llm_judge=False):
    """
    Score `answer` on conciseness, clarity and detailedness.

    Args:
        answer: text to score.
        reference_summaries: optional pool for the heuristic scorers. May be a
            single string or any iterable of strings (list, dataset slice);
            empty entries are dropped.
        llm_judge: when True, ask MODEL through `client` to grade the answer
            and return its JSON scores; when False, use the heuristic scorers.

    Returns:
        dict with the keys "conciseness", "clarity" and "detailedness".
        If the judge never yields a usable JSON object, the heuristic scores
        are returned instead (previously this path crashed with an
        UnboundLocalError).
    """
    # Normalise reference_summaries into a list of non-empty strings.
    ref_pool = []
    if reference_summaries is not None:
        if hasattr(reference_summaries, '__iter__') and not isinstance(reference_summaries, str):
            # It's likely a dataset slice or list.
            ref_pool = [str(s) for s in reference_summaries if s and str(s).strip()]
        else:
            # It's a single string.
            ref_pool = [str(reference_summaries)] if str(reference_summaries).strip() else []

    if llm_judge:
        required_keys = ("detailedness", "clarity", "conciseness")
        max_retries = 20
        # The prompt does not change between attempts, so build it once.
        score_prompt = (
            f"{answer}\n"
            "For the response above, grade the response on three criteria:\n1. Detailedness\n2. Clarity\n3. Conciseness\n"
            "Grade on a scale from 0 to 1.0, where 0 is the worst and 1.0 is the best. You may use up to 3 decimal points when scoring your answer."
            "After grading the response, ONLY respond the following format: {\"detailedness\": DETAILEDNESS_SCORE, \"clarity\": CLARITY_SCORE, \"conciseness\": CONCISENESS_SCORE}"
        )

        # One initial attempt plus max_retries retries, as before.
        # NOTE(review): with temperature=0 a retry is likely to produce the same
        # reply, so most of the robustness comes from the lenient parsing below.
        for _ in range(max_retries + 1):
            response = client.chat.completions.create(
                model=MODEL,
                messages=[{"role": "user", "content": score_prompt}],
                temperature=0
            )
            raw_reply = response.choices[0].message.content or ""

            try:
                # Tolerate prose or code fences around the JSON object.
                json_match = re.search(r"\{.*\}", raw_reply, re.DOTALL)
                parsed = json.loads(json_match.group(0) if json_match else raw_reply)
                if not isinstance(parsed, dict) or any(k not in parsed for k in required_keys):
                    raise ValueError(f"judge reply lacks required keys: {raw_reply!r}")
            except Exception as e:
                print(f"{e}, trying again...")
                continue
            return parsed

        print("LLM judge never returned usable scores; falling back to heuristic scoring.")

    return {
        "conciseness": conciseness_score(answer, ref_pool),
        "clarity": clarity_score(answer, ref_pool),
        "detailedness": detail_score(answer, ref_pool),
    }

def select_best_extraction_type(answer, reference_summaries=None):
    """
    Choose the adjective whose target profile best matches `answer`.

    The answer is scored with `score_answer`; each adjective in
    EXTRACTION_TYPES_TEXT_SUMM_SCORES then gets a weighted mean of Gaussian
    similarities between its targets and the measured scores. A dimension's
    weight is its own target value, or 0.1 when the target is 0.

    Returns:
        (best_adjective, {adjective: similarity}); ties resolve to the
        adjective listed first.
    """
    measured = score_answer(answer, reference_summaries)

    extraction_type_scores = {}
    for adjective, targets in EXTRACTION_TYPES_TEXT_SUMM_SCORES.items():
        weighted_sum = 0
        weight_sum = 0

        for dim_name, target in targets.items():
            if dim_name not in measured:
                continue
            gap = abs(target - measured[dim_name])
            # Dimensions the adjective cares about (non-zero target) count the most.
            dim_weight = target if target > 0 else 0.1
            weighted_sum += dim_weight * math.exp(-2.0 * gap**2)
            weight_sum += dim_weight

        extraction_type_scores[adjective] = weighted_sum / weight_sum if weight_sum > 0 else 0

    # Highest similarity wins.
    winner, _ = max(extraction_type_scores.items(), key=lambda pair: pair[1])
    return winner, extraction_type_scores

In [None]:
#TO CHECK IF IT WORKS
from torch.utils.data import DataLoader
import re


# Note - there are better ways to do this than what is shown here. This is just for illustration, as a simple way to extract the intended generation.
def extract_intended_generation(full_text): # You will most likely need to create variations of this method to extract your intended answer, depending on the output of your prompt.
    """
    Return the text that follows the first 'Response: ' marker.

    The result runs up to the next 'Response: ' marker (or the end of the
    text) with surrounding whitespace stripped.

    Returns None when `full_text` is empty/None or contains no marker.
    Previously a missing marker raised IndexError, because only ValueError
    was caught.
    """
    marker = 'Response: '
    if not full_text:
        return None

    _, found, after_marker = full_text.partition(marker)
    if not found:
        # The model did not follow the requested output format.
        return None

    # Keep only the first answer if the marker occurs again later.
    return after_marker.split(marker, 1)[0].strip()



def extract_output_numerical_answer(text, key_phrase="The answer is"): # You will most likely need to create variations of this function to extract your intended numerical answer, depending on the output of your prompt.
    """
    Return the first number that appears after `key_phrase` in `text`.

    Args:
        text: model output, or None.
        key_phrase: phrase that introduces the answer.

    Returns:
        The number as a float, or None when `text` is None, the phrase is
        missing, or no number follows it.
    """
    if text is None:
        return None

    phrase_at = text.find(key_phrase)
    if phrase_at == -1:
        return None

    # Everything after the key phrase.
    answer_text = text[phrase_at + len(key_phrase):]

    # The optional sign applies to BOTH alternatives. Without the group it was
    # bound only to the decimal form, so "-5" was read as 5.0.
    match = re.search(r'[-+]?(?:\d*\.\d+|\d+)', answer_text)
    return float(match.group()) if match else None


def extract_true_numerical_answer(text):
    """
    Parse the number that follows the first "#### " marker in `text`.

    Returns the value as a float, or None when the marker is absent or the
    remaining text is not a valid float (an error line is printed in that
    case).
    """
    marker = "#### "
    marker_at = text.find(marker)
    if marker_at == -1:
        return None  # no marker, nothing to extract

    numerical_str = text[marker_at + len(marker):].strip()
    try:
        value = float(numerical_str)
    except ValueError:
        print(f"Error: Could not convert '{numerical_str}' to float.")
        return None
    return value

Note: All the functions for running each algorithm (toe and greedy) will clean the output INSIDE the function.

In [None]:
#Implemented Task-based Output Extraction(toe)
#to make it UNIVERSAL SELF CONSISTENCY, extraction_type = "consistent"
# No COT for text summ
def output_extraction(question, num_samples, responses, extraction_type):
    """
    Ask the model to pick the most `extraction_type` response among `responses`.

    Args:
        question: the full prompt that produced the responses. Only its
            second-to-last line is quoted back to the model (presumably the
            text appended after the instruction prefix — TODO confirm).
        num_samples: number of responses, as quoted in the selection prompt.
        responses: list of candidate responses.
        extraction_type: adjective to select by (e.g. "consistent" for
            universal self-consistency).

    Returns:
        The selected element of `responses`, or None if no valid response
        number was obtained after the initial attempt plus 20 retries.
    """
    max_retries = 20

    joined_responses = "\n".join(f"Response {i+1}:\n{resp}" for i, resp in enumerate(responses))

    # Computed outside the f-string: reusing double quotes inside a
    # double-quoted f-string is a SyntaxError before Python 3.12. Also fall
    # back to the whole question when it has fewer than two lines.
    question_lines = question.split("\n")
    question_text = question_lines[-2] if len(question_lines) >= 2 else question

    selection_prompt = (
        f"I have generated the following {num_samples} responses to the question: {question_text}\n"
        f"Select the most {extraction_type} response based on majority consensus..\n"
        f"Respond ONLY with the response number.\n\n"
        f"{joined_responses}"
    )
    print(selection_prompt)

    for _ in range(max_retries + 1):
        response = client.chat.completions.create(
            model=MODEL,
            messages=[{"role": "user", "content": selection_prompt}],
            temperature=0.6
        )
        selection_text = response.choices[0].message.content or ""

        # Take the last integer in the reply. Reading only the final character
        # failed on a trailing period and on multi-digit response numbers.
        numbers = re.findall(r"\d+", selection_text)
        if not numbers:
            print("Error for: ", selection_text, "trying again...")
            continue

        selected_num = int(numbers[-1])
        if 1 <= selected_num <= len(responses):
            return responses[selected_num - 1]

        # An out-of-range number is treated like an unparsable reply.
        print("Error for: ", selection_text, "trying again...")

    return None

In [None]:
# Note: If you want to test the whole dataset, you can change test_dataloader to train_dataloader (usually contains more samples) and remove the indexing
def run_toe(question, num_samples, extraction_type=None, llm_judge=False): # question is the full prompt text
  """
  Sample `num_samples` responses and extract one of them by adjective.

  Args:
    question: full prompt sent to the model.
    num_samples: number of completions requested from the API.
    extraction_type: adjective passed to output_extraction. When None, it is
      chosen automatically: each sample votes for its best-matching adjective
      and the most frequent one wins.
    llm_judge: with automatic selection, score each sample with the LLM judge
      (lowest average deviation from the adjective targets) instead of the
      heuristic selector.

  Returns:
    - (extracted_output, extraction_type) when extraction_type was None
    - extracted_output alone when extraction_type was given
    NOTE(review): the two shapes differ; callers must handle both.
  """
  response = client.chat.completions.create(
    model=MODEL,
    messages=[
        {
        "role": "user",
        "content": question
        }
    ],
    temperature=0.7,
    n=num_samples
  )

  # Clean every returned sample. Iterating over the choices that actually came
  # back avoids an IndexError if the API returns fewer than num_samples.
  sampled_outputs = [
    extract_intended_generation(choice.message.content)
    for choice in response.choices[:num_samples]
  ]

  # Explicit adjective: extract directly.
  if extraction_type is not None:
    extracted_output = output_extraction(question, num_samples, sampled_outputs, extraction_type)
    print("extraction type: ", extraction_type, " answer: ", extracted_output)
    return extracted_output

  # Automatic selection: each sample votes for one adjective.
  extraction_type_frequency = {}
  for intended_output in sampled_outputs:
    if llm_judge:
      # The judge returns {"conciseness": X, "detailedness": Y, "clarity": Z}.
      scores = score_answer(intended_output, llm_judge=llm_judge)
      average_devs = {}
      for k, v in EXTRACTION_TYPES_TEXT_SUMM_SCORES.items():
        devs = [
          abs(v["conciseness"] - scores["conciseness"]),
          abs(v["detailedness"] - scores["detailedness"]),
          abs(v["clarity"] - scores["clarity"]),
        ]
        average_devs[k] = sum(devs) / len(devs)
      # First adjective with the smallest average deviation.
      extract_type = min(average_devs, key=average_devs.get)
    else:
      # Otherwise, select through the scoring functions.
      extract_type = select_best_extraction_type(intended_output)[0]
    extraction_type_frequency[extract_type] = extraction_type_frequency.get(extract_type, 0) + 1

  # First adjective with the highest vote count (i.e. most samples fit it best).
  extraction_type = max(extraction_type_frequency, key=extraction_type_frequency.get)
  extracted_output = output_extraction(question, num_samples, sampled_outputs, extraction_type)
  return extracted_output, extraction_type

In [None]:
# Note: If you want to test the whole dataset, you can change test_dataloader to train_dataloader (usually contains more samples) and remove the indexing
def run_greedy(question):
  """
  Request a single deterministic completion for `question`, print the raw
  reply, and return it cleaned by extract_intended_generation.
  """
  request_messages = [{"role": "user", "content": question}]
  # NOTE(review): `do_sample` looks like a Hugging Face generation argument;
  # confirm the Together client accepts it.
  response = client.chat.completions.create(
    model=MODEL,
    messages=request_messages,
    temperature=0,
    do_sample=False,
    top_p=0
  )
  raw_reply = response.choices[0].message.content
  print(raw_reply)
  return extract_intended_generation(raw_reply)

In [None]:
print(train_dataloader.dataset[15][summarization_label] + "\n")

In [None]:
import csv
import os
import json

def save_result_incrementally(result, file_path, extraction_type): # result is a python dictionary containing data that will become clearer in the next section.
    """
    Append one result row to the CSV at `file_path` (created if missing).

    Column order: question_num, question, true_answer,
    '<extraction_type>_answer', extraction_type, F1, ROUGE-1, ROUGE-2,
    ROUGE-Lsum. Missing values are written as empty strings.

    The metric columns are read from the historical keys ('f1_score',
    'rogue1', 'rogue2', 'rogueLsum') when present, and otherwise from the
    keys evaluate_summary() produces ('bertscore_f1', 'rouge1', 'rouge2',
    'rougeLsum'). This keeps existing callers working while no longer
    dropping correctly spelled metrics.
    """
    def first_present(*keys):
        # Value of the first key found in result, or '' if none is present.
        for key in keys:
            if key in result:
                return result[key]
        return ''

    row_data = [
        result.get('question_num', ''),
        result.get('question', ''),
        result.get('true_answer', ''),
        result.get(f'{extraction_type}_answer', ''),
        result.get("extraction_type", ""),
        first_present('f1_score', 'bertscore_f1'),
        first_present('rogue1', 'rouge1'),
        first_present('rogue2', 'rouge2'),
        first_present('rogueLsum', 'rougeLsum'),
    ]

    # newline='' lets the csv module control line endings itself.
    with open(file_path, 'a', newline='', encoding='utf-8') as file:
        csv.writer(file).writerow(row_data)


def csv_to_json(csv_file_path, json_file_path, extraction_type):
    """
    Convert the results CSV into a JSON list of dicts.

    Only the first four columns are kept, under the keys 'question_num',
    'question', 'true_answer' and '<extraction_type>_summary'. Rows with
    fewer than four columns are skipped. Nothing is written when the CSV
    does not exist.
    """
    if not os.path.exists(csv_file_path):
        return

    results = []
    # newline='' is required by the csv module so that line breaks embedded in
    # quoted fields (e.g. multi-line articles) are read back unchanged.
    with open(csv_file_path, mode='r', newline='', encoding='utf-8') as file:
        for row in csv.reader(file):
            # Ensure the row has enough columns before accessing them.
            if len(row) > 3:
                results.append({
                    "question_num": row[0],
                    "question": row[1],
                    "true_answer": row[2],
                    f'{extraction_type}_summary': row[3]
                })

    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(results, json_file, indent=4)


def load_results_from_json(json_file_path):
    """Return the parsed contents of the JSON file, or an empty list if it does not exist."""
    if not os.path.exists(json_file_path):
        return []
    with open(json_file_path, 'r') as handle:
        return json.load(handle)

# Labels that path_based_on_extraction_type() combines into result directory and file names.
task_type = 'FINAL_ZERO_SHOT_summarization_3_adj_only'
model_name = "llama3.3_70b"
# NOTE(review): the dataset loaded earlier in this file is "abisee/cnn_dailymail",
# but this label says "billsum" — confirm which name the saved results should carry.
dataset_name = "billsum"
base_dir = './results'

In [None]:
# Saves all text summarization metrics
def save_last_question_data(total_questions, summary_scores, file_path):
  """
  Overwrite `file_path` with one space-separated line: the question count,
  then bertscore_f1, rouge1, rouge2, rougeL and rougeLsum from `summary_scores`.
  """
  metric_keys = ("bertscore_f1", "rouge1", "rouge2", "rougeL", "rougeLsum")
  with open(file_path, 'w') as file:
    fields = [f'{total_questions}']
    fields.extend(f'{summary_scores[key]}' for key in metric_keys)
    file.write(' '.join(fields))

def last_question_stats(file_path):
    """Read back the line written by ``save_last_question_data``.

    Returns a 6-tuple ``(total_questions, f1_score, rouge1, rouge2, rougeL,
    rougeLsum)``. All six are integer zeros when ``file_path`` is missing or
    empty; otherwise they are the floats parsed from the file's first line,
    which is also echoed to stdout. A line that does not hold exactly six
    numeric tokens raises ``ValueError``.
    """
    if not os.path.exists(file_path) or os.path.getsize(file_path) <= 0:
        return 0, 0, 0, 0, 0, 0

    with open(file_path, 'r') as stats_file:
        first_line = stats_file.readline()
        print(first_line)
        count, bert_f1, r1, r2, r_l, r_lsum = (float(token) for token in first_line.split())
    return count, bert_f1, r1, r2, r_l, r_lsum

In [None]:
def path_based_on_extraction_type(extraction_type, num_samples=None, flexible=False, llm_judge=False):
    """Build (and create on disk) the output directory for one experiment variant.

    The directory is ``base_dir/task_type[/num_samples]/<name>`` where
    ``<name>`` is ``{model_name}_{dataset_name}_flexible`` when ``flexible`` is
    true, otherwise ``{model_name}_{dataset_name}_{extraction_type}`` with
    ``'judge'`` appended when ``llm_judge`` is true. ``base_dir``,
    ``task_type``, ``model_name`` and ``dataset_name`` are module-level globals.

    Args:
        extraction_type: Variant name; also the base name of the returned files.
        num_samples: Optional sample count, inserted as its own path component.
        flexible: Use the fixed ``flexible`` directory suffix (``llm_judge`` is
            ignored in that case).
        llm_judge: Append ``'judge'`` to the directory name.

    Returns:
        ``(last_question_stats_file_path, csv_file_path, json_file_path)``, all
        located inside the created directory. The stats file holds the running
        question count and metric totals.
    """
    if flexible:
        dir_name = f'{model_name}_{dataset_name}_flexible'
    else:
        dir_name = f'{model_name}_{dataset_name}_{extraction_type}'
        if llm_judge:
            # Appended without a separator (e.g. "automaticjudge"); kept
            # unchanged so the generated paths stay the same as before.
            dir_name += 'judge'

    path_parts = [base_dir, task_type]
    if num_samples is not None:
        path_parts.append(str(num_samples))
    our_dir_file_path = os.path.join(*path_parts, dir_name)

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(our_dir_file_path, exist_ok=True)

    last_question_stats_file_path = os.path.join(
        our_dir_file_path, f'{extraction_type}_last_question_stats' + '.txt')
    csv_file_path = os.path.join(our_dir_file_path, f'{extraction_type}' + '.csv')
    json_file_path = os.path.join(our_dir_file_path, f'{extraction_type}' + '.json')

    return last_question_stats_file_path, csv_file_path, json_file_path

In [None]:
import time

def experiment(extraction_type, num_of_questions, question_thresh=None, num_samples=None, llm_judge=False):
    """Summarize the next batch of test examples, score them and persist progress.

    Progress is resumed from the stats file returned by
    ``path_based_on_extraction_type``: the stored question count selects the
    slice ``[count, count + num_of_questions)`` of ``test_dataloader.dataset``
    and the stored metric sums are continued. After every example one CSV row
    is appended and the stats file is rewritten.

    Args:
        extraction_type: ``"greedy"`` (uses ``run_greedy``), ``"automatic"``
            (``run_toe`` returning a ``(summary, type)`` pair), or any other
            value, which is forwarded to ``run_toe`` as ``extraction_type``
            unless ``llm_judge`` is true.
        num_of_questions: Number of examples to process in this call.
        question_thresh: Stop as soon as the processed count reaches this value.
        num_samples: Forwarded to ``run_toe`` and used in the output path.
        llm_judge: Forwarded to ``run_toe`` in the ``"automatic"`` branch and
            used in the output path.

    Returns:
        The list of result dicts: entries reloaded from disk plus the ones
        produced by this call (also when the threshold stops the run early).
    """
    last_question_stats_file_path, csv_file_path, json_file_path = path_based_on_extraction_type(
        extraction_type, llm_judge=llm_judge, num_samples=num_samples)
    # These are running SUMS over all processed questions, not averages.
    total_questions, f1_score, rouge1, rouge2, rougeL, rougeLsum = last_question_stats(
        last_question_stats_file_path)
    total_questions = int(total_questions)

    csv_to_json(csv_file_path, json_file_path, extraction_type)
    results = load_results_from_json(json_file_path)

    # Slice the dataset once and read both columns from the same batch.
    batch = test_dataloader.dataset[total_questions:num_of_questions + total_questions]
    true_answers = batch[summarization_label]
    questions = batch[text_to_summarize_label]

    for i, question in enumerate(questions):
        if question_thresh is not None and total_questions >= question_thresh:
            print("TRIGGERED")
            # Return the same type as the normal path instead of None.
            return results

        time.sleep(2.5)  # For rate limit
        extract_type = None
        # Combine the module-level `cot` value with the article via append_prompt.
        appended_question = append_prompt(cot, question)

        if extraction_type == "greedy":
            output_summary = run_greedy(appended_question)
        elif extraction_type == "automatic":
            output_result = run_toe(appended_question, num_samples=num_samples, llm_judge=llm_judge)
            output_summary = output_result[0]  # Extract the summary string
            extract_type = output_result[1]  # Extract the extraction type
        elif llm_judge is True:
            print("LM JUDGE")
            output_result = run_toe(appended_question, num_samples=num_samples)
            output_summary = output_result[0]  # Extract the summary string
            extract_type = output_result[1]  # Extract the extraction type
        else:
            output_summary = run_toe(appended_question, num_samples=num_samples, extraction_type=extraction_type)

        summary_score = evaluate_summary(output_summary, true_answers[i])
        print(f"CHOSEN RESPONSE: \n{output_summary}")
        # NOTE: the "rogue*" spellings are the keys save_result_incrementally reads.
        result = {
            "question_num": total_questions,
            "question": question,
            "true_answer": true_answers[i],
            f"{extraction_type}_answer": output_summary,
            "extraction_type": extract_type,
            "f1_score": summary_score["bertscore_f1"],
            "rogue1": summary_score["rouge1"],
            "rogue2": summary_score["rouge2"],
            "rogueL": summary_score["rougeL"],
            "rogueLsum": summary_score["rougeLsum"]
        }
        # Drop unset fields (e.g. extraction_type for non-automatic runs).
        result = {k: v for k, v in result.items() if v is not None}
        results.append(result)

        f1_score += summary_score["bertscore_f1"]
        rouge1 += summary_score["rouge1"]
        rouge2 += summary_score["rouge2"]
        rougeL += summary_score["rougeL"]
        rougeLsum += summary_score["rougeLsum"]
        total_questions += 1

        # Keep the running totals separate from the per-example scores.
        running_totals = {
            "bertscore_f1": f1_score,
            "rouge1": rouge1,
            "rouge2": rouge2,
            "rougeL": rougeL,
            "rougeLsum": rougeLsum,
        }

        save_result_incrementally(result, csv_file_path, extraction_type)
        save_last_question_data(total_questions, running_totals, last_question_stats_file_path)
        print(f"Running f1 score for {extraction_type}: {(f1_score/total_questions):.4f}")

    avg_score = f1_score/total_questions if total_questions > 0 else 0
    print(f"Average f1 score for {extraction_type}: {avg_score:.4f}")
    return results

In [None]:
# Batch size per experiment() call, overall cap on processed questions, and
# the sample counts (k) to sweep over.
num_of_questions = 5
max_questions = 1000
num_samples = [3, 5, 8]

print(f"""
      RUNNING EXPERIMENT WITH FOLLOWING PARAMETERS: \n
      NUMBER OF SAMPLES PER RESPONSE (USC AND RELATED): {num_samples}
      NUMBER OF QUESTIONS: {num_of_questions} \n
      ---------------------------TASK T YPE: SUMMARIZATION----------------------------------------
""")

print("BEGIN GREEDY")

retries_allowed_max = 5000
current_retries = 0
extraction_types = ["detailed", "concise", "clear"]

# Retry the whole sweep after any exception; experiment() resumes from the
# progress it saved on disk, so a restart does not redo finished questions.
while current_retries < retries_allowed_max:
    try:
        for i in range(200):
            # experiment(num_of_questions=num_of_questions, question_thresh=max_questions, extraction_type="greedy")

            for n_samples in num_samples:
                print(f"BEGIN FOR k={n_samples}")
                print("BEGIN CONSISTENT")
                experiment(num_of_questions=num_of_questions, question_thresh=max_questions, extraction_type="consistent", num_samples=n_samples)

                for extraction_type in extraction_types:
                    print(f"BEGIN {extraction_type}")
                    experiment(num_of_questions=num_of_questions, question_thresh=max_questions, extraction_type=extraction_type, num_samples=n_samples)

                print("BEGIN AUTOMATIC")
                experiment(num_of_questions=num_of_questions, question_thresh=max_questions, extraction_type="automatic", num_samples=n_samples, llm_judge=True)
                experiment(num_of_questions=num_of_questions, question_thresh=max_questions, extraction_type="automatic", num_samples=n_samples)
    except Exception as e:
        print(e)
        current_retries += 1
    else:
        # All passes finished without an error: stop. Without this the
        # while-loop restarted the sweep forever after a successful run.
        break


In [None]:
import pandas as pd

adj = ['concise', 'clear', 'detailed']
to_compare = 'concise'
current_n = None
base_name = "llama3.3_70b_billsum_"

# d_final[sample_count][adjective] -> list of [column-3 value, float(column-5 value)]
# per CSV row. In the rows written by this file's CSV writer, column 3 is the
# generated answer and column 5 the f1 score.
d_final = {}

for root, _, _ in os.walk('results/FINAL_scoringV2_summarization_3_adj_only'):
    # A directory whose name is purely numeric is a sample-count directory.
    # Use the whole directory name rather than its last character so that
    # multi-digit counts (e.g. "10") are not truncated to one digit.
    leaf = os.path.basename(root)
    if leaf.isnumeric():
        print("AAAA")
        current_n = leaf
    if current_n is None:
        continue

    per_n = d_final.setdefault(current_n, {})

    for a in adj:
        for r, _, _ in os.walk(os.path.join(root, base_name + a)):
            print(r)
            df = pd.read_csv(os.path.join(r, a + '.csv'), delimiter=',', engine="python", header=None)
            for answer, score in zip(df[3], df[5]):
                per_n.setdefault(a, []).append([answer, float(score)])

d = d_final['8']
adj.remove(to_compare)
print("im running!")
# c counts rows where another adjective beat `to_compare` by more than 0.02 on
# the stored score, yet select_best_extraction_type did not pick that adjective.
c = 0
# NOTE(review): the last 10 rows are skipped; the reason is not visible here
# (possibly to stay within the shorter result lists) - TODO confirm.
for i in range(len(d[to_compare]) - 10):
    for a in adj:
        if d[a][i][1] > d[to_compare][i][1] + 0.02:

            extract_type = select_best_extraction_type(d[a][i][0])
            if extract_type[0] != a:
                print(f'{a}: {d[a][i][0]} vs {to_compare}: {d[to_compare][i][0]}')
                print(f"{extract_type[0]} VS TRUE TYPE: {a}")
                print(extract_type[1])
                c += 1

print(c)