# PERSUADE 2.0 Dataset Notebook
## 2a. Counterfactuals FSL

Tests related to the PERSUADE 2.0 essay dataset.

### Load dataset

In [1]:
import pandas as pd
# load dataset
df = pd.read_csv('tagged_dataset.csv')
df

Unnamed: 0,essay_id_comp,full_text,holistic_essay_score,word_count,prompt_name,stance,formality,sentiment
0,423A1CA112E2,phones\n\nmodern humans today are always on th...,3,378,Phones and driving,2.0,0,0.0
1,BC75783F96E3,this essay will explain if drivers should or s...,4,432,Phones and driving,1.0,0,0.0
2,74C8BC7417DE,driving while the use of cellular devices\n\nt...,2,179,Phones and driving,1.0,0,1.0
3,A8445CABFECE,phones & driving\n\ndrivers should not be able...,3,221,Phones and driving,1.0,1,1.0
4,6B4F7A0165B9,cell phone operation while driving\n\nthe abil...,4,334,Phones and driving,1.0,0,1.0
...,...,...,...,...,...,...,...,...
25073,AFEC37C2D43F,there has been at least one point in everyone'...,5,547,Seeking multiple opinions,2.0,0,1.0
25074,D46BCB48440A,"when people ask for advice,they sometimes talk...",4,373,Seeking multiple opinions,0.0,1,0.0
25075,0FB0700DAF44,"during a group project, have you ever asked a ...",4,631,Seeking multiple opinions,0.0,0,0.0
25076,D72CB1C11673,making choices in life can be very difficult. ...,4,417,Seeking multiple opinions,0.0,0,0.0


### Classifiers

In [2]:
import requests

# convert results list to numeric labels
stance_mapping = {
    'PRO': 0,
    'CON': 1,
    'NEUTRAL': 2
}

# calls local Ollama API Mistral instance with a defined text prompt
def query_ollama(prompt, model, temperature=0):
    response = requests.post(
        "http://localhost:11434/api/generate",
        json={"model": model,
              "prompt": prompt,
              "stream": False,
              "options": {
                  "temperature": temperature
              }
        }
    )
    return response.json()["response"].strip()

def get_stance_prompt(essay):
    return (
        "Stance classification is the task of determining the expressed or implied opinion, or stance, of a statement"
        " toward a certain, specified target. The following statements are social media posts expressing opinions about entities.\n"
        "Each statement can either be 'PRO' or 'CON' toward their associated entity.\n"
        "entity: Atheism\n"
        "statement: Leaving Christianity enables you to love the people you once rejected. #freethinker #Christianity #SemST\n"
        "stance: PRO\n"
        "entity: Feminist Movement\n"
        "statement: Always a delight to see chest-drumming alpha males hiss and scuttle backwards up the wall when a feminist enters the room. #manly #SemST\n"
        "stance: PRO\n"
        "entity: Christianity\n"
        "statement: AlharbiF I’ll bomb anything I can get my hands on, especially if THEY aren’t christian. #graham2016 #GOP #SemST\n"
        "stance: CON\n"
        "entity: Hillary Clinton\n"
        "statement: Would you wanna be in a long term relationship with some bitch that hides her emails, & lies to your face? Then #Dontvote #SemST\n"
        "stance: CON\n"
        "Analyze the following statement and determine its stance towards the entity.\n"
        "Respond with a single word: 'PRO' or 'CON'. Only return the stance as a single word, and no other text.'\n"
        f"statement:\n{essay}\n"
        "stance:"
    )

# standalone stance classifier function
def stance_classifier(text):
    prompt = get_stance_prompt(text)
    response = query_ollama(prompt, model="mistral")

    response_upper = response.upper()
    if 'CON' in response_upper:
        return stance_mapping['CON']
    elif 'PRO' in response_upper:
        return stance_mapping['PRO']
    elif 'NEUTRAL' in response_upper:
        return stance_mapping['NEUTRAL']
    else:
        return None  # unclear



In [3]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# load tokenizer and model weights
model_name = 's-nlp/deberta-large-formality-ranker'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

id2formality = {0: "formal", 1: "informal"}

def formality_classifier(text, threshold=0.50):
    # tokenization
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        return_token_type_ids=True,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

    # get output
    output = model(**encoding)
    scores = output.logits.softmax(dim=1)[0]  # get first (and only) row

    # get score
    formality_score = {id2formality[idx]: score.item() for idx, score in enumerate(scores)}

    # apply a threshold
    if formality_score['formal'] > threshold:
        return 0  # formal
    elif formality_score['informal'] > threshold:
        return 1  # informal
    else:
        return None  # uncertain

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from transformers import pipeline
sentiment_analysis = pipeline("sentiment-analysis",model="siebert/sentiment-roberta-large-english")

def sentiment_classifier(text, threshold=0.90):
    # run classifier
    result = sentiment_analysis(text, truncation=True, max_length=512)[0]

    label = result['label']
    score = result['score']

    # apply a threshold
    if label == 'POSITIVE' and score > threshold:
        return 0  # positive
    elif label == 'NEGATIVE' and score > threshold:
        return 1  # negative
    else:
        return None  # uncertain

Device set to use mps:0


### Generation

In [5]:
RUBRIC = """
- SCORE OF 6: An essay in this category demonstrates clear and consistent mastery, although it may have a few minor errors. A typical essay effectively and insightfully develops a point of view on the issue and demonstrates outstanding critical thinking, using clearly appropriate examples, reasons, and other evidence to support its position; the essay is well organized and clearly focused, demonstrating clear coherence and smooth progression of ideas; the essay exhibits skillful use of language, using a varied, accurate, and apt vocabulary and demonstrates meaningful variety in sentence structure; the essay is free of most errors in grammar, usage, and mechanics.
- SCORE OF 5: An essay in this category demonstrates reasonably consistent mastery, although it will have occasional errors or lapses in quality. A typical essay effectively develops a point of view on the issue and demonstrates strong critical thinking, generally using appropriate examples, reasons, and other evidence to support its position; the essay is well organized and focused, demonstrating coherence and progression of ideas; the essay exhibits facility in the use of language, using appropriate vocabulary demonstrates variety in sentence structure; the essay is generally free of most errors in grammar, usage, and mechanics.
- SCORE OF 4: An essay in this category demonstrates adequate mastery, although it will have lapses in quality. A typical essay develops a point of view on the issue and demonstrates competent critical thinking, using adequate examples, reasons, and other evidence to support its position; the essay is generally organized and focused, demonstrating some coherence and progression of ideas exhibits adequate; the essay may demonstrate inconsistent facility in the use of language, using generally appropriate vocabulary demonstrates some variety in sentence structure; the essay may have some errors in grammar, usage, and mechanics.
- SCORE OF 3: An essay in this category demonstrates developing mastery, and is marked by ONE OR MORE of the following weaknesses: develops a point of view on the issue, demonstrating some critical thinking, but may do so inconsistently or use inadequate examples, reasons, or other evidence to support its position; the essay is limited in its organization or focus, or may demonstrate some lapses in coherence or progression of ideas displays; the essay may demonstrate facility in the use of language, but sometimes uses weak vocabulary or inappropriate word choice and/or lacks variety or demonstrates problems in sentence structure; the essay may contain an accumulation of errors in grammar, usage, and mechanics.
- SCORE OF 2: An essay in this category demonstrates little mastery, and is flawed by ONE OR MORE of the following weaknesses: develops a point of view on the issue that is vague or seriously limited, and demonstrates weak critical thinking, providing inappropriate or insufficient examples, reasons, or other evidence to support its position; the essay is poorly organized and/or focused, or demonstrates serious problems with coherence or progression of ideas; the essay displays very little facility in the use of language, using very limited vocabulary or incorrect word choice and/or demonstrates frequent problems in sentence structure; the essay contains errors in grammar, usage, and mechanics so serious that meaning is somewhat obscured.
- SCORE OF 1: An essay in this category demonstrates very little or no mastery, and is severely flawed by ONE OR MORE of the following weaknesses: develops no viable point of view on the issue, or provides little or no evidence to support its position; the essay is disorganized or unfocused, resulting in a disjointed or incoherent essay; the essay displays fundamental errors in vocabulary and/or demonstrates severe flaws in sentence structure; the essay contains pervasive errors in grammar, usage, or mechanics that persistently interfere with meaning.
"""

stance_opposite = {
    'PRO': 'CON',
    'CON': 'PRO',
}

formality_opposite = {
    'FORMAL': 'INFORMAL',
    'INFORMAL': 'FORMAL'
}

sentiment_opposite = {
    'POSITIVE': 'NEGATIVE',
    'NEGATIVE': 'POSITIVE'
}

attribute_context = {
    'stance': (
        "In a PRO stance, the author clearly supports the topic by providing strong reasons in favor of it, emphasizing benefits, positives, and supportive arguments.\n"
        "In a CON stance, the author clearly opposes the topic, focusing on drawbacks, risks, or negative consequences.\n"
        "Switching stance requires presenting the OPPOSITE arguments.\n"
    ),
    'formality': (
        "FORMAL writing is structured and objective. It uses academic language, avoids contractions (like 'don't' or 'can't'), and avoids casual expressions.\n"
        "It favors complex sentences, third-person point of view, and professional tone.\n"
        "INFORMAL writing is conversational, uses contractions and casual language, favors first-person or second-person, and may include slang or simple sentence structures.\n"
        "To change from FORMAL to INFORMAL, make the writing sound more like spoken language. Use really simple, repeated words and structures, expressions. Use contractions and slang terms. \n"
        "To change from INFORMAL to FORMAL, eliminate contractions, use more precise vocabulary, and avoid casual phrasing.\n"
    ),
    'sentiment': (
        "POSITIVE sentiment expresses approval, optimism, or positive feelings.\n"
        "NEGATIVE sentiment expresses disapproval, criticism, or negative feelings.\n"
    )
}

def get_counterfactual_prompt(text, attribute, original_label, og_score):
    if attribute == 'stance':
        opposite_label = stance_opposite.get(original_label, "UNKNOWN")
        instruction = f"the current stance is {original_label}. Change it to {opposite_label}"
    elif attribute == 'formality':
        opposite_label = formality_opposite.get(original_label, "UNKNOWN")
        instruction = f"the current style is '{original_label}'. Change it to {opposite_label}."
    elif attribute == 'sentiment':
        opposite_label = sentiment_opposite.get(original_label, "UNKNOWN")
        instruction = f"the current sentiment is '{original_label}'. Change it to {opposite_label}."
    else:
        instruction = "Unknown attribute."

    return (
        "You are a skilled editor. Your task is to rewrite the following essay carefully."
        f"Keep the structure, arguments, and topic the same, but {instruction}. "
        f"Context:\n{attribute_context[attribute]}\n\n"
        "Do not add new ideas or remove key points. Keep the length and organization similar.\n"
        "Keep the same level of language proficiency from the original essay, including misspellings, punctuation and capitalization.\n"
        f"This essay has a score of {og_score} points on a 6-point scale. Your rewriting must meet the exact criteria as the rubric details for a score of {og_score}. The marking rubric is defined as follows:\n{RUBRIC}\n\n"
        "Respond with the rewritten essay only.\n\n"
        f"Essay:\n{text}"
    )

def generate_counterfactual(text, attribute, original_label, og_score):
    prompt = get_counterfactual_prompt(text, attribute, original_label, og_score)
    response = query_ollama(prompt, model="gemma3:12b", temperature=0.7)
    return response

In [6]:
import random

def collect_validated_flips(
    df,
    attribute: str,                   # 'stance', 'formality', or 'sentiment'
    from_class: int,                  # e.g., 0 (for PRO)
    classifier_fn=None,               # e.g., stance_classifier
    label_mapping=None,               # e.g., {'PRO': 0, 'CON': 1, 'NEUTRAL': 2}
    prefix="ST",                      # e.g., 'ST'
    target_count=100
):

    # reverse the label mapping
    reverse_label_mapping = {v: k for k, v in label_mapping.items()}

    from_label = reverse_label_mapping[from_class]
    to_label = {
        'stance': stance_opposite,
        'formality': formality_opposite,
        'sentiment': sentiment_opposite
    }[attribute][from_label]
    to_class = label_mapping[to_label]

    successful_flips = []

    # stratified ssampling by score
    score_bins = {score: group.index.tolist() for score, group in df.groupby('holistic_essay_score')}
    bin_targets = {score: max(1, int(len(indices) / len(df) * target_count)) for score, indices in score_bins.items()}

    total_flips = 0 # count total flips
    total_attempts = 1
    for score, indices in score_bins.items():
        if total_flips >= target_count:
            print(f"Done. Total flips is {total_flips} out of {target_count}")
            break

        random.shuffle(indices)
        sample_count = 0

        for idx in indices:
            if sample_count >= bin_targets[score]:
                break

            row = df.loc[idx]
            if row[attribute] != from_class:
                continue

            original_text = row['full_text']
            essay_code_original = f"{idx}-O"
            og_score = row['holistic_essay_score']

            # generate counterfactual
            print(f"----- Attempt: {total_attempts} -----")
            try:
                print(f"Generating counterfactual... Attribute: {attribute}. {from_label} -> {to_label}")
                flipped_text = generate_counterfactual(original_text, attribute, from_label, og_score)
                print("Counterfactual generated.")
                new_label = classifier_fn(flipped_text)
                print(f"New label: {reverse_label_mapping[new_label]}")
            except Exception as e:
                print(f"[!] Error at idx {idx}: {e}")
                continue

            print(f"Flip Attempt: {from_label} → {to_label} | Classified as: {new_label}")

            if new_label == to_class:
                print("✓ - Correctly generated!")
                essay_code_cf = f"{idx}-C{prefix}"

                successful_flips.append({
                    "essay_code": essay_code_original,
                    "full_text": original_text,
                    "attribute": attribute,
                    "label": from_class,
                    "score_og": row['holistic_essay_score']
                })

                successful_flips.append({
                    "essay_code": essay_code_cf,
                    "full_text": flipped_text,
                    "attribute": attribute,
                    "label": to_class,
                    "score_og": row['holistic_essay_score']
                })

                sample_count += 1
                total_flips += 1
            else:
                print("X - Incorrectly generated!")
            total_attempts += 1

            if len(successful_flips) >= target_count * 2:
                break

    print("---------------------------------------")
    print(f"Generated {total_flips} counterfactuals with {total_attempts - 1} attempts.")

    # second pass to fill counterfactuals if target not reached
    if total_flips < target_count:
        print(f"\nSecond pass: filling {target_count - total_flips} missing flips...")

        remaining_indices = df[df[attribute] == from_class].index.tolist()
        random.shuffle(remaining_indices)

        for idx in remaining_indices:
            if total_flips >= target_count:
                break

            row = df.loc[idx]
            original_text = row['full_text']
            essay_code_original = f"{idx}-O"

            print(f"----- Attempt: {total_attempts} -----")
            try:
                print(f"Generating counterfactual... Attribute: {attribute}. {from_label} -> {to_label}")
                flipped_text = generate_counterfactual(original_text, attribute, from_label)
                print("Counterfactual generated.")
                new_label = classifier_fn(flipped_text)
                print(f"New label: {reverse_label_mapping[new_label]}")
            except Exception as e:
                print(f"[!] Error at idx {idx}: {e}")
                continue

            print(f"Flip Attempt: {from_label} → {to_label} | Classified as: {new_label}")

            if new_label == to_class:
                print("✓ - Correctly generated!")
                essay_code_cf = f"{idx}-C{prefix}"

                successful_flips.append({
                    "essay_code": essay_code_original,
                    "full_text": original_text,
                    "attribute": attribute,
                    "label": from_class,
                    "score_og": row['holistic_essay_score']
                })

                successful_flips.append({
                    "essay_code": essay_code_cf,
                    "full_text": flipped_text,
                    "attribute": attribute,
                    "label": to_class,
                    "score_og": row['holistic_essay_score']
                })

                total_flips += 1
            else:
                print("X - Incorrectly generated!")
            total_attempts += 1

    print(f"Generated {total_flips} counterfactuals")

    return pd.DataFrame(successful_flips)

In [None]:
stance_pro_to_con_df = collect_validated_flips(
    df=df,
    attribute="stance",
    from_class=0,  # PRO
    classifier_fn=stance_classifier,
    label_mapping={"PRO": 0, "CON": 1, "NEUTRAL": 2},
    prefix="ST",
    target_count=100
)

stance_pro_to_con_df

----- Attempt: 1 -----
Generating counterfactual... Attribute: stance. PRO -> CON


In [8]:
stance_con_to_pro_df = collect_validated_flips(
    df=df,
    attribute="stance",
    from_class=1,  # CON
    classifier_fn=stance_classifier,
    label_mapping={"PRO": 0, "CON": 1, "NEUTRAL": 2},
    prefix="ST",
    target_count=100
)

stance_con_to_pro_df

----- Attempt: 1 -----
Generating counterfactual... Attribute: stance. CON -> PRO
Counterfactual generated.
New label: PRO
Flip Attempt: CON → PRO | Classified as: 0
✓ - Correctly generated!
----- Attempt: 2 -----
Generating counterfactual... Attribute: stance. CON -> PRO
Counterfactual generated.
New label: PRO
Flip Attempt: CON → PRO | Classified as: 0
✓ - Correctly generated!
----- Attempt: 3 -----
Generating counterfactual... Attribute: stance. CON -> PRO
Counterfactual generated.
New label: PRO
Flip Attempt: CON → PRO | Classified as: 0
✓ - Correctly generated!
----- Attempt: 4 -----
Generating counterfactual... Attribute: stance. CON -> PRO
Counterfactual generated.
New label: PRO
Flip Attempt: CON → PRO | Classified as: 0
✓ - Correctly generated!
----- Attempt: 5 -----
Generating counterfactual... Attribute: stance. CON -> PRO
Counterfactual generated.
New label: CON
Flip Attempt: CON → PRO | Classified as: 1
X - Incorrectly generated!
----- Attempt: 6 -----
Generating counterfa

Unnamed: 0,essay_code,full_text,attribute,label,score_og
0,20049-O,"dear senator,\n\ni was meaning to ask you can ...",stance,1,1
1,20049-CST,"dear senator,\n\ni was meaning to ask you plea...",stance,0,1
2,17483-O,"in the article ""driverless cars are coming"" th...",stance,1,1
3,17483-CST,"in the article ""driverless cars are coming"" th...",stance,0,1
4,9093-O,venus's reputation is challenging for humans t...,stance,1,1
...,...,...,...,...,...
195,10515-CST,"as technology advances, humans find themselves...",stance,0,4
196,11494-O,we were up is space circling mars snapping pic...,stance,1,2
197,11494-CST,we were up is space circling mars snapping pic...,stance,0,2
198,10117-O,how would you feel if we had a technlology to ...,stance,1,2


In [10]:
sentiment_positive_to_negative_df = collect_validated_flips(
    df=df,
    attribute="sentiment",
    from_class=0,  # POSITIVE
    classifier_fn=sentiment_classifier,
    label_mapping={"POSITIVE": 0, "NEGATIVE": 1},
    prefix="SE",
    target_count=100
)

sentiment_positive_to_negative_df

----- Attempt: 1 -----
Generating counterfactual... Attribute: sentiment. POSITIVE -> NEGATIVE
Counterfactual generated.
New label: NEGATIVE
Flip Attempt: POSITIVE → NEGATIVE | Classified as: 1
✓ - Correctly generated!
----- Attempt: 2 -----
Generating counterfactual... Attribute: sentiment. POSITIVE -> NEGATIVE
Counterfactual generated.
New label: NEGATIVE
Flip Attempt: POSITIVE → NEGATIVE | Classified as: 1
✓ - Correctly generated!
----- Attempt: 3 -----
Generating counterfactual... Attribute: sentiment. POSITIVE -> NEGATIVE
Counterfactual generated.
New label: POSITIVE
Flip Attempt: POSITIVE → NEGATIVE | Classified as: 0
X - Incorrectly generated!
----- Attempt: 4 -----
Generating counterfactual... Attribute: sentiment. POSITIVE -> NEGATIVE
Counterfactual generated.
New label: NEGATIVE
Flip Attempt: POSITIVE → NEGATIVE | Classified as: 1
✓ - Correctly generated!
----- Attempt: 5 -----
Generating counterfactual... Attribute: sentiment. POSITIVE -> NEGATIVE
Counterfactual generated.
N

Unnamed: 0,essay_code,full_text,attribute,label,score_og
0,7899-O,the author supports his ideas very well becaus...,sentiment,0,1
1,7899-CSE,the author fails to adequately support his ide...,sentiment,1,1
2,10091-O,the ways emontions work is the way we are feel...,sentiment,0,1
3,10091-CSE,the ways emontions work is the way we are feel...,sentiment,1,1
4,5007-O,"""hello, i am luke bomberger, i have traveled a...",sentiment,0,1
...,...,...,...,...,...
195,23979-CSE,"quite often, we find ourselves needing advice ...",sentiment,1,4
196,20777-O,"dear principal, 02/18/11\n\ni believe that (po...",sentiment,0,3
197,20777-CSE,"dear principal, 02/18/11\n\ni feel that (polic...",sentiment,1,3
198,12583-O,the face on mars was not created by aliens. th...,sentiment,0,4


In [11]:
sentiment_negative_to_positive_df = collect_validated_flips(
    df=df,
    attribute="sentiment",
    from_class=1,  # NEGATIVE
    classifier_fn=sentiment_classifier,
    label_mapping={"POSITIVE": 0, "NEGATIVE": 1},
    prefix="SE",
    target_count=100
)

sentiment_negative_to_positive_df

----- Attempt: 1 -----
Generating counterfactual... Attribute: sentiment. NEGATIVE -> POSITIVE
Counterfactual generated.
New label: POSITIVE
Flip Attempt: NEGATIVE → POSITIVE | Classified as: 0
✓ - Correctly generated!
----- Attempt: 2 -----
Generating counterfactual... Attribute: sentiment. NEGATIVE -> POSITIVE
Counterfactual generated.
New label: POSITIVE
Flip Attempt: NEGATIVE → POSITIVE | Classified as: 0
✓ - Correctly generated!
----- Attempt: 3 -----
Generating counterfactual... Attribute: sentiment. NEGATIVE -> POSITIVE
Counterfactual generated.
New label: POSITIVE
Flip Attempt: NEGATIVE → POSITIVE | Classified as: 0
✓ - Correctly generated!
----- Attempt: 4 -----
Generating counterfactual... Attribute: sentiment. NEGATIVE -> POSITIVE
Counterfactual generated.
New label: POSITIVE
Flip Attempt: NEGATIVE → POSITIVE | Classified as: 0
✓ - Correctly generated!
----- Attempt: 5 -----
Generating counterfactual... Attribute: sentiment. NEGATIVE -> POSITIVE
Counterfactual generated.
New

Unnamed: 0,essay_code,full_text,attribute,label,score_og
0,7790-O,here's why the author thinks venus is a worthy...,sentiment,1,1
1,7790-CSE,Here's why the author thinks venus is a worthy...,sentiment,0,1
2,19550-O,"the electoral college is a process, not a plac...",sentiment,1,1
3,19550-CSE,"the electoral college is a process, not a plac...",sentiment,0,1
4,2263-O,"to the fellow citizens using cars, its very ea...",sentiment,1,1
...,...,...,...,...,...
195,22432-CSE,some schools offer distance learning as an opt...,sentiment,0,4
196,844-O,phones & driving\n\ntexting and driving is a m...,sentiment,1,4
197,844-CSE,phones & driving\n\ntexting and driving is a s...,sentiment,0,4
198,12672-O,i belive that the face on mars is just a natur...,sentiment,1,3


In [None]:
formality_formal_to_informal_df = collect_validated_flips(
    df=df,
    attribute="formality",
    from_class=0,  # FORMAL
    classifier_fn=formality_classifier,
    label_mapping={"FORMAL": 0, "INFORMAL": 1},
    prefix="FO",
    target_count=100
)

formality_formal_to_informal_df

In [None]:
formality_informal_to_formal_df = collect_validated_flips(
    df=df,
    attribute="formality",
    from_class=1,  # INFORMAL
    classifier_fn=formality_classifier,
    label_mapping={"FORMAL": 0, "INFORMAL": 1},
    prefix="FO",
    target_count=100
)

formality_informal_to_formal_df

In [13]:
import os

os.makedirs("counterfactuals", exist_ok=True)

stance_pro_to_con_df.to_csv("counterfactuals/stance_pro_to_con.csv", index=False)
stance_con_to_pro_df.to_csv("counterfactuals/stance_con_to_pro.csv", index=False)

sentiment_positive_to_negative_df.to_csv("counterfactuals/sentiment_positive_to_negative.csv", index=False)
sentiment_negative_to_positive_df.to_csv("counterfactuals/sentiment_negative_to_positive.csv", index=False)

### Essay evaluation

In [61]:
# load datasets

stance_pro_to_con_df = pd.read_csv('counterfactuals/stance_pro_to_con.csv')
stance_con_to_pro_df = pd.read_csv('counterfactuals/stance_con_to_pro.csv')

sentiment_positive_to_negative_df = pd.read_csv('counterfactuals/sentiment_positive_to_negative.csv')
sentiment_negative_to_positive_df = pd.read_csv('counterfactuals/sentiment_negative_to_positive.csv')

In [62]:
from sklearn.metrics import cohen_kappa_score

RUBRIC = """
- SCORE OF 6: An essay in this category demonstrates clear and consistent mastery, although it may have a few minor errors. A typical essay effectively and insightfully develops a point of view on the issue and demonstrates outstanding critical thinking, using clearly appropriate examples, reasons, and other evidence to support its position; the essay is well organized and clearly focused, demonstrating clear coherence and smooth progression of ideas; the essay exhibits skillful use of language, using a varied, accurate, and apt vocabulary and demonstrates meaningful variety in sentence structure; the essay is free of most errors in grammar, usage, and mechanics.
- SCORE OF 5: An essay in this category demonstrates reasonably consistent mastery, although it will have occasional errors or lapses in quality. A typical essay effectively develops a point of view on the issue and demonstrates strong critical thinking, generally using appropriate examples, reasons, and other evidence to support its position; the essay is well organized and focused, demonstrating coherence and progression of ideas; the essay exhibits facility in the use of language, using appropriate vocabulary demonstrates variety in sentence structure; the essay is generally free of most errors in grammar, usage, and mechanics.
- SCORE OF 4: An essay in this category demonstrates adequate mastery, although it will have lapses in quality. A typical essay develops a point of view on the issue and demonstrates competent critical thinking, using adequate examples, reasons, and other evidence to support its position; the essay is generally organized and focused, demonstrating some coherence and progression of ideas exhibits adequate; the essay may demonstrate inconsistent facility in the use of language, using generally appropriate vocabulary demonstrates some variety in sentence structure; the essay may have some errors in grammar, usage, and mechanics.
- SCORE OF 3: An essay in this category demonstrates developing mastery, and is marked by ONE OR MORE of the following weaknesses: develops a point of view on the issue, demonstrating some critical thinking, but may do so inconsistently or use inadequate examples, reasons, or other evidence to support its position; the essay is limited in its organization or focus, or may demonstrate some lapses in coherence or progression of ideas displays; the essay may demonstrate facility in the use of language, but sometimes uses weak vocabulary or inappropriate word choice and/or lacks variety or demonstrates problems in sentence structure; the essay may contain an accumulation of errors in grammar, usage, and mechanics.
- SCORE OF 2: An essay in this category demonstrates little mastery, and is flawed by ONE OR MORE of the following weaknesses: develops a point of view on the issue that is vague or seriously limited, and demonstrates weak critical thinking, providing inappropriate or insufficient examples, reasons, or other evidence to support its position; the essay is poorly organized and/or focused, or demonstrates serious problems with coherence or progression of ideas; the essay displays very little facility in the use of language, using very limited vocabulary or incorrect word choice and/or demonstrates frequent problems in sentence structure; the essay contains errors in grammar, usage, and mechanics so serious that meaning is somewhat obscured.
- SCORE OF 1: An essay in this category demonstrates very little or no mastery, and is severely flawed by ONE OR MORE of the following weaknesses: develops no viable point of view on the issue, or provides little or no evidence to support its position; the essay is disorganized or unfocused, resulting in a disjointed or incoherent essay; the essay displays fundamental errors in vocabulary and/or demonstrates severe flaws in sentence structure; the essay contains pervasive errors in grammar, usage, or mechanics that persistently interfere with meaning.
"""

FEW_SHOT_EVAL = """
Example essay 1 of score "4":\n
"phones and driving\n\nin this world in which we live in, cell phones are a growing market as well as cars. the fact that we depend on cell phones throught the course of our day for numerous reasons.\n\ndrivers should be able to use their cell phones while driving because its easier to operate a phone in your hand than a cell phone that is not in your hand. emergencies can occur while driving and you need to report the emergerncy while driving. cell phones does not cause as big of a distrachon that other things that is done while driving.\n\na lot of people are in an uproar about driving and using a cell phone and somehmes it is overrated. trying to operate a cell phone with one hand while maintaining driving has no danger to it. a phone that is mounted on the holder that begins to ring is harder to operate answering, dialing and switching calls.\n\nmost hmes the mounhng is not secure or has defected parts which can cause more of an issue than having it to your ear. most people have older cars and cannot a? ord to upgrade and dont have the speaker ophon, which usually allows everyone surrounding your car to know you business.\n\ni cannot say it enough, emergencies happen in a ?ash. driving and you see a accident that needs emergency alenhon you need to be able to use the cell phone to call it in. things happen so quickly now and it could be your phone call that saves their life. also if a person is lost and no gps signal but calls can be made a person should be able to dial someone by hand and get direchons out of harms way. emergency is a big deal in cell phone usage.\n\ncell phones do not cause as much as a distrachon than people make out to be. as a driver a person should always be aware of the road and be able to mulh-task when using a cell phone. most hmes its other things besides a cell phone that cause a distrachon but blames the cell phone as a scapegoat. there could be test on the driver liscence test that we should take to see if were able to drive and talk on the phone instead of just banning it.\n\ntalking on a cell phone and driving a car may be a distrachon for some but not all. it should be our call on if we are focused enough to drive while talking. although some people are said to have had bad behavior while talking and driving some are very responsible. texhng and driving is di? erent from talking and driving and that should be the boundary. each person should be accountable for their achons just as speeding."\n\n
Example Essay 2 of score "3":
"phones\n\nmodern humans today are always on their phone. they are always on their phone more than 5 hours a day no stop .all they do is text back and forward and just have group chats on social media. they even do it while driving. they are some really bad consequences when stuff happens when it comes to a phone. some certain areas in the united states ban phones from class rooms just because of it.\n\nwhen people have phones, they know about certain apps that they have .apps like facebook twitter instagram and snapchat. so like if a friend moves away and you want to be in contact you can still be in contact by posting videos or text messages. people always have different ways how to communicate with a phone. phones have changed due to our generation.\n\ndriving is one of the way how to get around. people always be on their phones while doing it. which can cause serious problems. that's why there's a thing that's called no texting while driving. that's a really important thing to remember. some people still do it because they think it's stupid. no matter what they do they still have to obey it because that's the only way how did he save.\n\nsometimes on the news there is either an accident or a suicide. it might involve someone not looking where they're going or tweet that someone sent. it either injury or death. if a mysterious number says i'm going to kill you and they know where you live but you don't know the person's contact ,it makes you puzzled and make you start to freak out. which can end up really badly.\n\nphones are fine to use and it's also the best way to come over help. if you go through a problem and you can't find help you ,always have a phone there with you. even though phones are used almost every day as long as you're safe it would come into use if you get into trouble. make sure you do not be like this phone while you're in the middle of driving. the news always updated when people do something stupid around that involves their phones. the safest way is the best way to stay safe."\n\n
Example Essay 3 of score "2":\n
"should people drive with their cellphone or not?\n\nwe all love our phones now a days and basically ever since they were invented. everyone has one or wants one or the newest latest one .ever since they were invented in 1700s people have taken the full advantage of the telephone presidents and war sargents used them to win battles and etc. now the phone has evolved from tube looking phones to brick phones to flip phones to iphones and to smartphones. with all the new technology and apps like facetime snapchat3 games music social media there's almost no way anyone could ever put their phones down. but according to statistics in 2013 about 3,154 people died in accidents 424,000 were injured. in 2013, 10% of all drivers ages 15 to 19 involved in fatal accidents were reported to be distracted at the time of the crash.\n\nso the question stands do we need to drive while on your cellphone? with all the crashes maybe we should drive without our phones but with the increments in technology we mind as well just use them cars are able to call people just like phones, so if you can call people in your car and be distracted anyways when your calling whom ever. there putting almost smartphones in the car so you can most defiantly be distracted with that. the fact that they are and already are making cars able to drive by themselves is anther reason that is very distracting just as in tulsa the electrically powered car has pilot mode so it drives itself. so many people have been distracted from the road and driving because the car can drive itself even though the car can drive itself you still need to pay attention to the road.\n\ni do understand that you don't need to be looking down when driving and i do understand that phones are distracting. so you picking up a cellphone and calling someone can be distracting. also like i said in another essay i did there is a responsibility factor to firstly own a car and secondly to even own a phone. it might seem like it is not a responsibility owning a phone but it is when you own a phone you have personal information in it. if you were lose your phone and have never put a password or security code on it you can lose your information to whoever has your phone could be a hacker could be a random thieve.\n\nso really my answer is that it really depends on your responsibility level if you can't stay off your phone and it's life then you need to let someone drive but if you can be responsible while driving then i'd say you should us your phone responsibly. cause if you drive irresponsibly you should have your phone."\n
"""

def build_eval_prompt(text):
    return (
        f"You are an essay rater specializing in the evaluation of essays written by students from 6th to 12th grade. "
        f"Read and evaluate the essay: \n\n{FEW_SHOT_EVAL}\n"
        f"Essay to score:\n{text}\n\n"
        f"Assign it a score from 1 to 6, in increments of 1, based on this rubric:\n\n{RUBRIC}\n\n"
        f"Your response should be only a numeric value representing the score you gave."
    )

# function to evaluate all essays from a df
def evaluate_essays(df, model="llama3:8b"):
    scores = []

    for idx, row in df.iterrows():
        prompt = build_eval_prompt(row['full_text'])
        score_og = row['score_og']

        try:
            print("----------------")
            print("evaluating essay...")
            response = query_ollama(prompt, model=model)
            score = int(response.strip())

            if 1 <= score <= 6:
                # score is right
                print(f"[{idx}] evaluated. score: {score}. score_og: {score_og}")
                scores.append(score)
            else:
                print(f"invalid score from LLM at idx {idx}: {response}")
                scores.append(None)
        except Exception as e:
            print(f"error scoring essay at idx {idx}: {e}")
            scores.append(None)

    df["score_llm"] = scores

    return df

# computes qwk value from df using original score and llm score
def compute_qwk(df):
    df_clean = df.dropna(subset=["score_og", "score_llm"])

    # extract originals and counterfactuals
    originals = df_clean[df_clean["essay_code"].str.endswith("-O")]
    cf = df_clean[df_clean["essay_code"].str.contains("-C")]

    qwk_original = cohen_kappa_score(originals["score_og"], originals["score_llm"], weights='quadratic')
    qwk_cf = cohen_kappa_score(cf["score_og"], cf["score_llm"], weights='quadratic')

    print(f"QWK (Originals): {qwk_original:.4f}")
    print(f"QWK (Counterfactuals): {qwk_cf:.4f}")

    return qwk_original, qwk_cf

In [63]:
stance_pro_to_con_scored_df = evaluate_essays(stance_pro_to_con_df, 'gemma3:12b')

stance_pro_to_con_scored_df

----------------
evaluating essay...
[0] evaluated. score: 1. score_og: 1
----------------
evaluating essay...
[1] evaluated. score: 2. score_og: 1
----------------
evaluating essay...
[2] evaluated. score: 4. score_og: 1
----------------
evaluating essay...
[3] evaluated. score: 4. score_og: 1
----------------
evaluating essay...
[4] evaluated. score: 3. score_og: 1
----------------
evaluating essay...
[5] evaluated. score: 3. score_og: 1
----------------
evaluating essay...
[6] evaluated. score: 3. score_og: 1
----------------
evaluating essay...
[7] evaluated. score: 3. score_og: 1
----------------
evaluating essay...
[8] evaluated. score: 3. score_og: 2
----------------
evaluating essay...
[9] evaluated. score: 3. score_og: 2
----------------
evaluating essay...
[10] evaluated. score: 3. score_og: 2
----------------
evaluating essay...
[11] evaluated. score: 3. score_og: 2
----------------
evaluating essay...
[12] evaluated. score: 4. score_og: 2
----------------
evaluating essay..

Unnamed: 0,essay_code,full_text,attribute,label,score_og,score_llm
0,5370-O,if the story was told in first person then it ...,stance,0,1,1
1,5370-CST,if the story was told in first person then it ...,stance,1,1,2
2,9250-O,venus is the closest planet to earth in terms ...,stance,0,1,4
3,9250-CST,venus is the closest planet to earth in terms ...,stance,1,1,4
4,9377-O,"using technology to read student,s emotional e...",stance,0,1,3
...,...,...,...,...,...,...
195,13779-CST,i think that students ought *not* to have ther...,stance,1,2,3
196,9024-O,"the author supports the idea ""that studying ve...",stance,0,4,5
197,9024-CST,"the author supports the idea ""that studying ve...",stance,1,4,4
198,7453-O,there has been a increase over the past few ye...,stance,0,4,4


In [64]:
stance_con_to_pro_scored_df = evaluate_essays(stance_con_to_pro_df, 'gemma3:12b')

stance_con_to_pro_scored_df

----------------
evaluating essay...
[0] evaluated. score: 2. score_og: 1
----------------
evaluating essay...
[1] evaluated. score: 2. score_og: 1
----------------
evaluating essay...
[2] evaluated. score: 4. score_og: 1
----------------
evaluating essay...
[3] evaluated. score: 4. score_og: 1
----------------
evaluating essay...
[4] evaluated. score: 4. score_og: 1
----------------
evaluating essay...
[5] evaluated. score: 5. score_og: 1
----------------
evaluating essay...
[6] evaluated. score: 3. score_og: 1
----------------
evaluating essay...
[7] evaluated. score: 3. score_og: 1
----------------
evaluating essay...
[8] evaluated. score: 2. score_og: 2
----------------
evaluating essay...
[9] evaluated. score: 3. score_og: 2
----------------
evaluating essay...
[10] evaluated. score: 3. score_og: 2
----------------
evaluating essay...
[11] evaluated. score: 5. score_og: 2
----------------
evaluating essay...
[12] evaluated. score: 3. score_og: 2
----------------
evaluating essay..

Unnamed: 0,essay_code,full_text,attribute,label,score_og,score_llm
0,20049-O,"dear senator,\n\ni was meaning to ask you can ...",stance,1,1,2
1,20049-CST,"dear senator,\n\ni was meaning to ask you plea...",stance,0,1,2
2,17483-O,"in the article ""driverless cars are coming"" th...",stance,1,1,4
3,17483-CST,"in the article ""driverless cars are coming"" th...",stance,0,1,4
4,9093-O,venus's reputation is challenging for humans t...,stance,1,1,4
...,...,...,...,...,...,...
195,10515-CST,"as technology advances, humans find themselves...",stance,0,4,6
196,11494-O,we were up is space circling mars snapping pic...,stance,1,2,3
197,11494-CST,we were up is space circling mars snapping pic...,stance,0,2,3
198,10117-O,how would you feel if we had a technlology to ...,stance,1,2,3


In [65]:
sentiment_positive_to_negative_scored_df = evaluate_essays(sentiment_positive_to_negative_df, 'gemma3:12b')

sentiment_positive_to_negative_scored_df

----------------
evaluating essay...
[0] evaluated. score: 4. score_og: 1
----------------
evaluating essay...
[1] evaluated. score: 2. score_og: 1
----------------
evaluating essay...
[2] evaluated. score: 2. score_og: 1
----------------
evaluating essay...
[3] evaluated. score: 2. score_og: 1
----------------
evaluating essay...
[4] evaluated. score: 3. score_og: 1
----------------
evaluating essay...
[5] evaluated. score: 3. score_og: 1
----------------
evaluating essay...
[6] evaluated. score: 5. score_og: 1
----------------
evaluating essay...
[7] evaluated. score: 5. score_og: 1
----------------
evaluating essay...
[8] evaluated. score: 3. score_og: 2
----------------
evaluating essay...
[9] evaluated. score: 3. score_og: 2
----------------
evaluating essay...
[10] evaluated. score: 3. score_og: 2
----------------
evaluating essay...
[11] evaluated. score: 2. score_og: 2
----------------
evaluating essay...
[12] evaluated. score: 2. score_og: 2
----------------
evaluating essay..

Unnamed: 0,essay_code,full_text,attribute,label,score_og,score_llm
0,7899-O,the author supports his ideas very well becaus...,sentiment,0,1,4
1,7899-CSE,the author fails to adequately support his ide...,sentiment,1,1,2
2,10091-O,the ways emontions work is the way we are feel...,sentiment,0,1,2
3,10091-CSE,the ways emontions work is the way we are feel...,sentiment,1,1,2
4,5007-O,"""hello, i am luke bomberger, i have traveled a...",sentiment,0,1,3
...,...,...,...,...,...,...
195,23979-CSE,"quite often, we find ourselves needing advice ...",sentiment,1,4,3
196,20777-O,"dear principal, 02/18/11\n\ni believe that (po...",sentiment,0,3,3
197,20777-CSE,"dear principal, 02/18/11\n\ni feel that (polic...",sentiment,1,3,2
198,12583-O,the face on mars was not created by aliens. th...,sentiment,0,4,4


In [66]:
sentiment_negative_to_positive_scored_df = evaluate_essays(sentiment_negative_to_positive_df, 'gemma3:12b')

sentiment_negative_to_positive_scored_df

----------------
evaluating essay...
[0] evaluated. score: 3. score_og: 1
----------------
evaluating essay...
[1] evaluated. score: 6. score_og: 1
----------------
evaluating essay...
[2] evaluated. score: 4. score_og: 1
----------------
evaluating essay...
[3] evaluated. score: 5. score_og: 1
----------------
evaluating essay...
[4] evaluated. score: 2. score_og: 1
----------------
evaluating essay...
[5] evaluated. score: 3. score_og: 1
----------------
evaluating essay...
[6] evaluated. score: 2. score_og: 1
----------------
evaluating essay...
[7] evaluated. score: 2. score_og: 1
----------------
evaluating essay...
[8] evaluated. score: 3. score_og: 2
----------------
evaluating essay...
[9] evaluated. score: 4. score_og: 2
----------------
evaluating essay...
[10] evaluated. score: 2. score_og: 2
----------------
evaluating essay...
[11] evaluated. score: 2. score_og: 2
----------------
evaluating essay...
[12] evaluated. score: 3. score_og: 2
----------------
evaluating essay..

Unnamed: 0,essay_code,full_text,attribute,label,score_og,score_llm
0,7790-O,here's why the author thinks venus is a worthy...,sentiment,1,1,3
1,7790-CSE,Here's why the author thinks venus is a worthy...,sentiment,0,1,6
2,19550-O,"the electoral college is a process, not a plac...",sentiment,1,1,4
3,19550-CSE,"the electoral college is a process, not a plac...",sentiment,0,1,5
4,2263-O,"to the fellow citizens using cars, its very ea...",sentiment,1,1,2
...,...,...,...,...,...,...
195,22432-CSE,some schools offer distance learning as an opt...,sentiment,0,4,5
196,844-O,phones & driving\n\ntexting and driving is a m...,sentiment,1,4,5
197,844-CSE,phones & driving\n\ntexting and driving is a s...,sentiment,0,4,5
198,12672-O,i belive that the face on mars is just a natur...,sentiment,1,3,4


In [67]:
compute_qwk(stance_pro_to_con_scored_df)
compute_qwk(stance_con_to_pro_scored_df)
compute_qwk(sentiment_positive_to_negative_scored_df)
compute_qwk(sentiment_negative_to_positive_scored_df)

QWK (Originals): 0.4567
QWK (Counterfactuals): 0.1803
QWK (Originals): 0.5401
QWK (Counterfactuals): 0.2589
QWK (Originals): 0.4192
QWK (Counterfactuals): 0.2876
QWK (Originals): 0.6216
QWK (Counterfactuals): 0.2501


(np.float64(0.6215966774342409), np.float64(0.25005356760231423))

In [71]:
# Filter for essays where the LLM score differs from the original by 2 or more
large_diff_df = scored_df[stance_pro_to_con_scored_df.apply(lambda row: abs(row['score_llm'] - row['score_og']) >= 2, axis=1)]

# Preview or export the results
print(f"Found {len(large_diff_df)} essays with a score difference of 2 or more.")
large_diff_df

Found 54 essays with a score difference of 2 or more.


Unnamed: 0,essay_code,full_text,attribute,label,score_og,llm_score,score_llm
0,5370-O,if the story was told in first person then it ...,stance,0,1,4,1
2,9250-O,venus is the closest planet to earth in terms ...,stance,0,1,4,4
3,9250-CST,venus is the closest planet to earth in terms ...,stance,1,1,4,4
4,9377-O,"using technology to read student,s emotional e...",stance,0,1,4,3
5,9377-CST,using technology to read student's emotional e...,stance,1,1,3,3
6,5801-O,at first he worked at a to part time job a ban...,stance,0,1,4,3
7,5801-CST,at first he worked at a to part time job a ban...,stance,1,1,4,3
8,1700-O,how would life be with less usage of cars? som...,stance,0,2,4,3
9,1700-CST,how would life be with less usage of cars? som...,stance,1,2,4,3
10,5697-O,the sea cowboy\n\nwhy would anyone want to be ...,stance,0,2,4,3


In [68]:
stance_pro_to_con_scored_llama_df = evaluate_essays(stance_pro_to_con_df)
stance_con_to_pro_scored_llama_df = evaluate_essays(stance_con_to_pro_df)
sentiment_positive_to_negative_scored_llama_df = evaluate_essays(sentiment_positive_to_negative_df)
sentiment_negative_to_positive_scored_llama_df = evaluate_essays(sentiment_negative_to_positive_df)

----------------
evaluating essay...
[0] evaluated. score: 4. score_og: 1
----------------
evaluating essay...
[1] evaluated. score: 2. score_og: 1
----------------
evaluating essay...
[2] evaluated. score: 4. score_og: 1
----------------
evaluating essay...
[3] evaluated. score: 4. score_og: 1
----------------
evaluating essay...
[4] evaluated. score: 4. score_og: 1
----------------
evaluating essay...
[5] evaluated. score: 4. score_og: 1
----------------
evaluating essay...
[6] evaluated. score: 4. score_og: 1
----------------
evaluating essay...
[7] evaluated. score: 4. score_og: 1
----------------
evaluating essay...
[8] evaluated. score: 4. score_og: 2
----------------
evaluating essay...
[9] evaluated. score: 4. score_og: 2
----------------
evaluating essay...
[10] evaluated. score: 4. score_og: 2
----------------
evaluating essay...
[11] evaluated. score: 4. score_og: 2
----------------
evaluating essay...
[12] evaluated. score: 4. score_og: 2
----------------
evaluating essay..

In [70]:
compute_qwk(stance_pro_to_con_scored_llama_df)
compute_qwk(stance_con_to_pro_scored_llama_df)
compute_qwk(sentiment_positive_to_negative_scored_llama_df)
compute_qwk(sentiment_negative_to_positive_scored_llama_df)

QWK (Originals): 0.0050
QWK (Counterfactuals): 0.0774
QWK (Originals): 0.0265
QWK (Counterfactuals): 0.0044
QWK (Originals): 0.1160
QWK (Counterfactuals): 0.1251
QWK (Originals): 0.1067
QWK (Counterfactuals): 0.0563


(np.float64(0.10673645880746463), np.float64(0.056280027453672))