# PERSUADE 2.0 Dataset Notebook
## 2. Counterfactuals

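Generates counterfactual rewrites of essays from the PERSUADE 2.0 dataset (flipped stance, sentiment, or formality) and keeps only the rewrites that the corresponding classifier validates.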

### Load dataset

In [1]:
import pandas as pd
# load the tagged PERSUADE essays; per the displayed columns each row carries the essay text,
# its holistic score, and numeric stance / formality / sentiment labels
df = pd.read_csv('persuade/tagged_persuade.csv')
df

Unnamed: 0,essay_id_comp,full_text,holistic_essay_score,word_count,prompt_name,stance,formality,sentiment
0,423A1CA112E2,phones\r\n\r\nmodern humans today are always o...,3,378,Phones and driving,2.0,0,0.0
1,BC75783F96E3,this essay will explain if drivers should or s...,4,432,Phones and driving,1.0,0,0.0
2,74C8BC7417DE,driving while the use of cellular devices\r\n\...,2,179,Phones and driving,1.0,0,1.0
3,A8445CABFECE,phones & driving\r\n\r\ndrivers should not be ...,3,221,Phones and driving,1.0,1,1.0
4,6B4F7A0165B9,cell phone operation while driving\r\n\r\nthe ...,4,334,Phones and driving,1.0,0,1.0
...,...,...,...,...,...,...,...,...
25073,AFEC37C2D43F,there has been at least one point in everyone'...,5,547,Seeking multiple opinions,2.0,0,1.0
25074,D46BCB48440A,"when people ask for advice,they sometimes talk...",4,373,Seeking multiple opinions,0.0,1,0.0
25075,0FB0700DAF44,"during a group project, have you ever asked a ...",4,631,Seeking multiple opinions,0.0,0,0.0
25076,D72CB1C11673,making choices in life can be very difficult. ...,4,417,Seeking multiple opinions,0.0,0,0.0


### Classifiers

In [1]:
import requests

# numeric codes for the stance labels returned by the stance classifier
stance_mapping = dict(PRO=0, CON=1, NEUTRAL=2)

# calls the local Ollama generate endpoint with the given model and text prompt
def query_ollama(prompt, model, temperature=0):
    """Send `prompt` to the local Ollama server and return the generated text.

    Args:
        prompt: full text prompt to send.
        model: name of the Ollama model to run (e.g. "mistral").
        temperature: sampling temperature forwarded in the request `options`.

    Returns:
        The "response" field of the JSON reply, stripped of surrounding whitespace.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    response = requests.post(
        "http://localhost:11434/api/generate",
        json={"model": model,
              "prompt": prompt,
              "stream": False,  # ask for one JSON document instead of a token stream
              "options": {
                  "temperature": temperature
              }
        }
    )
    # surface HTTP failures (e.g. unknown model) here instead of failing on the
    # next line with an opaque KeyError for the missing "response" field
    response.raise_for_status()
    return response.json()["response"].strip()

# builds stance prompt
def get_stance_prompt(essay):
    """Return the few-shot stance-classification prompt for `essay`.

    The prompt consists of a task description, four labelled PRO/CON examples,
    an instruction to answer with a single word, and finally the essay under
    "statement:" followed by an open "stance:" line for the model to complete.
    """
    task_description = (
        "Stance classification is the task of determining the expressed or implied opinion, or stance, of a statement"
        " toward a certain, specified target. The following statements are social media posts expressing opinions about entities.\n"
        "Each statement can either be 'PRO' or 'CON' toward their associated entity.\n"
    )

    few_shot_examples = (
        "entity: Atheism\n"
        "statement: Leaving Christianity enables you to love the people you once rejected. #freethinker #Christianity #SemST\n"
        "stance: PRO\n"
        "entity: Feminist Movement\n"
        "statement: Always a delight to see chest-drumming alpha males hiss and scuttle backwards up the wall when a feminist enters the room. #manly #SemST\n"
        "stance: PRO\n"
        "entity: Christianity\n"
        "statement: AlharbiF I’ll bomb anything I can get my hands on, especially if THEY aren’t christian. #graham2016 #GOP #SemST\n"
        "stance: CON\n"
        "entity: Hillary Clinton\n"
        "statement: Would you wanna be in a long term relationship with some bitch that hides her emails, & lies to your face? Then #Dontvote #SemST\n"
        "stance: CON\n"
    )

    # the essay is the only variable part of the prompt
    request = (
        "Analyze the following statement and determine its stance towards the entity.\n"
        "Respond with a single word: 'PRO' or 'CON'. Only return the stance as a single word, and no other text.'\n"
        f"statement:\n{essay}\n"
        "stance:"
    )

    return task_description + few_shot_examples + request

# standalone stance classifier
def stance_classifier(text):
    """Classify the stance of `text` with the Mistral model served by Ollama.

    Args:
        text: essay (or any statement) to classify.

    Returns:
        The numeric code from `stance_mapping` for the first stance label
        ('PRO', 'CON' or 'NEUTRAL') found in the model reply, or None when the
        reply contains none of them.
    """
    import re  # local import keeps this definition self-contained

    prompt = get_stance_prompt(text)
    response = query_ollama(prompt, model="mistral")

    # Match whole words only: plain substring checks misread replies such as
    # "PRO - the author considers ..." (contains "CON") or "probably" (contains "PRO").
    match = re.search(r"\b(PRO|CON|NEUTRAL)\b", response.upper())
    if match is None:
        return None  # unclear
    return stance_mapping[match.group(1)]



In [6]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# load tokenizer and model weights for the formality classifier;
# `tokenizer` and `model` are module-level globals read by formality_classifier
model_name = 's-nlp/deberta-large-formality-ranker'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# output index -> label name, used to name the softmax scores in formality_classifier
# NOTE(review): assumes index 0 is "formal" for this checkpoint — confirm against the model card
id2formality = {0: "formal", 1: "informal"}

# standalone formality classifier
def formality_classifier(text, threshold=0.50):
    """Classify `text` as formal or informal with the loaded formality model.

    Args:
        text: text to classify; input beyond the tokenizer's maximum length is
            truncated (``truncation=True``).
        threshold: minimum softmax probability a class needs to be returned.

    Returns:
        0 if the "formal" probability exceeds `threshold`, 1 if the "informal"
        probability does, otherwise None (uncertain).
    """
    import torch  # already a runtime dependency: the tokenizer is asked for "pt" tensors

    # tokenization
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        return_token_type_ids=True,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

    # inference only: disable autograd so no computation graph is built per call
    with torch.no_grad():
        output = model(**encoding)
    scores = output.logits.softmax(dim=1)[0]  # get first (and only) row

    # name each probability using the index -> label table
    formality_score = {id2formality[idx]: score.item() for idx, score in enumerate(scores)}

    # apply a threshold
    if formality_score['formal'] > threshold:
        return 0  # formal
    elif formality_score['informal'] > threshold:
        return 1  # informal
    else:
        return None  # uncertain

In [5]:
from transformers import pipeline
# module-level sentiment pipeline, called by sentiment_classifier for every text
sentiment_analysis = pipeline("sentiment-analysis",model="siebert/sentiment-roberta-large-english")

# standalone sentiment classifier
def sentiment_classifier(text, threshold=0.90):
    """Label `text` as positive (0) or negative (1) using the sentiment pipeline.

    Only the first pipeline result is used. The label is returned only when its
    score is strictly greater than `threshold`; otherwise the result is None.
    """
    prediction = sentiment_analysis(text, truncation=True, max_length=512)[0]
    label, score = prediction['label'], prediction['score']

    # low-confidence predictions are reported as uncertain
    confident = score > threshold

    if confident and label == 'POSITIVE':
        return 0  # positive
    if confident and label == 'NEGATIVE':
        return 1  # negative
    return None  # uncertain

Device set to use cpu


### Generation

In [7]:
# constants and lookup dictionaries used to build the counterfactual prompt

# six-level holistic scoring rubric (scores 6 down to 1); inserted verbatim into the
# prompt by get_counterfactual_prompt, so any edit here changes what the LLM receives
RUBRIC = """
- SCORE OF 6: An essay in this category demonstrates clear and consistent mastery, although it may have a few minor errors. A typical essay effectively and insightfully develops a point of view on the issue and demonstrates outstanding critical thinking, using clearly appropriate examples, reasons, and other evidence to support its position; the essay is well organized and clearly focused, demonstrating clear coherence and smooth progression of ideas; the essay exhibits skillful use of language, using a varied, accurate, and apt vocabulary and demonstrates meaningful variety in sentence structure; the essay is free of most errors in grammar, usage, and mechanics.
- SCORE OF 5: An essay in this category demonstrates reasonably consistent mastery, although it will have occasional errors or lapses in quality. A typical essay effectively develops a point of view on the issue and demonstrates strong critical thinking, generally using appropriate examples, reasons, and other evidence to support its position; the essay is well organized and focused, demonstrating coherence and progression of ideas; the essay exhibits facility in the use of language, using appropriate vocabulary demonstrates variety in sentence structure; the essay is generally free of most errors in grammar, usage, and mechanics.
- SCORE OF 4: An essay in this category demonstrates adequate mastery, although it will have lapses in quality. A typical essay develops a point of view on the issue and demonstrates competent critical thinking, using adequate examples, reasons, and other evidence to support its position; the essay is generally organized and focused, demonstrating some coherence and progression of ideas exhibits adequate; the essay may demonstrate inconsistent facility in the use of language, using generally appropriate vocabulary demonstrates some variety in sentence structure; the essay may have some errors in grammar, usage, and mechanics.
- SCORE OF 3: An essay in this category demonstrates developing mastery, and is marked by ONE OR MORE of the following weaknesses: develops a point of view on the issue, demonstrating some critical thinking, but may do so inconsistently or use inadequate examples, reasons, or other evidence to support its position; the essay is limited in its organization or focus, or may demonstrate some lapses in coherence or progression of ideas displays; the essay may demonstrate facility in the use of language, but sometimes uses weak vocabulary or inappropriate word choice and/or lacks variety or demonstrates problems in sentence structure; the essay may contain an accumulation of errors in grammar, usage, and mechanics.
- SCORE OF 2: An essay in this category demonstrates little mastery, and is flawed by ONE OR MORE of the following weaknesses: develops a point of view on the issue that is vague or seriously limited, and demonstrates weak critical thinking, providing inappropriate or insufficient examples, reasons, or other evidence to support its position; the essay is poorly organized and/or focused, or demonstrates serious problems with coherence or progression of ideas; the essay displays very little facility in the use of language, using very limited vocabulary or incorrect word choice and/or demonstrates frequent problems in sentence structure; the essay contains errors in grammar, usage, and mechanics so serious that meaning is somewhat obscured.
- SCORE OF 1: An essay in this category demonstrates very little or no mastery, and is severely flawed by ONE OR MORE of the following weaknesses: develops no viable point of view on the issue, or provides little or no evidence to support its position; the essay is disorganized or unfocused, resulting in a disjointed or incoherent essay; the essay displays fundamental errors in vocabulary and/or demonstrates severe flaws in sentence structure; the essay contains pervasive errors in grammar, usage, or mechanics that persistently interfere with meaning.
"""

# label -> opposite label, one table per attribute; used to pick the rewrite target
stance_opposite = dict(PRO='CON', CON='PRO')

formality_opposite = dict(FORMAL='INFORMAL', INFORMAL='FORMAL')

sentiment_opposite = dict(POSITIVE='NEGATIVE', NEGATIVE='POSITIVE')

# attribute name -> explanatory text describing both classes of that attribute;
# get_counterfactual_prompt inserts the matching entry under "Context:" in the prompt,
# so these strings are prompt content and must be edited with care
attribute_context = {
    'stance': (
        "In a PRO stance, the author clearly supports the topic by providing strong reasons in favor of it, emphasizing benefits, positives, and supportive arguments.\n"
        "In a CON stance, the author clearly opposes the topic, focusing on drawbacks, risks, or negative consequences.\n"
        "Switching stance requires presenting the OPPOSITE arguments.\n"
    ),
    'formality': (
        "FORMAL writing is structured and objective. It uses academic language, avoids contractions (like 'don't' or 'can't'), and avoids casual expressions.\n"
        "It favors complex sentences, third-person point of view, and professional tone.\n"
        "INFORMAL writing is conversational, uses contractions and casual language, favors first-person or second-person, and may include slang or simple sentence structures.\n"
        "To change from FORMAL to INFORMAL, make the writing sound more like spoken language. Use really simple, repeated words and structures, expressions. Use contractions and slang terms. \n"
        "To change from INFORMAL to FORMAL, eliminate contractions, use more precise vocabulary, and avoid casual phrasing.\n"
    ),
    'sentiment': (
        "POSITIVE sentiment expresses approval, optimism, or positive feelings.\n"
        "NEGATIVE sentiment expresses disapproval, criticism, or negative feelings.\n"
    )
}

# function to build the counterfactual prompt to be fed to the LLM
def get_counterfactual_prompt(text, attribute, original_label, og_score):
    """Build the rewrite prompt that asks the LLM to flip one attribute of an essay.

    Args:
        text: the original essay.
        attribute: 'stance', 'formality' or 'sentiment'.
        original_label: current label name of the essay for `attribute`
            (e.g. 'PRO', 'FORMAL', 'POSITIVE'). A label missing from the
            opposite table yields the target "UNKNOWN".
        og_score: holistic score of the original essay, quoted in the prompt
            together with the rubric.

    Returns:
        The complete prompt string, ending with the essay text.

    Raises:
        ValueError: if `attribute` is not one of the supported attributes.
    """
    # The instruction carries no trailing period: the template below appends
    # ". " once, so every attribute ends its sentence with a single period.
    if attribute == 'stance':
        opposite_label = stance_opposite.get(original_label, "UNKNOWN")
        instruction = f"the current stance is {original_label}. Change it to {opposite_label}"
    elif attribute == 'formality':
        opposite_label = formality_opposite.get(original_label, "UNKNOWN")
        instruction = f"the current style is '{original_label}'. Change it to {opposite_label}"
    elif attribute == 'sentiment':
        opposite_label = sentiment_opposite.get(original_label, "UNKNOWN")
        instruction = f"the current sentiment is '{original_label}'. Change it to {opposite_label}"
    else:
        # fail with a clear message instead of a KeyError from the context lookup below
        raise ValueError(
            f"Unknown attribute {attribute!r}; expected 'stance', 'formality' or 'sentiment'."
        )

    return (
        "You are a skilled editor. Your task is to rewrite the following essay carefully. "
        f"Keep the structure, arguments, and topic the same, but {instruction}. "
        f"Context:\n{attribute_context[attribute]}\n\n"
        "Do not add new ideas or remove key points. Keep the length and organization similar.\n"
        "Keep the same level of language proficiency from the original essay, including misspellings, punctuation and capitalization.\n"
        f"This essay has a score of {og_score} points on a 6-point scale. Your rewriting must meet the exact criteria as the rubric details for a score of {og_score}. The marking rubric is defined as follows:\n{RUBRIC}\n\n"
        "Respond with the rewritten essay only.\n\n"
        f"Essay:\n{text}"
    )

# generates a counterfactual given text, attribute, original label and the essay score
def generate_counterfactual(text, attribute, original_label, og_score,
                            model="gemma3:12b", temperature=0.7):
    """Ask the LLM to rewrite `text` with the opposite label for `attribute`.

    Args:
        text: the original essay.
        attribute: 'stance', 'formality' or 'sentiment'.
        original_label: current label name of the essay for `attribute`.
        og_score: holistic score of the original essay.
        model: Ollama model used for the rewrite (defaults to the previously
            hard-coded "gemma3:12b").
        temperature: sampling temperature for the rewrite (defaults to the
            previously hard-coded 0.7).

    Returns:
        The model's reply, i.e. the rewritten essay text.
    """
    prompt = get_counterfactual_prompt(text, attribute, original_label, og_score)
    return query_ollama(prompt, model=model, temperature=temperature)

In [15]:
import random

# generates and validates a single counterfactual for one essay (helper of collect_validated_flips)
def _attempt_flip(row, idx, attribute, from_label, to_label, from_class, to_class,
                  classifier_fn, reverse_label_mapping, prefix):
    """Generate one counterfactual for `row` and validate it with `classifier_fn`.

    Returns:
        A two-element list ``[original_record, counterfactual_record]`` when the
        classifier assigns `to_class` to the rewritten essay, otherwise None
        (wrong class, uncertain classifier, or an error while generating/classifying).
    """
    original_text = row['full_text']
    og_score = row['holistic_essay_score']

    try:
        print(f"Generating counterfactual... Attribute: {attribute}. {from_label} -> {to_label}")
        flipped_text = generate_counterfactual(original_text, attribute, from_label, og_score)
        print("Counterfactual generated.")
        new_label = classifier_fn(flipped_text)
    except Exception as e:
        # deliberate best-effort: one failed generation must not abort a long run
        print(f"[!] Error at idx {idx}: {e}")
        return None

    # classifiers return None when uncertain; report that instead of raising KeyError
    print(f"New label: {reverse_label_mapping.get(new_label, 'UNCERTAIN')}")
    print(f"Flip Attempt: {from_label} → {to_label} | Classified as: {new_label}")

    if new_label != to_class:
        print("X - Incorrectly generated!")
        return None

    print("✓ - Correctly generated!")
    return [
        {
            "essay_code": f"{idx}-O",
            "full_text": original_text,
            "attribute": attribute,
            "label": from_class,
            "score_og": og_score,
        },
        {
            "essay_code": f"{idx}-C{prefix}",
            "full_text": flipped_text,
            "attribute": attribute,
            "label": to_class,
            "score_og": og_score,
        },
    ]


# generates x amount of counterfactuals and validates them using the given attribute classifier.
# essays are discarded if they do not align with correct classification.
# uses stratified sampling by holistic_essay_score attribute
def collect_validated_flips(
    df,
    attribute: str,                   # 'stance', 'formality', or 'sentiment'
    from_class: int,                  # e.g., 0 (for PRO)
    classifier_fn=None,               # e.g., stance_classifier
    label_mapping=None,               # e.g., {'PRO': 0, 'CON': 1, 'NEUTRAL': 2}
    prefix="ST",                      # e.g., 'ST'
    target_count=100,
    seed=None                         # optional seed for reproducible sampling order
):
    """Collect up to `target_count` classifier-validated counterfactual essays.

    Essays labelled `from_class` for `attribute` are rewritten towards the
    opposite label; a rewrite is kept only if `classifier_fn` assigns it the
    opposite class. A first pass samples essays stratified by
    `holistic_essay_score`; if it yields fewer than `target_count` flips, a
    second pass draws from the remaining essays of `from_class`, skipping
    essays that were already flipped so no essay appears twice.

    Args:
        df: frame with 'full_text', 'holistic_essay_score' and an `attribute` column.
        attribute: 'stance', 'formality' or 'sentiment'.
        from_class: numeric class of the essays to rewrite.
        classifier_fn: callable mapping a text to a numeric class (or None if uncertain).
        label_mapping: label name -> numeric class for `attribute`.
        prefix: suffix used in the counterfactual essay code (``"<idx>-C<prefix>"``).
        target_count: number of validated flips to collect.
        seed: if given, sampling order comes from ``random.Random(seed)``;
            otherwise the global `random` state is used, as before.

    Returns:
        DataFrame with two rows per validated flip (original, then counterfactual)
        and columns essay_code, full_text, attribute, label, score_og.

    Raises:
        ValueError: if `classifier_fn` or `label_mapping` is not provided.
        KeyError: if `attribute`, `from_class` or the opposite label is unknown.
    """
    if classifier_fn is None or label_mapping is None:
        raise ValueError("collect_validated_flips requires both classifier_fn and label_mapping")

    # reverse the label mapping
    reverse_label_mapping = {v: k for k, v in label_mapping.items()}

    from_label = reverse_label_mapping[from_class]
    to_label = {
        'stance': stance_opposite,
        'formality': formality_opposite,
        'sentiment': sentiment_opposite
    }[attribute][from_label]
    to_class = label_mapping[to_label]

    # keep using the module-level generator unless a seed is requested
    rng = random if seed is None else random.Random(seed)

    flip_kwargs = dict(
        attribute=attribute,
        from_label=from_label,
        to_label=to_label,
        from_class=from_class,
        to_class=to_class,
        classifier_fn=classifier_fn,
        reverse_label_mapping=reverse_label_mapping,
        prefix=prefix,
    )

    successful_flips = []
    flipped_indices = set()  # essays already flipped, so the second pass cannot duplicate them
    total_flips = 0
    total_attempts = 0       # counts every attempt, including ones that raised

    # stratified sampling by score: each score bin gets a share proportional to its size
    score_bins = {score: group.index.tolist() for score, group in df.groupby('holistic_essay_score')}
    bin_targets = {score: max(1, int(len(indices) / len(df) * target_count)) for score, indices in score_bins.items()}

    for score, indices in score_bins.items():
        if total_flips >= target_count:
            print(f"Done. Total flips is {total_flips} out of {target_count}")
            break

        rng.shuffle(indices)
        sample_count = 0

        for idx in indices:
            if sample_count >= bin_targets[score] or total_flips >= target_count:
                break

            row = df.loc[idx]
            if row[attribute] != from_class:
                continue

            total_attempts += 1
            print(f"----- Attempt: {total_attempts} -----")
            records = _attempt_flip(row, idx, **flip_kwargs)
            if records is not None:
                successful_flips.extend(records)
                flipped_indices.add(idx)
                sample_count += 1
                total_flips += 1

    print("---------------------------------------")
    print(f"Generated {total_flips} counterfactuals with {total_attempts} attempts.")

    # second pass to fill counterfactuals if target not reached
    if total_flips < target_count:
        print(f"\nSecond pass: filling {target_count - total_flips} missing flips...")

        remaining_indices = [
            idx for idx in df[df[attribute] == from_class].index.tolist()
            if idx not in flipped_indices
        ]
        rng.shuffle(remaining_indices)

        for idx in remaining_indices:
            if total_flips >= target_count:
                break

            total_attempts += 1
            print(f"----- Attempt: {total_attempts} -----")
            records = _attempt_flip(df.loc[idx], idx, **flip_kwargs)
            if records is not None:
                successful_flips.extend(records)
                flipped_indices.add(idx)
                total_flips += 1

    print(f"Generated {total_flips} counterfactuals")

    return pd.DataFrame(successful_flips)

In [16]:
# generate PRO -> CON flipped stance counterfactuals
# (the returned frame holds two rows per validated flip: the original essay, then its rewrite)

stance_pro_to_con_df = collect_validated_flips(
    df=df,
    attribute="stance",
    from_class=0,  # PRO
    classifier_fn=stance_classifier,
    label_mapping={"PRO": 0, "CON": 1, "NEUTRAL": 2},
    prefix="ST",
    target_count=100
)

stance_pro_to_con_df

----- Attempt: 1 -----
Generating counterfactual... Attribute: stance. PRO -> CON
Counterfactual generated.
New label: CON
Flip Attempt: PRO → CON | Classified as: 1
✓ - Correctly generated!
----- Attempt: 2 -----
Generating counterfactual... Attribute: stance. PRO -> CON
Counterfactual generated.
New label: CON
Flip Attempt: PRO → CON | Classified as: 1
✓ - Correctly generated!
----- Attempt: 3 -----
Generating counterfactual... Attribute: stance. PRO -> CON
Counterfactual generated.
New label: CON
Flip Attempt: PRO → CON | Classified as: 1
✓ - Correctly generated!
----- Attempt: 4 -----
Generating counterfactual... Attribute: stance. PRO -> CON
Counterfactual generated.
New label: CON
Flip Attempt: PRO → CON | Classified as: 1
✓ - Correctly generated!
----- Attempt: 5 -----
Generating counterfactual... Attribute: stance. PRO -> CON
Counterfactual generated.
New label: CON
Flip Attempt: PRO → CON | Classified as: 1
✓ - Correctly generated!
----- Attempt: 6 -----
Generating counterfact

Unnamed: 0,essay_code,full_text,attribute,label,score_og
0,15984-O,i would try to do better in school so i would ...,stance,0,1
1,15984-CST,i would try to do worse in school so i would n...,stance,1,1
2,1972-O,cars have been producing smog and causing prob...,stance,0,1
3,1972-CST,cars have been producing smog and causing prob...,stance,1,1
4,10486-O,im going to tell you if the facial action codi...,stance,0,1
...,...,...,...,...,...
195,2854-CST,"do you think you could live without your car, ...",stance,1,3
196,6896-O,at every school there are extracurricular acti...,stance,0,4
197,6896-CST,at every school there are extracurricular acti...,stance,1,4
198,14704-O,"dear principal,\r\n\r\ni have heard around tha...",stance,0,4


In [17]:
# generate CON -> PRO flipped stance counterfactuals
# (rewrites are kept only when stance_classifier labels them PRO)

stance_con_to_pro_df = collect_validated_flips(
    df=df,
    attribute="stance",
    from_class=1,  # CON
    classifier_fn=stance_classifier,
    label_mapping={"PRO": 0, "CON": 1, "NEUTRAL": 2},
    prefix="ST",
    target_count=100
)

stance_con_to_pro_df

----- Attempt: 1 -----
Generating counterfactual... Attribute: stance. CON -> PRO
Counterfactual generated.
New label: PRO
Flip Attempt: CON → PRO | Classified as: 0
✓ - Correctly generated!
----- Attempt: 2 -----
Generating counterfactual... Attribute: stance. CON -> PRO
Counterfactual generated.
New label: PRO
Flip Attempt: CON → PRO | Classified as: 0
✓ - Correctly generated!
----- Attempt: 3 -----
Generating counterfactual... Attribute: stance. CON -> PRO
Counterfactual generated.
New label: PRO
Flip Attempt: CON → PRO | Classified as: 0
✓ - Correctly generated!
----- Attempt: 4 -----
Generating counterfactual... Attribute: stance. CON -> PRO
Counterfactual generated.
New label: PRO
Flip Attempt: CON → PRO | Classified as: 0
✓ - Correctly generated!
----- Attempt: 5 -----
Generating counterfactual... Attribute: stance. CON -> PRO
Counterfactual generated.
New label: CON
Flip Attempt: CON → PRO | Classified as: 1
X - Incorrectly generated!
----- Attempt: 6 -----
Generating counterfa

Unnamed: 0,essay_code,full_text,attribute,label,score_og
0,8603-O,studying venus is something that you would hav...,stance,1,1
1,8603-CST,studying venus is something that you absolutel...,stance,0,1
2,18824-O,the electoral college is a way of voting for p...,stance,1,1
3,18824-CST,the electoral college is a way of voting for p...,stance,0,1
4,19381-O,i think we should keep the electoral collage b...,stance,1,1
...,...,...,...,...,...
195,3623-CST,summer projects are a great way to keep studen...,stance,0,5
196,15235-O,"dear principal,\r\n\r\ni believe that having a...",stance,1,4
197,15235-CST,"dear principal,\n\ni believe that having a b a...",stance,0,4
198,994-O,phones and driving\r\n\r\nalthough cell phones...,stance,1,3


In [18]:
# generate POSITIVE -> NEGATIVE flipped sentiment counterfactuals
# (rewrites are kept only when sentiment_classifier labels them NEGATIVE; "SE" marks the essay codes)

sentiment_positive_to_negative_df = collect_validated_flips(
    df=df,
    attribute="sentiment",
    from_class=0,  # POSITIVE
    classifier_fn=sentiment_classifier,
    label_mapping={"POSITIVE": 0, "NEGATIVE": 1},
    prefix="SE",
    target_count=100
)

sentiment_positive_to_negative_df

----- Attempt: 1 -----
Generating counterfactual... Attribute: sentiment. POSITIVE -> NEGATIVE
Counterfactual generated.
New label: NEGATIVE
Flip Attempt: POSITIVE → NEGATIVE | Classified as: 1
✓ - Correctly generated!
----- Attempt: 2 -----
Generating counterfactual... Attribute: sentiment. POSITIVE -> NEGATIVE
Counterfactual generated.
New label: POSITIVE
Flip Attempt: POSITIVE → NEGATIVE | Classified as: 0
X - Incorrectly generated!
----- Attempt: 3 -----
Generating counterfactual... Attribute: sentiment. POSITIVE -> NEGATIVE
Counterfactual generated.
New label: NEGATIVE
Flip Attempt: POSITIVE → NEGATIVE | Classified as: 1
✓ - Correctly generated!
----- Attempt: 4 -----
Generating counterfactual... Attribute: sentiment. POSITIVE -> NEGATIVE
Counterfactual generated.
New label: NEGATIVE
Flip Attempt: POSITIVE → NEGATIVE | Classified as: 1
✓ - Correctly generated!
----- Attempt: 5 -----
Generating counterfactual... Attribute: sentiment. POSITIVE -> NEGATIVE
Counterfactual generated.
N

Unnamed: 0,essay_code,full_text,attribute,label,score_og
0,5756-O,luke bomberger was just a normal high school g...,sentiment,0,1
1,5756-CSE,luke bomberger was just a normal high school g...,sentiment,1,1
2,8842-O,because he said the sun is more colse from us ...,sentiment,0,1
3,8842-CSE,because he insisted the sun was more colse fro...,sentiment,1,1
4,11281-O,i'm fore the value of using this technology to...,sentiment,0,1
...,...,...,...,...,...
195,5892-CSE,luke is decidedly not courageous because he jo...,sentiment,1,1
196,16245-O,i personally believe that driverless cars are ...,sentiment,0,3
197,16245-CSE,i personally believe that driverless cars are ...,sentiment,1,3
198,16939-O,the further development of driverless cars sho...,sentiment,0,5


In [19]:
# generate NEGATIVE -> POSITIVE flipped sentiment counterfactuals
# (rewrites are kept only when sentiment_classifier labels them POSITIVE)

sentiment_negative_to_positive_df = collect_validated_flips(
    df=df,
    attribute="sentiment",
    from_class=1,  # NEGATIVE
    classifier_fn=sentiment_classifier,
    label_mapping={"POSITIVE": 0, "NEGATIVE": 1},
    prefix="SE",
    target_count=100
)

sentiment_negative_to_positive_df

----- Attempt: 1 -----
Generating counterfactual... Attribute: sentiment. NEGATIVE -> POSITIVE
Counterfactual generated.
New label: POSITIVE
Flip Attempt: NEGATIVE → POSITIVE | Classified as: 0
✓ - Correctly generated!
----- Attempt: 2 -----
Generating counterfactual... Attribute: sentiment. NEGATIVE -> POSITIVE
Counterfactual generated.
New label: POSITIVE
Flip Attempt: NEGATIVE → POSITIVE | Classified as: 0
✓ - Correctly generated!
----- Attempt: 3 -----
Generating counterfactual... Attribute: sentiment. NEGATIVE -> POSITIVE
Counterfactual generated.
New label: POSITIVE
Flip Attempt: NEGATIVE → POSITIVE | Classified as: 0
✓ - Correctly generated!
----- Attempt: 4 -----
Generating counterfactual... Attribute: sentiment. NEGATIVE -> POSITIVE
Counterfactual generated.
New label: POSITIVE
Flip Attempt: NEGATIVE → POSITIVE | Classified as: 0
✓ - Correctly generated!
----- Attempt: 5 -----
Generating counterfactual... Attribute: sentiment. NEGATIVE -> POSITIVE
Counterfactual generated.
New

Unnamed: 0,essay_code,full_text,attribute,label,score_og
0,1451-O,no one is evan going to read these things so t...,sentiment,1,1
1,1451-CSE,no one is evan going to read these things so t...,sentiment,0,1
2,23542-O,do you think student would benefit from being ...,sentiment,1,1
3,23542-CSE,do you think student would benefit from being ...,sentiment,0,1
4,8311-O,-\r\n\r\nvenus is an second planet for the sun...,sentiment,1,1
...,...,...,...,...,...
195,17151-CSE,driverless cars are coming to the roads and th...,sentiment,0,2
196,7663-O,"the article ""the challenge of exploring venus""...",sentiment,1,4
197,7663-CSE,"the article ""the challenge of exploring venus""...",sentiment,0,4
198,6606-O,have you ever done something you didn't want t...,sentiment,1,4


In [20]:
# generate FORMAL -> INFORMAL flipped formality counterfactuals
# (rewrites are kept only when formality_classifier labels them INFORMAL; "FO" marks the essay codes)

formality_formal_to_informal_df = collect_validated_flips(
    df=df,
    attribute="formality",
    from_class=0,  # FORMAL
    classifier_fn=formality_classifier,
    label_mapping={"FORMAL": 0, "INFORMAL": 1},
    prefix="FO",
    target_count=100
)

formality_formal_to_informal_df

----- Attempt: 1 -----
Generating counterfactual... Attribute: formality. FORMAL -> INFORMAL
Counterfactual generated.
New label: FORMAL
Flip Attempt: FORMAL → INFORMAL | Classified as: 0
X - Incorrectly generated!
----- Attempt: 2 -----
Generating counterfactual... Attribute: formality. FORMAL -> INFORMAL
Counterfactual generated.
New label: FORMAL
Flip Attempt: FORMAL → INFORMAL | Classified as: 0
X - Incorrectly generated!
----- Attempt: 3 -----
Generating counterfactual... Attribute: formality. FORMAL -> INFORMAL
Counterfactual generated.
New label: INFORMAL
Flip Attempt: FORMAL → INFORMAL | Classified as: 1
✓ - Correctly generated!
----- Attempt: 4 -----
Generating counterfactual... Attribute: formality. FORMAL -> INFORMAL
Counterfactual generated.
New label: INFORMAL
Flip Attempt: FORMAL → INFORMAL | Classified as: 1
✓ - Correctly generated!
----- Attempt: 5 -----
Generating counterfactual... Attribute: formality. FORMAL -> INFORMAL
Counterfactual generated.
New label: INFORMAL
F

Unnamed: 0,essay_code,full_text,attribute,label,score_og
0,1278-O,the selctions i just read contain one common s...,formality,0,1
1,1278-CFO,the stuff i just read all talks about one thin...,formality,1,1
2,12401-O,many people might think that the face on mars ...,formality,0,1
3,12401-CFO,a lot of folks think the face on Mars was made...,formality,1,1
4,9316-O,"in ""the challenge of exploring venus,"" the aut...",formality,0,1
...,...,...,...,...,...
195,4896-CFO,"in the story ""a cowboy who rode the waves"" it ...",formality,1,4
196,24883-O,why is it impirtant to have multiple people gi...,formality,0,3
197,24883-CFO,why is it impirtant to have multiple people gi...,formality,1,3
198,22572-O,do you think that students would benefit from ...,formality,0,3


In [21]:
# generate INFORMAL -> FORMAL flipped formality counterfactuals
# (rewrites are kept only when formality_classifier labels them FORMAL)

formality_informal_to_formal_df = collect_validated_flips(
    df=df,
    attribute="formality",
    from_class=1,  # INFORMAL
    classifier_fn=formality_classifier,
    label_mapping={"FORMAL": 0, "INFORMAL": 1},
    prefix="FO",
    target_count=100
)

formality_informal_to_formal_df

----- Attempt: 1 -----
Generating counterfactual... Attribute: formality. INFORMAL -> FORMAL
Counterfactual generated.
New label: FORMAL
Flip Attempt: INFORMAL → FORMAL | Classified as: 0
✓ - Correctly generated!
----- Attempt: 2 -----
Generating counterfactual... Attribute: formality. INFORMAL -> FORMAL
Counterfactual generated.
New label: FORMAL
Flip Attempt: INFORMAL → FORMAL | Classified as: 0
✓ - Correctly generated!
----- Attempt: 3 -----
Generating counterfactual... Attribute: formality. INFORMAL -> FORMAL
Counterfactual generated.
New label: FORMAL
Flip Attempt: INFORMAL → FORMAL | Classified as: 0
✓ - Correctly generated!
----- Attempt: 4 -----
Generating counterfactual... Attribute: formality. INFORMAL -> FORMAL
Counterfactual generated.
New label: FORMAL
Flip Attempt: INFORMAL → FORMAL | Classified as: 0
✓ - Correctly generated!
----- Attempt: 5 -----
Generating counterfactual... Attribute: formality. INFORMAL -> FORMAL
Counterfactual generated.
New label: FORMAL
Flip Attemp

Unnamed: 0,essay_code,full_text,attribute,label,score_og
0,1859-O,"the passage ""in german suburb, life goes on wi...",formality,1,1
1,1859-CFO,"The passage ""in german suburb, life goes on wi...",formality,0,1
2,19152-O,to me i think we should change the election by...,formality,1,1
3,19152-CFO,"To me, I believe that we should alter the elec...",formality,0,1
4,8542-O,the challenge of exploring venus\r\n\r\non thi...,formality,1,1
...,...,...,...,...,...
195,18967-CFO,"candidate, what do you think when I say that w...",formality,0,1
196,19652-O,"dear senator of state,\r\n\r\ni think keeping ...",formality,1,1
197,19652-CFO,"Dear Senator of State,\n\nI believe that maint...",formality,0,1
198,9618-O,"you are smile, but you only 80 percent happy, ...",formality,1,3


In [22]:
import os

# save counterfactuals to its folder, one CSV file per flip direction
output_dir = "counterfactuals"
os.makedirs(output_dir, exist_ok=True)

# file stem -> frame of validated flips produced by the cells above
flip_frames = {
    "stance_pro_to_con": stance_pro_to_con_df,
    "stance_con_to_pro": stance_con_to_pro_df,
    "sentiment_positive_to_negative": sentiment_positive_to_negative_df,
    "sentiment_negative_to_positive": sentiment_negative_to_positive_df,
    "formality_formal_to_informal": formality_formal_to_informal_df,
    "formality_informal_to_formal": formality_informal_to_formal_df,
}

for file_stem, flips in flip_frames.items():
    flips.to_csv(f"{output_dir}/{file_stem}.csv", index=False)