# CAES Dataset Notebook
## 2. Counterfactuals

Tests related to the CAES essay dataset.

### Load dataset

In [17]:
import pandas as pd
# load dataset
df = pd.read_csv('caes/tagged_caes.csv')
df

Unnamed: 0,essay_id,full_text,score,word_count,prompt_name,stance,formality,sentiment
0,0,hola carlos ! hace mucho que no te veo hombre ...,B1,212,Carta amigo,0.0,1.0,1
1,1,"hola querida sara , espero que estes muy bien ...",A2,175,Carta amigo,0.0,1.0,0
2,2,como estais ? ! hola amigos ! ? espero que tod...,A1,104,Postal vacaciones,2.0,1.0,0
3,3,"cuando llegué a el destino , le propuso que si...",A2,135,Postal vacaciones,0.0,1.0,0
4,4,¿ que tal ? un beso espero que todo va bien po...,A1,80,Familia,0.0,1.0,0
...,...,...,...,...,...,...,...,...
26648,26648,buenas tardes ! 29_de_mayo_de_2019 salamanca q...,B1,299,Carta amigo,0.0,1.0,0
26649,26649,tengo la seguridad de mí mismo para contener e...,B2,157,Solicitud admisión,0.0,,0
26650,26650,"distinguidos señor / señra : hola , gerente . ...",B1,178,Reclamación compañía aérea,0.0,1.0,1
26651,26651,le escribo esta carta porque me gustaría inmen...,B2,303,Solicitud admisión,0.0,0.0,2


### Classifiers

In [18]:
import requests

# convert results list to numeric labels
stance_mapping = {
    'PRO': 0,
    'CON': 1,
    'NEUTRAL': 2
}

# calls local Ollama API Mistral instance with a defined text prompt
def query_ollama(prompt, model, temperature=0):
    response = requests.post(
        "http://localhost:11434/api/generate",
        json={"model": model,
              "prompt": prompt,
              "stream": False,
              "options": {
                  "temperature": temperature
              }
        }
    )
    return response.json()["response"].strip()

# builds stance prompt
def get_stance_prompt(essay):
    return (
        "Stance classification is the task of determining the expressed or implied opinion, or stance, of a statement"
        " toward a certain, specified target. The following statements are social media posts expressing opinions about entities.\n"
        "Each statement can either be 'PRO' or 'CON' toward their associated entity.\n"
        "entity: Atheism\n"
        "statement: Leaving Christianity enables you to love the people you once rejected. #freethinker #Christianity #SemST\n"
        "stance: PRO\n"
        "entity: Feminist Movement\n"
        "statement: Always a delight to see chest-drumming alpha males hiss and scuttle backwards up the wall when a feminist enters the room. #manly #SemST\n"
        "stance: PRO\n"
        "entity: Christianity\n"
        "statement: AlharbiF I’ll bomb anything I can get my hands on, especially if THEY aren’t christian. #graham2016 #GOP #SemST\n"
        "stance: CON\n"
        "entity: Hillary Clinton\n"
        "statement: Would you wanna be in a long term relationship with some bitch that hides her emails, & lies to your face? Then #Dontvote #SemST\n"
        "stance: CON\n"
        "Analyze the following statement and determine its stance towards the entity.\n"
        "Respond with a single word: 'PRO' or 'CON'. Only return the stance as a single word, and no other text.'\n"
        f"statement:\n{essay}\n"
        "stance:"
    )

# standalone stance classifier
def stance_classifier(text):
    prompt = get_stance_prompt(text)
    response = query_ollama(prompt, model="mistral")

    response_upper = response.upper()
    if 'CON' in response_upper:
        return stance_mapping['CON']
    elif 'PRO' in response_upper:
        return stance_mapping['PRO']
    elif 'NEUTRAL' in response_upper:
        return stance_mapping['NEUTRAL']
    else:
        return None  # unclear

In [19]:
from transformers import pipeline

# load model
pipe = pipeline("text-classification", model="LenDigLearn/formality-classifier-mdeberta-v3-base")

id2formality = {"formal": 0, "informal": 1, "neutral": 2}

def formality_classifier(text, threshold=0.50):
    label, score = pipe(text)[0].values()
    label_idx = id2formality[label]

    if score > threshold:
        return label_idx
    else:
        return None

Device set to use cpu


In [20]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_id = "clapAI/roberta-large-multilingual-sentiment"
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id, torch_dtype=torch.float16)

model.to(device)
model.eval()

# Retrieve labels from the model's configuration
id2label = model.config.id2label
id2sentiment = {"positive": 0, "negative": 1, "neutral": 2}

def sentiment_classifier(text, threshold=0.90):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)

    # Perform inference in inference mode
    with torch.inference_mode():
        outputs = model(**inputs)
        predictions = outputs.logits.argmax(dim=-1)
    result_str = id2label[predictions.item()]
    return id2sentiment[result_str]


### Generation

In [33]:
# dictionaries to be used in the prompt

RUBRIC = """
Here is a high level description of the knowledge that students of each CEFR level are expected to have.
- A student of level pre-A1 can give basic personal information (e.g. name, address, nationality), perhaps with the use of a dictionary.
- A student of level A1 can give information about matters of personal relevance (e.g. likes and dislikes, family, pets) using simple words/signs and basic expressions; can produce simple isolated phrases and sentences.
- A student of level A2 can produce a series of simple phrases and sentences linked with simple connectors like “and”, “but” and “because”.
- A student of level B1 can produce straightforward connected texts on a range of familiar subjects within their field of interest, by linking a series of shorter discrete elements into a linear sequence.
- A student of level B2 can produce clear, detailed texts on a variety of subjects related to their field of interest, synthesising and evaluating information and arguments from a number of sources.
- A student of level C1 can produce clear, well-structured texts of complex subjects, underlining the relevant issues, expanding and supporting points of view at some length with subsidiary points, reasons and relevant examples, and rounding off with an appropriate conclusion; can employ the structure, vocabulary and register of genres, varying the tone, style and structure in an appropriate way for the target reader.
- A student of level C2 can write clear, smoothly flowing, complex texts in an appropriate and effective style and a logical structure which helps the reader identify significant points.
"""

stance_opposite = {
    'PRO': 'CON',
    'CON': 'PRO',
}

formality_opposite = {
    'FORMAL': 'INFORMAL',
    'INFORMAL': 'FORMAL'
}

sentiment_opposite = {
    'POSITIVE': 'NEGATIVE',
    'NEGATIVE': 'POSITIVE'
}

attribute_context = {
    'stance': (
        "In a PRO stance, the author clearly supports the topic by providing strong reasons in favor of it, emphasizing benefits, positives, and supportive arguments.\n"
        "In a CON stance, the author clearly opposes the topic, focusing on drawbacks, risks, or negative consequences.\n"
        "Switching stance requires presenting the OPPOSITE arguments.\n"
    ),
    'formality': (
        "FORMAL writing is structured and objective. It uses academic language, avoids contractions (like 'don't' or 'can't'), and avoids casual expressions.\n"
        "It favors complex sentences, third-person point of view, and professional tone.\n"
        "INFORMAL writing is conversational, uses contractions and casual language, favors first-person or second-person, and may include slang or simple sentence structures.\n"
        "To change from FORMAL to INFORMAL, make the writing sound more like spoken language. Use really simple, repeated words and structures, expressions. Use contractions and slang terms. \n"
        "To change from INFORMAL to FORMAL, eliminate contractions, use more precise vocabulary, and avoid casual phrasing.\n"
    ),
    'sentiment': (
        "POSITIVE sentiment expresses approval, optimism, or positive feelings.\n"
        "NEGATIVE sentiment expresses disapproval, criticism, or negative feelings.\n"
    )
}

# function to build the counterfactual prompt to be fed to the LLM
def get_counterfactual_prompt(text, attribute, original_label, og_score):
    if attribute == 'stance':
        opposite_label = stance_opposite.get(original_label, "UNKNOWN")
        instruction = f"the current stance is {original_label}. Change it to {opposite_label}"
    elif attribute == 'formality':
        opposite_label = formality_opposite.get(original_label, "UNKNOWN")
        instruction = f"the current style is '{original_label}'. Change it to {opposite_label}."
    elif attribute == 'sentiment':
        opposite_label = sentiment_opposite.get(original_label, "UNKNOWN")
        instruction = f"the current sentiment is '{original_label}'. Change it to {opposite_label}."
    else:
        instruction = "Unknown attribute."

    return (
        "You are a skilled editor. Your task is to rewrite the following essay carefully."
        f"Keep the structure, arguments, and topic the same, but {instruction}. "
        f"Context:\n{attribute_context[attribute]}\n\n"
        "Do not add new ideas or remove key points. Keep the length and organization similar.\n"
        "Keep the same level of language proficiency from the original essay, including misspellings, punctuation and capitalization.\n"
        f"This essay has a level of {og_score} according to the rubric description. Your rewriting must meet the exact criteria as the rubric details for a score of {og_score}. The marking rubric is defined as follows:\n{RUBRIC}\n\n"
        "Respond with the rewritten essay only. Write your answer in Spanish\n\n"
        f"Original essay:\n{text}\n"
        f"Rewritten essay:"
    )

# generates a counterfactual given text, attribute, original label and the essay score
def generate_counterfactual(text, attribute, original_label, og_score):
    prompt = get_counterfactual_prompt(text, attribute, original_label, og_score)
    response = query_ollama(prompt, model="gemma3:12b", temperature=0.7)
    return response

In [34]:
import random

# generates x amount of counterfactuals and validates them using the given attribute classifier.
# essays are discarded if they do not align with correct classification.
# uses stratified sampling by holistic_essay_score attribute
def collect_validated_flips(
    df,
    attribute: str,                   # 'stance', 'formality', or 'sentiment'
    from_class: int,                  # e.g., 0 (for PRO)
    classifier_fn=None,               # e.g., stance_classifier
    label_mapping=None,               # e.g., {'PRO': 0, 'CON': 1, 'NEUTRAL': 2}
    prefix="ST",                      # e.g., 'ST'
    target_count=100
):

    # reverse the label mapping
    reverse_label_mapping = {v: k for k, v in label_mapping.items()}

    from_label = reverse_label_mapping[from_class]
    to_label = {
        'stance': stance_opposite,
        'formality': formality_opposite,
        'sentiment': sentiment_opposite
    }[attribute][from_label]
    to_class = label_mapping[to_label]

    successful_flips = []

    # stratified ssampling by score
    score_bins = {score: group.index.tolist() for score, group in df.groupby('score')}
    bin_targets = {score: max(1, int(len(indices) / len(df) * target_count)) for score, indices in score_bins.items()}

    total_flips = 0 # count total flips
    total_attempts = 1
    for score, indices in score_bins.items():
        if total_flips >= target_count:
            print(f"Done. Total flips is {total_flips} out of {target_count}")
            break

        random.shuffle(indices)
        sample_count = 0

        for idx in indices:
            if sample_count >= bin_targets[score]:
                break

            row = df.loc[idx]
            if row[attribute] != from_class:
                continue

            original_text = row['full_text']
            essay_code_original = f"{idx}-O"
            og_score = row['score']

            # generate counterfactual
            print(f"----- Attempt: {total_attempts} -----")
            try:
                print(f"Generating counterfactual... Attribute: {attribute}. {from_label} -> {to_label}")
                flipped_text = generate_counterfactual(original_text, attribute, from_label, og_score)
                print("Counterfactual generated.")
                #print(f"Original text:\n\n{original_text}\n\nFlipped text:\n\n{flipped_text}")
                new_label = classifier_fn(flipped_text)
                print(f"New label: {reverse_label_mapping[new_label]}")
            except Exception as e:
                print(f"[!] Error at idx {idx}: {e}")
                continue

            print(f"Flip Attempt: {from_label} → {to_label} | Classified as: {new_label}")

            if new_label == to_class:
                print("✓ - Correctly generated!")
                essay_code_cf = f"{idx}-C{prefix}"

                successful_flips.append({
                    "essay_code": essay_code_original,
                    "full_text": original_text,
                    "attribute": attribute,
                    "label": from_class,
                    "score_og": row['score']
                })

                successful_flips.append({
                    "essay_code": essay_code_cf,
                    "full_text": flipped_text,
                    "attribute": attribute,
                    "label": to_class,
                    "score_og": row['score']
                })

                sample_count += 1
                total_flips += 1
            else:
                print("X - Incorrectly generated!")
            total_attempts += 1

            if len(successful_flips) >= target_count * 2:
                break

    print("---------------------------------------")
    print(f"Generated {total_flips} counterfactuals with {total_attempts - 1} attempts.")

    # second pass to fill counterfactuals if target not reached
    if total_flips < target_count:
        print(f"\nSecond pass: filling {target_count - total_flips} missing flips...")

        remaining_indices = df[df[attribute] == from_class].index.tolist()
        random.shuffle(remaining_indices)

        for idx in remaining_indices:
            if total_flips >= target_count:
                break

            row = df.loc[idx]
            original_text = row['full_text']
            essay_code_original = f"{idx}-O"
            og_score = row['score']

            print(f"----- Attempt: {total_attempts} -----")
            try:
                print(f"Generating counterfactual... Attribute: {attribute}. {from_label} -> {to_label}")
                flipped_text = generate_counterfactual(original_text, attribute, from_label, og_score)
                print("Counterfactual generated.")
                new_label = classifier_fn(flipped_text)
                print(f"New label: {reverse_label_mapping[new_label]}")
            except Exception as e:
                print(f"[!] Error at idx {idx}: {e}")
                continue

            print(f"Flip Attempt: {from_label} → {to_label} | Classified as: {new_label}")

            if new_label == to_class:
                print("✓ - Correctly generated!")
                essay_code_cf = f"{idx}-C{prefix}"

                successful_flips.append({
                    "essay_code": essay_code_original,
                    "full_text": original_text,
                    "attribute": attribute,
                    "label": from_class,
                    "score_og": row['score']
                })

                successful_flips.append({
                    "essay_code": essay_code_cf,
                    "full_text": flipped_text,
                    "attribute": attribute,
                    "label": to_class,
                    "score_og": row['score']
                })

                total_flips += 1
            else:
                print("X - Incorrectly generated!")
            total_attempts += 1

    print(f"Generated {total_flips} counterfactuals")

    return pd.DataFrame(successful_flips)

In [37]:
# generate PRO -> CON flipped stance counterfactuals

stance_pro_to_con_df = collect_validated_flips(
    df=df,
    attribute="stance",
    from_class=0,  # PRO
    classifier_fn=stance_classifier,
    label_mapping={"PRO": 0, "CON": 1, "NEUTRAL": 2},
    prefix="ST",
    target_count=100
)

stance_pro_to_con_df

----- Attempt: 1 -----
Generating counterfactual... Attribute: stance. PRO -> CON
Counterfactual generated.
New label: NEUTRAL
Flip Attempt: PRO → CON | Classified as: 2
X - Incorrectly generated!
----- Attempt: 2 -----
Generating counterfactual... Attribute: stance. PRO -> CON
Counterfactual generated.
New label: CON
Flip Attempt: PRO → CON | Classified as: 1
✓ - Correctly generated!
----- Attempt: 3 -----
Generating counterfactual... Attribute: stance. PRO -> CON
Counterfactual generated.
New label: NEUTRAL
Flip Attempt: PRO → CON | Classified as: 2
X - Incorrectly generated!
----- Attempt: 4 -----
Generating counterfactual... Attribute: stance. PRO -> CON
Counterfactual generated.
New label: CON
Flip Attempt: PRO → CON | Classified as: 1
✓ - Correctly generated!
----- Attempt: 5 -----
Generating counterfactual... Attribute: stance. PRO -> CON
Counterfactual generated.
New label: CON
Flip Attempt: PRO → CON | Classified as: 1
✓ - Correctly generated!
----- Attempt: 6 -----
Generating

Unnamed: 0,essay_code,full_text,attribute,label,score_og
0,1364-O,"querido compañero : hola , me llamo elene , so...",stance,0,A1
1,1364-CST,"Querido compañero: Hola, me llamo Elene, soy u...",stance,1,A1
2,6849-O,voy a explicar+un_poco explicar+un_poco mas so...,stance,0,A1
3,6849-CST,"voy a explicar un poco, explicar un poco mas s...",stance,1,A1
4,11790-O,es es una persona estupendo ! después quiero i...,stance,0,A1
...,...,...,...,...,...
195,3559-CST,quiero presentar os mi hermana mayor. se llama...,stance,1,A2
196,11228-O,sr. coordenador de el postgrado de lengua y li...,stance,0,B2
197,11228-CST,Sr. Coordinador de el postgrado de lengua y li...,stance,1,B2
198,13708-O,"muy señores : me llamo chantal_tabanji , soy l...",stance,0,B2


In [38]:
# generate CON -> PRO flipped stance counterfactuals

stance_con_to_pro_df = collect_validated_flips(
    df=df,
    attribute="stance",
    from_class=1,  # CON
    classifier_fn=stance_classifier,
    label_mapping={"PRO": 0, "CON": 1, "NEUTRAL": 2},
    prefix="ST",
    target_count=100
)

stance_con_to_pro_df

----- Attempt: 1 -----
Generating counterfactual... Attribute: stance. CON -> PRO
Counterfactual generated.
New label: PRO
Flip Attempt: CON → PRO | Classified as: 0
✓ - Correctly generated!
----- Attempt: 2 -----
Generating counterfactual... Attribute: stance. CON -> PRO
Counterfactual generated.
New label: NEUTRAL
Flip Attempt: CON → PRO | Classified as: 2
X - Incorrectly generated!
----- Attempt: 3 -----
Generating counterfactual... Attribute: stance. CON -> PRO
Counterfactual generated.
New label: PRO
Flip Attempt: CON → PRO | Classified as: 0
✓ - Correctly generated!
----- Attempt: 4 -----
Generating counterfactual... Attribute: stance. CON -> PRO
Counterfactual generated.
New label: PRO
Flip Attempt: CON → PRO | Classified as: 0
✓ - Correctly generated!
----- Attempt: 5 -----
Generating counterfactual... Attribute: stance. CON -> PRO
Counterfactual generated.
New label: PRO
Flip Attempt: CON → PRO | Classified as: 0
✓ - Correctly generated!
----- Attempt: 6 -----
Generating count

Unnamed: 0,essay_code,full_text,attribute,label,score_og
0,20565-O,! ¿ qué tal ? voy te presentar mi familia . es...,stance,1,A1
1,20565-CST,! ¿ qué tal ? voy te presentar mi familia . es...,stance,0,A1
2,21010-O,en carnaval de ese año ( 2011 ) bruno mi amigo...,stance,1,A1
3,21010-CST,en carnaval de ese año ( 2011 ) bruno mi amigo...,stance,0,A1
4,5103-O,"janete , mi hermana más vieja es casada con in...",stance,1,A1
...,...,...,...,...,...
195,22257-CST,hola chicas! durante las ultimas vaciones fui ...,stance,0,A2
196,6476-O,fumar en lugares públicos creo en la libertad ...,stance,1,B2
197,6476-CST,Fumar en lugares públicos creo en la libertad ...,stance,0,B2
198,12801-O,18_de_noviembre_de_2011 desde la cuidad de san...,stance,1,C1


In [39]:
# generate POSITIVE -> NEGATIVE flipped sentiment counterfactuals

sentiment_positive_to_negative_df = collect_validated_flips(
    df=df,
    attribute="sentiment",
    from_class=0,  # POSITIVE
    classifier_fn=sentiment_classifier,
    label_mapping={"POSITIVE": 0, "NEGATIVE": 1, "NEUTRAL": 2},
    prefix="SE",
    target_count=100
)

sentiment_positive_to_negative_df

----- Attempt: 1 -----
Generating counterfactual... Attribute: sentiment. POSITIVE -> NEGATIVE
Counterfactual generated.
New label: NEUTRAL
Flip Attempt: POSITIVE → NEGATIVE | Classified as: 2
X - Incorrectly generated!
----- Attempt: 2 -----
Generating counterfactual... Attribute: sentiment. POSITIVE -> NEGATIVE
Counterfactual generated.
New label: NEGATIVE
Flip Attempt: POSITIVE → NEGATIVE | Classified as: 1
✓ - Correctly generated!
----- Attempt: 3 -----
Generating counterfactual... Attribute: sentiment. POSITIVE -> NEGATIVE
Counterfactual generated.
New label: NEGATIVE
Flip Attempt: POSITIVE → NEGATIVE | Classified as: 1
✓ - Correctly generated!
----- Attempt: 4 -----
Generating counterfactual... Attribute: sentiment. POSITIVE -> NEGATIVE
Counterfactual generated.
New label: NEGATIVE
Flip Attempt: POSITIVE → NEGATIVE | Classified as: 1
✓ - Correctly generated!
----- Attempt: 5 -----
Generating counterfactual... Attribute: sentiment. POSITIVE -> NEGATIVE
Counterfactual generated.
Ne

Unnamed: 0,essay_code,full_text,attribute,label,score_og
0,21844-O,compañeros de trabajo soy juan el nuevo direct...,sentiment,0,A1
1,21844-CSE,compañeros de trabajo soy juan el nuevo direct...,sentiment,1,A1
2,18327-O,¡ hola laura ! como estas ? soy haifa tu compa...,sentiment,0,A1
3,18327-CSE,¡ hola laura ! como estas ? soy haifa tu compa...,sentiment,1,A1
4,10233-O,! estoy muy contente con eso ! para que me con...,sentiment,0,A1
...,...,...,...,...,...
195,16550-CSE,salvador_dali nacio el siglo pasado en figuere...,sentiment,1,A2
196,18238-O,buenas tardes . me llamo sérgio_francisco y le...,sentiment,0,A2
197,18238-CSE,buenas tardes . me llamo sérgio_francisco y le...,sentiment,1,A2
198,16203-O,la última vez que vi una película fue hace una...,sentiment,0,C1


In [40]:
# generate NEGATIVE -> POSITIVE flipped sentiment counterfactuals

sentiment_negative_to_positive_df = collect_validated_flips(
    df=df,
    attribute="sentiment",
    from_class=1,  # NEGATIVE
    classifier_fn=sentiment_classifier,
    label_mapping={"POSITIVE": 0, "NEGATIVE": 1, "NEUTRAL": 2},
    prefix="SE",
    target_count=100
)

sentiment_negative_to_positive_df

----- Attempt: 1 -----
Generating counterfactual... Attribute: sentiment. NEGATIVE -> POSITIVE
Counterfactual generated.
New label: POSITIVE
Flip Attempt: NEGATIVE → POSITIVE | Classified as: 0
✓ - Correctly generated!
----- Attempt: 2 -----
Generating counterfactual... Attribute: sentiment. NEGATIVE -> POSITIVE
Counterfactual generated.
New label: POSITIVE
Flip Attempt: NEGATIVE → POSITIVE | Classified as: 0
✓ - Correctly generated!
----- Attempt: 3 -----
Generating counterfactual... Attribute: sentiment. NEGATIVE -> POSITIVE
Counterfactual generated.
New label: POSITIVE
Flip Attempt: NEGATIVE → POSITIVE | Classified as: 0
✓ - Correctly generated!
----- Attempt: 4 -----
Generating counterfactual... Attribute: sentiment. NEGATIVE -> POSITIVE
Counterfactual generated.
New label: POSITIVE
Flip Attempt: NEGATIVE → POSITIVE | Classified as: 0
✓ - Correctly generated!
----- Attempt: 5 -----
Generating counterfactual... Attribute: sentiment. NEGATIVE -> POSITIVE
Counterfactual generated.
New

Unnamed: 0,essay_code,full_text,attribute,label,score_og
0,26473-O,"hola , mi padres están muertos . bruno vive en...",sentiment,1,A1
1,26473-CSE,"Hola, mi padres están vivos y muy bien. Bruno ...",sentiment,0,A1
2,6-O,acudí a su compañía la dirección de la cual ví...,sentiment,1,A1
3,6-CSE,acudí a su compañía la cual ví en una página w...,sentiment,0,A1
4,22659-O,"caro , "" randômico "" ! , bueno ... mi padre , ...",sentiment,1,A1
...,...,...,...,...,...
195,17025-CSE,"yo adoro mi amiga, yuki. es de la prefectura d...",sentiment,0,A2
196,11419-O,"en los ultimos años , muchos paises han proheb...",sentiment,1,B2
197,11419-CSE,"en los últimos años, muchos países han promovi...",sentiment,0,B2
198,1611-O,"en_meados_de junio_de_2010 , unos amigos mío y...",sentiment,1,B1


In [41]:
# generate FORMAL -> INFORMAL flipped formality counterfactuals

formality_formal_to_informal_df = collect_validated_flips(
    df=df,
    attribute="formality",
    from_class=0,  # FORMAL
    classifier_fn=formality_classifier,
    label_mapping={"FORMAL": 0, "INFORMAL": 1, "NEUTRAL": 2},
    prefix="FO",
    target_count=100
)

formality_formal_to_informal_df

----- Attempt: 1 -----
Generating counterfactual... Attribute: formality. FORMAL -> INFORMAL
Counterfactual generated.
New label: INFORMAL
Flip Attempt: FORMAL → INFORMAL | Classified as: 1
✓ - Correctly generated!
----- Attempt: 2 -----
Generating counterfactual... Attribute: formality. FORMAL -> INFORMAL
Counterfactual generated.
New label: INFORMAL
Flip Attempt: FORMAL → INFORMAL | Classified as: 1
✓ - Correctly generated!
----- Attempt: 3 -----
Generating counterfactual... Attribute: formality. FORMAL -> INFORMAL
Counterfactual generated.
New label: INFORMAL
Flip Attempt: FORMAL → INFORMAL | Classified as: 1
✓ - Correctly generated!
----- Attempt: 4 -----
Generating counterfactual... Attribute: formality. FORMAL -> INFORMAL
Counterfactual generated.
[!] Error at idx 14619: None
----- Attempt: 4 -----
Generating counterfactual... Attribute: formality. FORMAL -> INFORMAL
Counterfactual generated.
New label: NEUTRAL
Flip Attempt: FORMAL → INFORMAL | Classified as: 2
X - Incorrectly ge

Unnamed: 0,essay_code,full_text,attribute,label,score_og
0,2697-O,"hola , buenas tardes . mi nombre es daniela y ...",formality,0,A1
1,2697-CFO,"hola, buenas tardes. soy daniela y te escribo ...",formality,1,A1
2,9067-O,"estimada amiga , es con mucho gusto que te esc...",formality,0,A1
3,9067-CFO,"Querida amiga,\n\n¡Qué guay escribirte! Quiero...",formality,1,A1
4,1281-O,"¡ hola mi amigo ! que tal , ¿ bien ? pues , te...",formality,0,A1
...,...,...,...,...,...
195,10745-CFO,"Hola,\n\nSoy Luiz Ricardo, cliente de su compa...",formality,1,C1
196,6695-O,"estimado sr. , sra , contacto ustedes hoy_en_d...",formality,0,B1
197,6695-CFO,"querido señor/señora,\n\nles escribo hoy para ...",formality,1,B1
198,21494-O,estimado xxx : le escribo este correo después ...,formality,0,C1


In [43]:
# generate INFORMAL -> FORMAL flipped formality counterfactuals

formality_informal_to_formal_df = collect_validated_flips(
    df=df,
    attribute="formality",
    from_class=1,  # INFORMAL
    classifier_fn=formality_classifier,
    label_mapping={"FORMAL": 0, "INFORMAL": 1, "NEUTRAL": 2},
    prefix="FO",
    target_count=100
)

formality_informal_to_formal_df

----- Attempt: 1 -----
Generating counterfactual... Attribute: formality. INFORMAL -> FORMAL
Counterfactual generated.
New label: FORMAL
Flip Attempt: INFORMAL → FORMAL | Classified as: 0
✓ - Correctly generated!
----- Attempt: 2 -----
Generating counterfactual... Attribute: formality. INFORMAL -> FORMAL
Counterfactual generated.
[!] Error at idx 10205: None
----- Attempt: 2 -----
Generating counterfactual... Attribute: formality. INFORMAL -> FORMAL
Counterfactual generated.
New label: NEUTRAL
Flip Attempt: INFORMAL → FORMAL | Classified as: 2
X - Incorrectly generated!
----- Attempt: 3 -----
Generating counterfactual... Attribute: formality. INFORMAL -> FORMAL
Counterfactual generated.
New label: NEUTRAL
Flip Attempt: INFORMAL → FORMAL | Classified as: 2
X - Incorrectly generated!
----- Attempt: 4 -----
Generating counterfactual... Attribute: formality. INFORMAL -> FORMAL
Counterfactual generated.
New label: FORMAL
Flip Attempt: INFORMAL → FORMAL | Classified as: 0
✓ - Correctly gener

Unnamed: 0,essay_code,full_text,attribute,label,score_og
0,14460-O,"hola , me nombre es mirna yo soy casada y tego...",formality,1,A1
1,14460-CFO,"Hola, mi nombre es Mirna. Yo soy casada y teng...",formality,0,A1
2,25416-O,hola ! como estas ! yo entiendo de tu lettre q...,formality,1,A1
3,25416-CFO,Hola. ¿Cómo está usted? Entiendo de su carta q...,formality,0,A1
4,15452-O,"hola pedro , mi familia vive un_poco lejo ahor...",formality,1,A1
...,...,...,...,...,...
195,9216-CFO,"Estimada profesora Marcela,\n\nLe escribo con ...",formality,0,A2
196,10420-O,me gustaría que me diese las informaciones nec...,formality,1,B2
197,10420-CFO,Agradecería que me proporcionara la informació...,formality,0,B2
198,21969-O,lo que escribo ahora es acerca_de la persona d...,formality,1,A2


In [44]:
import os

# save counterfactuals to its folder
os.makedirs("counterfactuals/caes", exist_ok=True)

stance_pro_to_con_df.to_csv("counterfactuals/caes/stance_pro_to_con.csv", index=False)
stance_con_to_pro_df.to_csv("counterfactuals/caes/stance_con_to_pro.csv", index=False)

sentiment_positive_to_negative_df.to_csv("counterfactuals/caes/sentiment_positive_to_negative.csv", index=False)
sentiment_negative_to_positive_df.to_csv("counterfactuals/caes/sentiment_negative_to_positive.csv", index=False)

formality_formal_to_informal_df.to_csv("counterfactuals/caes/formality_formal_to_informal.csv", index=False)
formality_informal_to_formal_df.to_csv("counterfactuals/caes/formality_informal_to_formal.csv", index=False)