# Now that [`generate_dpo_dataset.py`](./generate_dpo_dataset.py) has been run, we create a train-ready dataset.

## Libraries

In [5]:
import json
import numpy as np
import re
from pathlib import Path
import pandas as pd
import re
import csv

## Global Variables

In [6]:
# Base directory, two levels above the working directory the notebook runs in
# (presumably the project root — confirm).
ROOT_DIR = Path("../..")
# JSONL file read by this notebook: one record per line with "prompt", "reference"
# and "candidates" keys (presumably produced by generate_dpo_dataset.py — confirm).
INPUT_FILE = ROOT_DIR / "data/TrainingDataset/data/dpo_candidates_cot6_soap_sft_overspl.jsonl"
# JSONL file written by this notebook: one {"prompt", "chosen", "rejected"} record per line.
OUTPUT_FILE = ROOT_DIR / "data/TrainingDataset/data/dpo_train_cot6_soap_sft_overspl.jsonl"

## Utils

In [43]:
def split_cot(text, tag="</think>"):
    """Separate a generation into its reasoning part and its final conclusion.

    The text is cut at the LAST occurrence of ``tag``.

    Args:
        text: Full generated string.
        tag: Marker that closes the reasoning part.

    Returns:
        A dict with two keys:
        - 'cot': everything up to and including the last ``tag``; the whole
          ``text`` unchanged when ``tag`` does not occur.
        - 'conclusion': whatever follows the last ``tag``, stripped of
          surrounding whitespace; the stripped ``text`` when ``tag`` does not occur.
    """
    head, found, tail = text.rpartition(tag)
    if found:
        reasoning = head + tag
    else:
        # No tag: rpartition leaves the full text in `tail`, so the conclusion
        # below is the stripped text and the reasoning is the raw text.
        reasoning = text
    return {'cot': reasoning, 'conclusion': tail.strip()}

In [44]:
def reject_and_accept(candidates, reference, length_tolerance=2):
    """Sort candidates into accepted / rejected / hard-rejected against a reference.

    Args:
        candidates: Iterable of dicts with 'cot' and 'conclusion' keys.
        reference: Dict with the same two keys, used as the gold answer.
        length_tolerance: A candidate whose conclusion matches is accepted only
            when its CoT word count divided by the reference CoT word count lies
            within [1/length_tolerance, length_tolerance].

    Returns:
        A tuple ``(accepted, rejected, hard_rejected)`` of lists, each keeping
        the input order:
        - hard_rejected: conclusion differs from the reference conclusion;
        - rejected: conclusion matches but the CoT length ratio is out of range
          (a reference with an empty CoT gives ratio 0, hence rejection);
        - accepted: conclusion matches and the ratio is within range.
    """
    accepted, rejected, hard_rejected = [], [], []

    for candidate in candidates:
        if candidate['conclusion'] != reference['conclusion']:
            # Different final answer: no need to look at the reasoning at all.
            bucket = hard_rejected
        else:
            # Lengths are measured in whitespace-separated words.
            n_ref_words = len(reference['cot'].split())
            n_cand_words = len(candidate['cot'].split())
            ratio = n_cand_words / n_ref_words if n_ref_words > 0 else 0
            within_range = 1/length_tolerance <= ratio <= length_tolerance
            bucket = accepted if within_range else rejected
        bucket.append(candidate)

    return accepted, rejected, hard_rejected

In [45]:
def remove_first_thought(cot_text):
    """Strip one leading "<think>" + newline marker from ``cot_text``.

    Only a marker at the very start of the string is removed, and only once;
    any other input is returned unchanged.
    """
    opening = "<think>\n"
    return cot_text[len(opening):] if cot_text.startswith(opening) else cot_text

In [46]:
def remove_after_end_token(cot_text):
    """Truncate ``cot_text`` right after its first "<|im_end|>" token.

    When the token is present, the text before its first occurrence is stripped
    of surrounding whitespace and the token is re-appended; everything after it
    is discarded. When the token is absent, the input is returned untouched
    (no stripping).
    """
    end_token = "<|im_end|>"
    before, found, _ = cot_text.partition(end_token)
    if not found:
        return cot_text
    return before.strip() + end_token

## Load data

In [47]:
# NOTE(review): `processed_data` is not referenced by any other cell visible in this notebook.
processed_data = []

# Read every raw JSONL line into memory; decoding happens in a later cell.
with INPUT_FILE.open("r", encoding="utf-8") as input_handle:
    lines = list(input_handle)

In [48]:
# Sanity check: how many raw lines were read from INPUT_FILE.
print(f"Total lines in input file: {len(lines)}")

Total lines in input file: 4296


In [49]:
# Seed for the final shuffle, so that re-running the notebook writes the pairs in
# the same order (42 matches the random_state used by the pandas sampling cells).
SHUFFLE_SEED = 42

dpo_final_dataset = []

for line in lines:
    if not line.strip():
        # Tolerate blank lines (e.g. a trailing newline) instead of failing in json.loads.
        continue
    data = json.loads(line)
    prompt = data["prompt"]

    # Normalise each generation: drop a leading "<think>\n", cut everything after
    # "<|im_end|>", then split into reasoning ('cot') and final answer ('conclusion').
    reference_raw = remove_after_end_token(remove_first_thought(data["reference"]))
    reference = split_cot(reference_raw)

    candidates = [split_cot(remove_after_end_token(remove_first_thought(cand))) for cand in data["candidates"]]

    accepted, rejected, hard_rejected = reject_and_accept(candidates, reference, length_tolerance=2)

    # Cross product: every acceptable answer (the reference included) is paired
    # with every rejected answer for the same prompt.
    for chosen_option in accepted + [reference]:
        for rejected_option in hard_rejected + rejected:
            if chosen_option != rejected_option:
                dpo_final_dataset.append({
                    # The prompt ends with the opening tag that remove_first_thought stripped.
                    "prompt": prompt + "\n<think>\n",
                    "chosen": chosen_option['cot'] + "\n\n" + chosen_option['conclusion'],
                    "rejected": rejected_option['cot'] + "\n\n" + rejected_option['conclusion']
                })

# Shuffle in place with a seeded generator so the saved order is reproducible.
np.random.default_rng(SHUFFLE_SEED).shuffle(dpo_final_dataset)

In [55]:
# Save the final dataset as JSONL (one {"prompt", "chosen", "rejected"} object per line).
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    for item in dpo_final_dataset:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

# Also export the input and output files as CSV for easier inspection.
# The CSV paths are derived from INPUT_FILE / OUTPUT_FILE (same name, ".csv" suffix)
# so they cannot drift from the JSONL paths if those are renamed.
CSV_EXPORT_OPTIONS = dict(sep=';', index=False, quoting=csv.QUOTE_ALL, encoding='utf-8')

with open(INPUT_FILE, "r", encoding="utf-8") as f:
    # Skip blank lines so a trailing newline does not break json.loads.
    input_data = [json.loads(line) for line in f if line.strip()]
input_df = pd.DataFrame(input_data)
input_df.to_csv(INPUT_FILE.with_suffix(".csv"), **CSV_EXPORT_OPTIONS)

output_df = pd.DataFrame(dpo_final_dataset)
output_df.to_csv(OUTPUT_FILE.with_suffix(".csv"), **CSV_EXPORT_OPTIONS)

In [51]:
# Preview only the first few pairs; displaying the whole list floods the notebook output.
dpo_final_dataset[:3]

[{'prompt': 'Message:\nAlexis Vastine ce poissard n’empêche\n:ouch:\n\nAnalyse:\n\n<think>\n',
  'chosen': 'Explication :\n- **Sujet du message** : Critique de Alexis Vastine.\n- **Sens probable** : L\'auteur exprime un mécontentement ou une désapprobation à l\'égard de Alexis Vastine, en utilisant le terme "poissard", qui peut indiquer quelqu\'un ayant des problèmes d\'ordre moral ou éthique. Le symbole ":ouch:" suggère une réaction choquée ou surprise.\n</think>\n\n<think>\nTons :\nTons perçus : \n\n1. **Critique (certitude élevée)** - Le terme « poissard » et la désapprobation manifeste suggèrent une opinion négative sur Alexis Vastine.\n2. **Choqué (certitude moyenne)** - L’utilisation de « :ouch: » indique une réaction émotionnelle forte face à la situation.\n3. **Informel (certitude élevée)** - La formulation décontractée laisse transparaître une proximité avec le public.\n\nDoutes : Le ton peut varier selon les interprétations personnelles ; parfois, il pourrait aussi être vu co

In [52]:
# Number of generated pairs. The count comes from the in-memory list, not from re-reading OUTPUT_FILE.
print(f"Total lines in output file: {len(dpo_final_dataset)}")

Total lines in output file: 3444


In [20]:
from datasets import load_dataset
import pandas as pd
import numpy as np

# Load the "dpo" configuration of the "Naela00/ToxiFrench" dataset
# (presumably fetched from the Hugging Face Hub — confirm) ...
dpo_dataset = load_dataset("Naela00/ToxiFrench", "dpo")
# ... and turn its 'train' split into a DataFrame for the pandas-based steps below.
df = pd.DataFrame(dpo_dataset['train'])

In [21]:
def extract_labels(example):
    """Return the 3 characters located just before the last 10 characters of ``example``.

    Presumably ``example`` ends with a 3-letter label ("oui"/"non") followed by
    the 10-character "<|im_end|>" token — confirm against the dataset. Strings
    that are too short yield a shorter (possibly empty) result.
    """
    tail_len = 10   # characters skipped at the very end
    label_len = 3   # characters returned
    start = -(tail_len + label_len)
    return example[start:-tail_len]

In [26]:
# Tag every row with the label read from the tail of its chosen completion.
df['labels'] = df['chosen'].apply(extract_labels)

# Undersample: keep as many rows per label as the rarest label has,
# then shuffle the result. Both steps use a fixed seed.
min_count = df['labels'].value_counts().min()
df_balanced = (
    pd.concat([
        label_rows.sample(min_count, random_state=42)
        for _, label_rows in df.groupby('labels')
    ])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [28]:
# Check the undersampling: every label should now have the same number of rows.
df_balanced['labels'].value_counts()

labels
oui    1082
non    1082
Name: count, dtype: int64

In [35]:
# Save the balanced dataset to CSV, keeping only the three DPO columns
# (the helper 'labels' column is left out); ';'-separated with every field quoted.
df_balanced[["prompt","chosen","rejected"]].to_csv(ROOT_DIR / "data/TrainingDataset/data/dpo_train_cot6_soap_sft_overspl_balanced.csv", sep=';',index=False,quoting=csv.QUOTE_ALL, encoding='utf-8')

## Augmentation of label prediction

In [1]:
from datasets import load_dataset
import pandas as pd
import numpy as np

# Same load as in the balancing section above. Rebinding `df` here gives a fresh
# frame without the 'labels' column that the balancing cell added.
dpo_dataset = load_dataset("Naela00/ToxiFrench", "dpo")
df = pd.DataFrame(dpo_dataset['train'])

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
def extract_labels(example):
    """Return the 3 characters located just before the last 10 characters of ``example``.

    Presumably ``example`` ends with a 3-letter label ("oui"/"non") followed by
    the 10-character "<|im_end|>" token — confirm against the dataset. Strings
    that are too short yield a shorter (possibly empty) result.
    """
    tail_len = 10   # characters skipped at the very end
    label_len = 3   # characters returned
    start = -(tail_len + label_len)
    return example[start:-tail_len]

def oposite_label(label):
    """Return the other binary label: "oui" -> "non" and "non" -> "oui".

    Args:
        label: Either "oui" or "non".

    Returns:
        The opposite label string.

    Raises:
        ValueError: If ``label`` is anything else. Failing here gives a clear
            message instead of returning None and breaking later during
            string concatenation.
    """
    if label == "oui":
        return "non"
    if label == "non":
        return "oui"
    raise ValueError(f"Expected label 'oui' or 'non', got {label!r}")

def opposite_conclusion(example):
    """Return ``example`` with its trailing label swapped for the opposite one.

    The label is the 3 characters before the last 10 characters (see
    ``extract_labels``); everything before the label and the last 10 characters
    are kept as they are.
    """
    flipped = oposite_label(extract_labels(example))
    prefix, suffix = example[:-13], example[-10:]
    return prefix + flipped + suffix

In [13]:
# For every row, build a synthetic pair: the original chosen answer against a
# copy of it whose final label has been flipped.
augmented_examples = [
    {"prompt": prompt, "chosen": chosen, "rejected": opposite_conclusion(chosen)}
    for prompt, chosen in zip(df['prompt'], df['chosen'])
]
df_augmented = pd.DataFrame(augmented_examples)

# Mix original and synthetic pairs, shuffle with a fixed seed, then export as CSV.
df_final = (
    pd.concat([df, df_augmented])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
df_final.to_csv(
    ROOT_DIR / "data/TrainingDataset/data/dpo_train_cot6_soap_sft_overspl_augmented.csv",
    sep=';',
    index=False,
    quoting=csv.QUOTE_ALL,
    encoding='utf-8',
)