## Requirements
- protobuf sentencepiece bitsandbytes

In [None]:
import pandas as pd
import json
def load_data(file_path):
    """Read a UTF-8 encoded JSON file and return its content as one DataFrame.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        pandas.DataFrame built directly from the parsed JSON object.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        parsed = json.loads(handle.read())
    frame = pd.DataFrame(parsed)
    return frame

In [None]:
def refactor(df):
    """Flatten a dialogue-level frame into one row per utterance.

    Args:
        df: DataFrame with the columns 'id', 'topic' and 'utterances'. Each
            'utterances' cell is a list of utterance dictionaries.

    Returns:
        A new DataFrame with one row per utterance dictionary, extended with
        'dialogue_id' (the row's 'id') and 'topic' for reference.
    """
    all_utterances = []

    for _, row in df.iterrows():
        for utterance in row['utterances']:
            # Build a new dict instead of writing into `utterance`: the dicts are
            # shared with the input frame, so in-place assignment would silently
            # modify the caller's data.
            all_utterances.append(
                {**utterance, 'dialogue_id': row['id'], 'topic': row['topic']}
            )

    # Convert to DataFrame for easier analysis
    return pd.DataFrame(all_utterances)

In [None]:
# Load the raw dialogues file into a single DataFrame (the path is relative to the notebook's working directory).
df = load_data('../../../dailydialog/dialogues.json')

In [None]:
# Flatten the dialogue-level frame to one row per utterance.
refactor_df = refactor(df)
# Bare name on the last line: rendered as the cell output.
refactor_df

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import pandas as pd

In [None]:
# 1) Choose the Hugging Face checkpoint to load.
# NOTE(review): the original comment described this as a pre-quantized 4-bit (gguf/q4_0)
# checkpoint, but 4-bit quantization is configured separately through BitsAndBytesConfig
# later in this notebook — confirm which was intended.
model_name = "mistralai/Mistral-7B-Instruct-v0.3"

In [None]:
# Read the Hugging Face access token from huggingface_token.txt,
# stripping surrounding whitespace (e.g. a trailing newline).
with open('huggingface_token.txt', 'r') as file:
    token = file.read().strip()

In [None]:
# Expose the token to the Hugging Face libraries running in this kernel.
# The previous `!export HUGGINGFACE_HUB_TOKEN=token` ran in a throw-away subshell and
# assigned the literal text "token", so the kernel process never saw the real value.
import os

os.environ["HF_TOKEN"] = token
# Also set the name the original cell used. NOTE(review): confirm whether anything reads it.
os.environ["HUGGINGFACE_HUB_TOKEN"] = token

In [None]:
# 2) Load the tokenizer for the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

In [None]:
# 3) 4-bit quantization settings: NF4 quantization type, float16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

In [None]:
# Per-device memory budget, passed to from_pretrained below as max_memory.
max_mem = {
    0: 5_300 * 1024**2,      # GPU 0: 5,300 MiB
    "cpu": 60 * 1024**3,     # CPU: 60 GiB, for everything that does not fit on the GPU
}

# 4) Load the model with the 4-bit config (per the original note, device_map="auto"
#    places layers on the GPU until it is full).
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
    max_memory=max_mem,
    trust_remote_code=True
)
# Switch the model to evaluation mode.
model.eval()

In [None]:
# 5) Prompt template sent to the model. [CAP] and [DEFINITION] are placeholders that
#    are replaced with the hat name and its definition when the per-hat prompts are built.
prompt_prefix = """
You are helping create a synthetic dataset for a classification task.
The goal is to generate single-sentence utterances that reflect one specific thinking style based on Edward De Bono's Six Thinking Hats. Here is the definition for the [CAP] hat:
[DEFINITION]
Please generate 10 distinct, realistic, mixed length utterances that clearly follow this definition. Each utterance must reflect the corresponding thinking style.
Only output a JSON list of objects, each in the format:
{"utterance": "your sentence here", "hat": "[CAP]"}
Respond only with the json text. No explanations, no notes, no markdown. Only valid JSON.
Now generate 10 utterances for the [CAP] hat.
Your answer must start with the json list and nothing else.
"""

In [None]:
# Free-text definition of each hat, substituted for [DEFINITION] in prompt_prefix.
# Five hats are defined here: white, black, red, yellow and green.
white_hat_definition = "Exchanging or providing plain informations. Things generally true, things that happened to someone. Not trying to convince anyone."
black_hat_definition = "Analysis of a situation. Answers the why of something that is not being done. Negative analysis, explaining weak points of a thing, logically."
red_hat_definition = "Emotions involved in the answer. The utterance is clearly stated due to emotion involved. Intuitions, feelings, gut reactions. No need for logical justification."
yellow_hat_definition = "Statements that highlight the positive sides (negatives can exists). Changes that offer benefits. Open up remote but highly desirable possibilities. Reflect ideas to believe in. Provide encouragement to take action. Express positive judgments."
green_hat_definition = "Proposing new point of views. New Ideas. Solutions to problems. Imagining new scenarios, going beyond what is known."

In [None]:
# Build one prompt per hat: fill in the hat name first, then its definition.
white_hat_prompt = (
    prompt_prefix
    .replace("[CAP]", "White")
    .replace("[DEFINITION]", white_hat_definition)
)
black_hat_prompt = (
    prompt_prefix
    .replace("[CAP]", "Black")
    .replace("[DEFINITION]", black_hat_definition)
)
red_hat_prompt = (
    prompt_prefix
    .replace("[CAP]", "Red")
    .replace("[DEFINITION]", red_hat_definition)
)
yellow_hat_prompt = (
    prompt_prefix
    .replace("[CAP]", "Yellow")
    .replace("[DEFINITION]", yellow_hat_definition)
)
green_hat_prompt = (
    prompt_prefix
    .replace("[CAP]", "Green")
    .replace("[DEFINITION]", green_hat_definition)
)

In [None]:
import json

def remove_spaces_outside_quotes(text):
    """Drop every space character that is not inside a double-quoted string.

    Backslash escapes inside a quoted string are honoured, so an escaped quote
    (``\\"``) does not end the string and the spaces that follow it are kept.

    Args:
        text: The text to compact (typically JSON produced by the model).

    Returns:
        ``text`` with spaces outside double-quoted strings removed.
    """
    result = []
    inside_string = False
    escaped = False  # True when the previous in-string char was an unconsumed backslash

    for char in text:
        if escaped:
            # This char is the target of a backslash escape: copy it verbatim
            # and do not let it toggle the string state.
            escaped = False
            result.append(char)
        elif inside_string and char == '\\':
            escaped = True
            result.append(char)
        elif char == '"':
            inside_string = not inside_string
            result.append(char)
        elif char == ' ' and not inside_string:
            continue  # skip spaces outside of quoted strings
        else:
            result.append(char)

    return ''.join(result)

def _parse_json_examples(decoded):
    """Parse the outermost ``[{ ... }]`` span of ``decoded`` and keep the dict items that have an 'utterance' key.

    Raises:
        ValueError: if no such span exists or it is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    json_start = decoded.find("[{")
    json_end = decoded.rfind("}]")
    if json_start == -1 or json_end < json_start:
        # Handle the "not found" case explicitly instead of slicing with -1,
        # which could yield an arbitrary fragment of the output.
        raise ValueError("no JSON list of objects found in model output")
    parsed = json.loads(decoded[json_start:json_end + 2])
    return [item for item in parsed if isinstance(item, dict) and "utterance" in item]


def generate_synthetic_examples(prompt, model, tokenizer, max_new_tokens=512):
    """Sample one completion for ``prompt`` and parse it as a JSON list of examples.

    Args:
        prompt: Text prompt passed to the tokenizer as-is.
        model: Object exposing ``device`` and ``generate(...)``.
        tokenizer: Callable tokenizer exposing ``decode(...)`` and ``eos_token_id``.
        max_new_tokens: Maximum number of tokens to generate.

    Returns:
        A list of dicts, each containing at least an 'utterance' key. An empty
        list is returned (after printing the raw output) when the completion
        holds no parsable JSON list.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens, i.e. everything after the prompt.
    decoded = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True).strip()
    # Turn line breaks into spaces and collapse runs of whitespace ...
    decoded = decoded.replace("\n", " ").replace("\r", " ")
    decoded = ' '.join(decoded.split())
    # ... then drop the spaces that are not inside double-quoted strings.
    decoded = remove_spaces_outside_quotes(decoded)

    try:
        return _parse_json_examples(decoded)
    except ValueError as e:
        print("Errore nel parsing:", e)
        print("Output ricevuto:\n", decoded)
        return []

In [None]:
# Run the White-hat prompt 100 times, pooling every parsed example.
white_examples = []
for _ in range(100):
    white_examples += generate_synthetic_examples(white_hat_prompt, model, tokenizer)

white_examples = pd.DataFrame(white_examples)
# Cell output: how many generated utterances repeat an earlier one.
white_examples.duplicated(subset='utterance').sum()

In [None]:
# Run the Green-hat prompt 100 times, pooling every parsed example.
green_examples = []
for _ in range(100):
    green_examples += generate_synthetic_examples(green_hat_prompt, model, tokenizer)

green_examples = pd.DataFrame(green_examples)
# Cell output: how many generated utterances repeat an earlier one.
green_examples.duplicated(subset='utterance').sum()

In [None]:
# Run the Red-hat prompt 100 times, pooling every parsed example.
red_examples = []
for _ in range(100):
    red_examples += generate_synthetic_examples(red_hat_prompt, model, tokenizer)

red_examples = pd.DataFrame(red_examples)
# Cell output: how many generated utterances repeat an earlier one.
red_examples.duplicated(subset='utterance').sum()

In [None]:
# Run the Black-hat prompt 100 times, pooling every parsed example.
black_examples = []
for _ in range(100):
    black_examples += generate_synthetic_examples(black_hat_prompt, model, tokenizer)

black_examples = pd.DataFrame(black_examples)
# Cell output: how many generated utterances repeat an earlier one.
black_examples.duplicated(subset='utterance').sum()

In [None]:
# Run the Yellow-hat prompt 100 times, pooling every parsed example.
yellow_examples = []
for _ in range(100):
    yellow_examples += generate_synthetic_examples(yellow_hat_prompt, model, tokenizer)

yellow_examples = pd.DataFrame(yellow_examples)
# Cell output: how many generated utterances repeat an earlier one.
yellow_examples.duplicated(subset='utterance').sum()

In [None]:
# De-duplicate each hat's frame on the utterance text (the first occurrence is kept).
white_examples, yellow_examples, red_examples, black_examples, green_examples = (
    hat_frame.drop_duplicates(subset='utterance')
    for hat_frame in (
        white_examples,
        yellow_examples,
        red_examples,
        black_examples,
        green_examples,
    )
)

# Stack the hats in the order white, yellow, red, green, black, then drop utterances
# repeated across hats; the copy from the earlier frame in that order wins.
all_examples = pd.concat(
    [white_examples, yellow_examples, red_examples, green_examples, black_examples]
).drop_duplicates(subset='utterance')

In [None]:
# Renumber the rows 0..n-1, discarding the per-hat indices carried over from the concat.
all_examples = all_examples.reset_index(drop=True)

In [None]:
# Write all_examples to synthetic_hat_dataset.json as a list of records,
# keeping non-ASCII characters as-is (force_ascii=False) and indenting by 4 spaces.
all_examples.to_json("synthetic_hat_dataset.json", orient="records", force_ascii=False, indent=4)

# Read and split

In [None]:
import pandas as pd
# Load the records-oriented JSON file written earlier back into a pandas DataFrame.
synthetic_df = pd.read_json("synthetic_hat_dataset.json", orient="records")

In [None]:
# Integer class id assigned to each hat label.
hat_map = dict(red=0, white=1, black=2, yellow=3, green=4)

# Normalise the labels to lowercase so they line up with the hat_map keys.
synthetic_df['hat'] = synthetic_df['hat'].str.lower()

# Show the distinct labels, then how many rows carry each one.
print(synthetic_df['hat'].unique())
print(synthetic_df['hat'].value_counts())

In [None]:
# Replace each hat label with its integer id from hat_map.
# NOTE(review): labels that are not keys of hat_map become NaN, and re-running this
# cell maps the already-numeric column a second time — check the printed values below.
synthetic_df['hat'] = synthetic_df['hat'].map(hat_map)
print(synthetic_df['hat'].unique())
# Print how many rows each class id has.
print(synthetic_df['hat'].value_counts())

In [None]:
# Split into train (80%) and test (20%) sets, stratified on the hat label,
# with a fixed random_state so the split is repeatable.
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(synthetic_df, test_size=0.2, random_state=42, stratify=synthetic_df['hat'])

In [5]:
import json
def _load_utterances(path):
    """Return the set of 'utterance' values stored in the JSON list at ``path``."""
    # The context manager closes the file; the previous bare open() calls left
    # both handles open.
    with open(path) as handle:
        return {d['utterance'] for d in json.load(handle)}


# Leakage check: count the test utterances that also appear in the training set.
train_texts = _load_utterances("/home/atlas/hlt/HLT/Models Fine Tuning/synthetic_train_dataset.json")
test_texts = _load_utterances("/home/atlas/hlt/HLT/Models Fine Tuning/synthetic_test_dataset.json")

print(f"Overlap: {len(train_texts & test_texts)} / {len(test_texts)}")

Overlap: 0 / 888
