## Requirements
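- protobuf, sentencepiece, bitsandbytes (in addition to the libraries imported in the cells below: pandas, torch, transformers, scikit-learn)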

In [1]:
import pandas as pd
import json
# load the data in one single dataframe
def load_data(file_path):
    """Parse the UTF-8 JSON file at ``file_path`` and wrap the result in a DataFrame."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    frame = pd.DataFrame(parsed)
    return frame

In [2]:
def refactor(df):
    """Flatten a dialogue-level DataFrame into one row per utterance.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``'id'``, ``'topic'`` and ``'utterances'``;
        each ``'utterances'`` cell is an iterable of dicts, one per utterance.

    Returns
    -------
    pandas.DataFrame
        One row per utterance, holding the utterance's own keys plus
        ``'dialogue_id'`` and ``'topic'`` copied from the parent dialogue.
        An input without rows yields an empty DataFrame.
    """
    all_utterances = []

    for _, row in df.iterrows():
        for utterance in row['utterances']:
            # Build a new dict rather than writing into the caller's objects:
            # the dicts stored inside `df` must stay untouched so the input
            # frame is unchanged after the call.
            all_utterances.append(
                {**utterance, 'dialogue_id': row['id'], 'topic': row['topic']}
            )

    # Convert to DataFrame for easier analysis
    return pd.DataFrame(all_utterances)

In [3]:
# Load the dialogue file into a DataFrame; the path is relative to the
# notebook's working directory.
df = load_data('../../dailydialog/dialogues.json')

In [4]:
# Flatten the dialogues into one row per utterance and display the result.
refactor_df = refactor(df)
refactor_df

Unnamed: 0,turn,utterance,emotion,act,hat,dialogue_id,topic
0,0,The kitchen stinks .,disgust,directive,,0,Ordinary_Life
1,1,I'll throw out the garbage . __eou__,no_emotion,commissive,,0,Ordinary_Life
2,0,"So Dick , how about getting some coffee for to...",happiness,directive,,1,Ordinary_Life
3,1,Coffee ? I don ’ t honestly like that kind of ...,disgust,commissive,,1,Ordinary_Life
4,2,"Come on , you can at least try a little , besi...",no_emotion,directive,,1,Ordinary_Life
...,...,...,...,...,...,...,...
102963,10,"Well , thank you very much for all that inform...",no_emotion,directive,,13117,Finance
102964,11,Are you going to make an offer today ?,no_emotion,question,,13117,Finance
102965,12,Yes . My customer is in urgent need of the ste...,no_emotion,inform,,13117,Finance
102966,13,"Ok , I'll get this rate right away .",no_emotion,commissive,,13117,Finance


In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 1) Hugging Face checkpoint to load.
# NOTE(review): the original comment described this as a pre-quantized 4-bit
# (gguf/q4_0) checkpoint, but the 4-bit quantization is requested at load time
# through BitsAndBytesConfig in a later cell — confirm the intended checkpoint.
model_name = "mistralai/Mistral-7B-Instruct-v0.3"

In [None]:
import os

# `!export VAR=...` runs in a throw-away subshell, so the variable never reaches
# the kernel process that downloads the model. Read the token from the
# environment the notebook was started with instead (never paste it into the
# notebook) and expose it under HF_TOKEN, the variable name documented by
# huggingface_hub, as well as under the name this notebook originally used.
hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN", "")
if hf_token:
    os.environ["HF_TOKEN"] = hf_token
    os.environ["HUGGINGFACE_HUB_TOKEN"] = hf_token

In [4]:
# 2) Load the tokenizer for the chosen checkpoint.
# NOTE(review): trust_remote_code=True allows code shipped in the checkpoint
# repository to run locally — confirm it is actually needed for this model.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

In [5]:
# 3) 4-bit quantization settings passed to from_pretrained below:
#    "nf4" quantization type with float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

In [6]:
# Per-device memory budget in bytes, passed to from_pretrained as max_memory.
max_mem = {
    0: 5_300 * 1024**2,      # GPU 0: 5,300 MiB
    "cpu": 60 * 1024**3,     # CPU: 60 GiB, for everything that does not fit on the GPU
}

# 4) Load the model on the GPU (device_map="auto" places layers on the GPU until it is full)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
    max_memory=max_mem,
    trust_remote_code=True
)
# Switch to evaluation mode; as the last expression this also displays the model.
model.eval()

Loading checkpoint shards: 100%|██████████| 3/3 [00:31<00:00, 10.51s/it]


MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32768, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): Mist

In [7]:
# 5) Prompt template sent to the model. "[CAP]" (hat name) and "[DEFINITION]"
#    (hat description) are placeholders filled in with str.replace before use.
prompt_prefix = """
You are helping create a synthetic dataset for a classification task.
The goal is to generate single-sentence utterances that reflect one specific thinking style based on Edward De Bono's Six Thinking Hats. Here is the definition for the [CAP] hat:
[DEFINITION]
Please generate 10 distinct, realistic, mixed length utterances that clearly follow this definition. Each utterance must reflect the corresponding thinking style.
Only output a JSON list of objects, each in the format:
{"utterance": "your sentence here", "hat": "[CAP]"}
Respond only with the json text. No explanations, no notes, no markdown. Only valid JSON.
Now generate 10 utterances for the [CAP] hat.
Your answer must start with the json list and nothing else.
"""

In [8]:
# Plain-text definition of each hat, substituted for "[DEFINITION]" in the
# prompt template. Five hats are defined here: white, black, red, yellow, green.
# NOTE(review): these strings are sent to the model verbatim, so rewording them
# (even fixing typos) changes what gets generated.
white_hat_definition = "Exchanging or providing plain informations. Things generally true, things that happened to someone. Not trying to convince anyone."
black_hat_definition = "Analysis of a situation. Answers the why of something that is not being done. Negative analysis, explaining weak points of a thing, logically."
red_hat_definition = "Emotions involved in the answer. The utterance is clearly stated due to emotion involved. Intuitions, feelings, gut reactions. No need for logical justification."
yellow_hat_definition = "Statements that highlight the positive sides (negatives can exists). Changes that offer benefits. Open up remote but highly desirable possibilities. Reflect ideas to believe in. Provide encouragement to take action. Express positive judgments."
green_hat_definition = "Proposing new point of views. New Ideas. Solutions to problems. Imagining new scenarios, going beyond what is known."

In [9]:
def _fill_hat_prompt(hat_name, definition):
    """Return ``prompt_prefix`` with "[CAP]" replaced first, then "[DEFINITION]"."""
    return prompt_prefix.replace("[CAP]", hat_name).replace("[DEFINITION]", definition)


# One fully substituted prompt per hat.
white_hat_prompt = _fill_hat_prompt("White", white_hat_definition)
black_hat_prompt = _fill_hat_prompt("Black", black_hat_definition)
red_hat_prompt = _fill_hat_prompt("Red", red_hat_definition)
yellow_hat_prompt = _fill_hat_prompt("Yellow", yellow_hat_definition)
green_hat_prompt = _fill_hat_prompt("Green", green_hat_definition)

In [10]:
import json

def remove_spaces_outside_quotes(text):
    """Remove every space character that is not inside a double-quoted string.

    A backslash inside a string escapes the character that follows it, so an
    escaped quote (``\\"``) does not end the string and the spaces after it
    are preserved.

    Parameters
    ----------
    text : str
        Text to compact, typically JSON-like model output.

    Returns
    -------
    str
        ``text`` with the spaces outside double-quoted strings removed; all
        other characters are kept in order.
    """
    result = []
    inside_string = False
    escaped = False  # True right after an unconsumed backslash inside a string

    for char in text:
        if inside_string:
            result.append(char)
            if escaped:
                # This character was escaped by the previous backslash.
                escaped = False
            elif char == '\\':
                escaped = True
            elif char == '"':
                inside_string = False
        elif char == '"':
            inside_string = True
            result.append(char)
        elif char != ' ':
            # Outside strings only the space character is dropped.
            result.append(char)

    return ''.join(result)

def _salvage_json_objects(text):
    """Return the complete JSON objects at the start of the first ``[{`` list in ``text``."""
    start = text.find("[{")
    if start == -1:
        return []

    decoder = json.JSONDecoder()
    objects = []
    position = start + 1  # index of the first '{'
    while position < len(text) and text[position] == "{":
        try:
            parsed, position = decoder.raw_decode(text, position)
        except (ValueError, RecursionError):
            # First incomplete or malformed object: keep what was read so far.
            break
        objects.append(parsed)
        if position < len(text) and text[position] == ",":
            position += 1
    return objects


def generate_synthetic_examples(prompt, model, tokenizer, max_new_tokens=512):
    """Sample one completion for ``prompt`` and parse the JSON list it contains.

    Parameters
    ----------
    prompt : str
        Full prompt text; it is tokenized as-is.
    model
        Object exposing ``device`` and ``generate(**inputs, ...)``.
    tokenizer
        Callable tokenizer exposing ``decode`` and ``eos_token_id``.
    max_new_tokens : int, default 512
        Upper bound on the number of generated tokens.

    Returns
    -------
    list
        The parsed list found between the first ``[{`` and the last ``}]`` of
        the completion. If that text is not valid JSON (for example because the
        generation stopped at ``max_new_tokens`` in the middle of an object),
        the complete objects at the start of the list are returned instead;
        the list is empty when nothing can be recovered. Parsing problems are
        printed, never raised.
    """
    # NOTE(review): the prompt is sent as plain text; confirm whether this
    # Instruct checkpoint should be prompted through the tokenizer's chat template.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens, skipping the echoed prompt.
    prompt_length = inputs["input_ids"].shape[-1]
    decoded = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True).strip()
    # Collapse every whitespace run (newlines included) into a single space,
    # then drop the spaces that are outside double-quoted strings.
    decoded = ' '.join(decoded.split())
    decoded = remove_spaces_outside_quotes(decoded)

    try:
        json_start = decoded.find("[{")
        json_end = decoded.rfind("}]") + 2
        return json.loads(decoded[json_start:json_end])
    except Exception as e:
        print("Errore nel parsing:", e)
        print("Output ricevuto:\n", decoded)
        # A truncated generation still contains usable objects before the cut;
        # return those instead of discarding the whole batch.
        recovered = _salvage_json_objects(decoded)
        if recovered:
            print(f"Recuperati {len(recovered)} oggetti JSON completi prima del punto di errore")
        return recovered

In [11]:
# Run the White-hat prompt 100 times and pool every returned example.
white_examples = pd.DataFrame([
    example
    for _ in range(100)
    for example in generate_synthetic_examples(white_hat_prompt, model, tokenizer)
])
# Count the rows whose utterance repeats an earlier row.
white_examples['utterance'].duplicated().sum()

np.int64(216)

In [12]:
# Run the Green-hat prompt 100 times and pool every returned example.
green_examples = pd.DataFrame([
    example
    for _ in range(100)
    for example in generate_synthetic_examples(green_hat_prompt, model, tokenizer)
])
# Count the rows whose utterance repeats an earlier row.
green_examples['utterance'].duplicated().sum()

Errore nel parsing: Expecting ',' delimiter: line 1 column 1293 (char 1292)
Output ricevuto:
 [{"utterance":"What if we integrated AI capabilities into our current customer service system to provide faster and more accurate responses?","hat":"Green"},{"utterance":"Could we design a new product line that caters to both the environmentally conscious and the tech-savvy market?","hat":"Green"},{"utterance":"What if we implemented a loyalty program that rewards customers for recycling our products?","hat":"Green"},{"utterance":"How about a mobile app that allows users to customize their own eco-friendly home decor?","hat":"Green"},{"utterance":"What if we created a system that uses AI to predict and prevent equipment failures in our manufacturing plants?","hat":"Green"},{"utterance":"Could we develop a virtual reality training program for new hires in the tech industry?","hat":"Green"},{"utterance":"What if we designed a smart city infrastructure that optimizes traffic flow and reduces carb

np.int64(24)

In [13]:
# Run the Red-hat prompt 100 times and pool every returned example.
red_examples = pd.DataFrame([
    example
    for _ in range(100)
    for example in generate_synthetic_examples(red_hat_prompt, model, tokenizer)
])
# Count the rows whose utterance repeats an earlier row.
red_examples['utterance'].duplicated().sum()

np.int64(151)

In [14]:
# Run the Black-hat prompt 100 times and pool every returned example.
black_examples = pd.DataFrame([
    example
    for _ in range(100)
    for example in generate_synthetic_examples(black_hat_prompt, model, tokenizer)
])
# Count the rows whose utterance repeats an earlier row.
black_examples['utterance'].duplicated().sum()

np.int64(126)

In [15]:
# Run the Yellow-hat prompt 100 times and pool every returned example.
yellow_examples = pd.DataFrame([
    example
    for _ in range(100)
    for example in generate_synthetic_examples(yellow_hat_prompt, model, tokenizer)
])
# Count the rows whose utterance repeats an earlier row.
yellow_examples['utterance'].duplicated().sum()

np.int64(34)

In [None]:
# Drop repeated utterances within each hat (the first occurrence is kept).
white_examples = white_examples.drop_duplicates(subset='utterance')
yellow_examples = yellow_examples.drop_duplicates(subset='utterance')
red_examples = red_examples.drop_duplicates(subset='utterance')
black_examples = black_examples.drop_duplicates(subset='utterance')
green_examples = green_examples.drop_duplicates(subset='utterance')

# Pool the five hats. An utterance generated for more than one hat is kept only
# once, with the label of the hat that comes first in the list below.
all_examples = pd.concat([white_examples, yellow_examples, red_examples, green_examples, black_examples]).drop_duplicates(subset='utterance')

# Write one JSON object per line. Iterating a DataFrame directly yields its
# column labels, so the rows are converted to dicts first. The encoding is set
# explicitly because ensure_ascii=False writes non-ASCII characters verbatim.
with open("synthetic_hat_dataset.jsonl", "w", encoding="utf-8") as f:
    for item in all_examples.to_dict(orient="records"):
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

In [17]:
# Renumber the rows 0..n-1, discarding the index inherited from the per-hat frames.
all_examples = all_examples.reset_index(drop=True)

In [18]:
# Save the pooled examples as a JSON array of records (4-space indent,
# non-ASCII characters written as-is).
all_examples.to_json("synthetic_hat_dataset.json", orient="records", force_ascii=False, indent=4)

In [3]:
import pandas as pd
# Read the saved JSON records back into a DataFrame.
synthetic_df = pd.read_json("synthetic_hat_dataset.json", orient="records")

In [4]:
# 80/20 train/test split, stratified on the 'hat' label, with a fixed
# random_state so the split is repeatable.
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(synthetic_df, test_size=0.2, random_state=42, stratify=synthetic_df['hat'])

In [5]:
# Save both splits as JSON arrays of records, using the same options as the
# full dataset file (4-space indent, non-ASCII characters written as-is).
train_df.to_json("synthetic_train_dataset.json", orient="records", force_ascii=False, indent=4)
test_df.to_json("synthetic_test_dataset.json", orient="records", force_ascii=False, indent=4)