## Requirements
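- Extra Python packages: `protobuf`, `sentencepiece`, `bitsandbytes` (in addition to `pandas`, `torch`, `transformers` and `tqdm`, which the cells below import).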

In [1]:
import pandas as pd
import json
# Read a whole JSON file into a single DataFrame.
def load_data(file_path):
    """Parse the UTF-8 JSON file at ``file_path`` and wrap the result in a DataFrame.

    Args:
        file_path: path of the JSON file to read.

    Returns:
        pandas.DataFrame built directly from the parsed JSON content.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        records = json.loads(handle.read())
    return pd.DataFrame(records)

In [2]:
def refactor(df):
    """Flatten a dialogue-level DataFrame into one row per utterance.

    Each row of ``df`` is read through its 'utterances', 'id' and 'topic'
    fields. Every utterance dictionary becomes one output row, extended with
    the 'dialogue_id' and 'topic' of the dialogue it came from.

    The input is left untouched: a new dictionary is built per utterance
    instead of writing the extra keys into the dictionaries stored in ``df``.

    Args:
        df: DataFrame whose rows carry 'id', 'topic' and 'utterances', where
            'utterances' is a list of utterance dictionaries.

    Returns:
        pandas.DataFrame with one row per utterance, in dialogue order.
    """
    all_utterances = []

    for _, row in df.iterrows():
        for utterance in row['utterances']:
            # Copy-and-extend rather than mutate, so the caller's data keeps
            # its original shape.
            all_utterances.append(
                {**utterance, 'dialogue_id': row['id'], 'topic': row['topic']}
            )

    # Build the frame once from the collected records.
    return pd.DataFrame(all_utterances)

In [3]:
# Load the dialogue-level JSON into a DataFrame; the path is relative to the
# notebook's working directory.
df = load_data('../../dailydialog/dialogues.json')

In [4]:
# Flatten the dialogue-level frame into one row per utterance, then display it.
refactor_df = refactor(df)
refactor_df

Unnamed: 0,turn,utterance,emotion,act,hat,dialogue_id,topic
0,0,The kitchen stinks .,disgust,directive,,0,Ordinary_Life
1,1,I'll throw out the garbage . __eou__,no_emotion,commissive,,0,Ordinary_Life
2,0,"So Dick , how about getting some coffee for to...",happiness,directive,,1,Ordinary_Life
3,1,Coffee ? I don ’ t honestly like that kind of ...,disgust,commissive,,1,Ordinary_Life
4,2,"Come on , you can at least try a little , besi...",no_emotion,directive,,1,Ordinary_Life
...,...,...,...,...,...,...,...
102963,10,"Well , thank you very much for all that inform...",no_emotion,directive,,13117,Finance
102964,11,Are you going to make an offer today ?,no_emotion,question,,13117,Finance
102965,12,Yes . My customer is in urgent need of the ste...,no_emotion,inform,,13117,Finance
102966,13,"Ok , I'll get this rate right away .",no_emotion,commissive,,13117,Finance


In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# 1) Hugging Face checkpoint to load.
# NOTE(review): the original note described this as a pre-quantized 4-bit
# (gguf/q4_0) checkpoint; the repo id itself carries no such marker, and 4-bit
# quantization is configured separately through BitsAndBytesConfig in this
# notebook — confirm which was intended.
model_name = "mistralai/Mistral-7B-Instruct-v0.3"

In [None]:
import os
from getpass import getpass

# `!export VAR=...` runs in a short-lived subshell, so the variable never
# reaches this kernel's environment. Set it on os.environ instead, and prompt
# for the token rather than pasting it into the notebook.
# NOTE(review): recent huggingface_hub releases document `HF_TOKEN` as the
# variable they read; the name used originally is set as well — confirm which
# one the installed version expects.
hf_token = (
    os.environ.get("HF_TOKEN")
    or os.environ.get("HUGGINGFACE_HUB_TOKEN")
    or getpass("Hugging Face access token (leave empty for anonymous access): ")
)
if hf_token:
    os.environ["HF_TOKEN"] = hf_token
    os.environ["HUGGINGFACE_HUB_TOKEN"] = hf_token

In [8]:
# 2) Load the tokenizer for the selected checkpoint.
# NOTE(review): trust_remote_code=True allows code shipped with the Hub repo to
# run locally — confirm it is actually needed for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

In [9]:
# 3) 4-bit quantization settings handed to the model loader:
#    NF4 quantization type, float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

In [10]:
# Byte-unit helpers for the memory budget below.
MIB = 1024**2
GIB = 1024**3

# Per-device memory caps passed to the loader: device 0 (the GPU) and "cpu".
# Original author's note: the CPU entry is for everything that does not fit on
# the GPU.
max_mem = {
    0: 5_300 * MIB,
    "cpu": 60 * GIB,
}

# 4) Load the quantized model. Original author's note: device_map="auto" places
# layers on the GPU until it is full.
# NOTE(review): trust_remote_code=True allows code shipped with the Hub repo to
# run locally — confirm it is actually needed for this checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    max_memory=max_mem,
    trust_remote_code=True,
)
# Switch to evaluation mode; as the last expression this also shows the model repr.
model.eval()

Loading checkpoint shards: 100%|██████████| 3/3 [00:25<00:00,  8.35s/it]


MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32768, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): Mist

In [11]:
# 5) The hat "definition" plus the labelling instruction that is prepended to
# every utterance sent to the model. The typo "utternace" in the instruction
# line has been corrected; the rest of the text is unchanged.
prompt_prefix = """
Definition: A green hat is the hat of creativity. 
Under the green hat you are permitted to put forward 'possibilities'. 
It is under the green hat that suggested courses of action are put forward: 'We could do this, or this, or this.' 
The green hat includes both 'the top of the head' creativity and 'deliberate' creativity. 
New ideas, new concepts and new perceptions. The deliberate creation of new ideas. 
Alternatives and more alternatives. Change. New approaches to problems. 
Label the next utterance as a green hat or not.
Answer only with "Y" for yes or "N" for No.
"""

In [12]:
# 6) Classification function
def is_green_hat(phrases):
    """Ask the language model whether each utterance is a "green hat" statement.

    For every utterance the prompt is ``prompt_prefix`` followed by the quoted
    utterance and an ``Answer:`` cue. Exactly one new token is generated
    without sampling, and its decoded, stripped text is the reply.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``prompt_prefix``.

    Args:
        phrases: iterable of utterance strings. A single string is also
            accepted and treated as one utterance, instead of being iterated
            character by character.

    Returns:
        list[str]: one reply per utterance, in input order. The prompt asks
        for "Y" or "N", but the reply is whatever single token the model
        produced; it is not validated here.
    """
    if isinstance(phrases, str):
        phrases = [phrases]

    results = []
    for text in phrases:
        prompt = str(prompt_prefix) + f'\nUtterance: "{text}"\nAnswer:'
        # The tokenizer truncates prompts longer than 512 tokens.
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(model.device)
        prompt_len = inputs["input_ids"].shape[-1]
        with torch.no_grad():
            out = model.generate(
                **inputs,
                max_new_tokens=1,
                do_sample=False,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.eos_token_id
            )
        # Decode only the tokens that follow the prompt.
        reply = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True).strip()
        results.append(reply)
    return results

In [65]:
# 7) Usage example: run the classifier on a batch of hand-written utterances and
# print the one-token replies, in the same order as the inputs.
# NOTE(review): these look like intended positive (green-hat) examples — confirm
# where they come from.
test_sentences = [
    "Shoppers usually pay for the goods they buy. Let us reverse that. Po, the store pays the customers",
    "Po cars should have square wheels",
    "Po planes should land upside down",
    "Po shoppers should be paid to buy things",
    "Po executives should promote themselves",
    "Po a polluting factory should be downstream of itself",
    "Shoppers usually pay for the goods they buy. Let us reverse that. Po, the store pays the customers",
    "This could lead to the trading stamp idea, which, in effect, paid shoppers a tiny amount for each purchase",
    "This could lead to the idea that the tills are set up so that at every thousand dollars of input they pay out a jackpot of some sort",
    "I do not see how your idea of an ‘honour system’ store could ever work because it could so easily be abused. But I am going to put on my  hat to treat it as a provocation. That leads to the idea of people adding up their own bills with random checks. Presumably mistakes would even out in each direction",
    "So we have cigarette po frog. A frog suggests hopping, so we could have a cigarette that went out after a short while. This might be of benefit in preventing fires. It could also allow a smoker to have a short smoke and then to use that cigarette later. This in turn leads to a new brand to be called  shorts , which are indeed designed to be very short and give only a two- to three- minute smoke",
    "I want some ideas to do with television sets. The random word is cheese, so television po cheese. Cheese has holes, Po the TV screen has holes",
    "Our rival newspaper has just raised its price. Put on your  hat and list all our alternatives",
    "There are only three possible alternatives. We can leave the price the same. We can lower it. Or, we can raise it. There is nothing else we can do",
    "What I really want to do is both to raise and lower the price at the same time. We shall create a low price commodity line and a high price premium line"
]

print(is_green_hat(test_sentences))

['Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y']


In [66]:
# Second batch of utterances fed to the same classifier; this reassigns
# `test_sentences`, replacing the list from the previous example cell.
# NOTE(review): presumably these are meant as non-green-hat examples — confirm.
test_sentences = [
    "My feeling is that boredom is responsible for much juvenile crime",
    "My feeling is that the cinema box office wants a few spectaculars that are heavily promoted",
    "I feel we are being bullied into an agreement we do not want",
    "You never listen to anyone else",
    "I am very pleased with the way this conference is going. Is that the general view",
    "My feeling is that we all want to get this agreement settled and signed",
    "Don’t look at it as a defeat. Look at it as a powerful way of finding out the weaknesses and strengths of his tennis game",
    "Would this offer be acceptable if it were to come as an initiative from your side",
    "Write it off as an essential learning experience rather than an error in judgement. Learning is always expensive. We won’t have to go through it again",
    "We all know that these negotiations are taking place against a background of extreme suspicion. Let us try to imagine what our thinking would be if each side really trusted the other side",
    "There is a feeling that what we decide here is not going to make much difference. Events have taken over. Let us imagine that this is not so and that we do have it in our power to control things",
    "We do have to be conscious of the background of anger that is present",
    "The proposed restriction on your work for competing companies is obviously a sensitive point. We’ll keep clear of that for the moment",
    "The union executive is never going to agree to anything that comes across as a wage cut. That has been expressed forcibly enough",
    "The ability to cross union demarcation lines is very important to our productivity",
    "We must insist that the proper disciplinary procedures be followed. We are not saying that Jones is innocent but the procedures laid down must be followed",
    "I do not feel that lowering prices will actually increase sales"
]

# Print the one-token replies, in the same order as the inputs.
print(is_green_hat(test_sentences))

['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'Y', 'Y', 'N', 'N', 'N', 'Y', 'N', 'N']


In [None]:
# Work on the tail of the utterance table only, starting at a fixed row offset.
# NOTE(review): the previous comment said "first 37000 lines" while the code
# skipped 89300; 89300 is presumably the point where an earlier, interrupted
# labelling run stopped — confirm before re-running.
START_ROW = 89_300

# Renumber from 0 because the labelling loop addresses rows by 0..len(df)-1.
df = refactor_df.iloc[START_ROW:].reset_index(drop=True)
df

Unnamed: 0,turn,utterance,emotion,act,hat,dialogue_id,topic
0,0,"Hi , Sven , can you help me ? Could you hold t...",no_emotion,directive,,10849,Work
1,1,"Of course , where do you want me to stand ?",no_emotion,question,,10849,Work
2,2,Just there by the door . I need to measure the...,no_emotion,inform,,10849,Work
3,3,I'm glad you're replacing it . We need more sp...,no_emotion,question,,10849,Work
4,4,Tell me what size you want and I'll order the ...,no_emotion,directive,,10849,Work
...,...,...,...,...,...,...,...
18263,10,"Well , thank you very much for all that inform...",no_emotion,directive,,13117,Finance
18264,11,Are you going to make an offer today ?,no_emotion,question,,13117,Finance
18265,12,Yes . My customer is in urgent need of the ste...,no_emotion,inform,,13117,Finance
18266,13,"Ok , I'll get this rate right away .",no_emotion,commissive,,13117,Finance


In [14]:
import os
import pandas as pd
from tqdm.auto import tqdm

# Columns written to the incremental predictions file, and how many labelled
# rows are buffered before each append.
OUT_COLUMNS = ['turn', 'dialogue_id', 'utterance', 'hat']
FLUSH_EVERY = 100

# 1) The DataFrame `df` is expected to exist already (built in an earlier cell).

# Create an empty 'hat' column if the frame does not have one yet.
if 'hat' not in df.columns:
    df['hat'] = None

# 2) Output file that receives the predictions in append mode.
out_file = "hat_preds.csv"
# On the first run, write only the header row.
if not os.path.exists(out_file):
    pd.DataFrame(columns=OUT_COLUMNS).to_csv(out_file, index=False)

# Rows [0, flushed) are already in out_file; rows [0, labelled) have a prediction.
flushed = 0
labelled = 0

# 3) Label row by row. Rows are addressed by the labels 0..len(df)-1, so the
# frame must carry a default 0-based index.
try:
    for idx in tqdm(range(len(df))):
        text = df.at[idx, 'utterance'].replace("__eou__", "")
        df.at[idx, 'hat'] = is_green_hat([text])[0]
        labelled = idx + 1

        # Append a chunk every FLUSH_EVERY labelled rows (.loc slices are inclusive).
        if labelled - flushed >= FLUSH_EVERY:
            df.loc[flushed:idx, OUT_COLUMNS].to_csv(
                out_file, mode='a', header=False, index=False
            )
            flushed = labelled
finally:
    # 4) Write whatever is labelled but not yet on disk. This runs on normal
    # completion (the residual tail) and also when the loop is interrupted, so
    # an interrupt no longer drops the last partial chunk.
    if labelled > flushed:
        df.loc[flushed:labelled - 1, OUT_COLUMNS].to_csv(
            out_file, mode='a', header=False, index=False
        )
        flushed = labelled

# 5) Also keep the whole updated frame in a single file (only reached when the
# loop finishes without an exception).
df.to_csv("df_with_hat.csv", index=False)


 25%|██▌       | 4603/18268 [22:06<1:05:37,  3.47it/s]


KeyboardInterrupt: 

In [74]:
# Report how many rows currently hold a non-null 'hat' value, out of the total.
labelled_rows = df['hat'].notna().sum()
total_rows = len(df)
print("Righe già etichettate:", labelled_rows, "di", total_rows)


Righe già etichettate: 102968 di 102968


In [75]:
# Display the working DataFrame as the cell's output.
df

Unnamed: 0,turn,utterance,emotion,act,hat,dialogue_id,topic
0,0,The kitchen stinks .,disgust,directive,,0,Ordinary_Life
1,1,I'll throw out the garbage . __eou__,no_emotion,commissive,,0,Ordinary_Life
2,0,"So Dick , how about getting some coffee for to...",happiness,directive,,1,Ordinary_Life
3,1,Coffee ? I don ’ t honestly like that kind of ...,disgust,commissive,,1,Ordinary_Life
4,2,"Come on , you can at least try a little , besi...",no_emotion,directive,,1,Ordinary_Life
...,...,...,...,...,...,...,...
102963,10,"Well , thank you very much for all that inform...",no_emotion,directive,,13117,Finance
102964,11,Are you going to make an offer today ?,no_emotion,question,,13117,Finance
102965,12,Yes . My customer is in urgent need of the ste...,no_emotion,inform,,13117,Finance
102966,13,"Ok , I'll get this rate right away .",no_emotion,commissive,,13117,Finance
