#**Step 1: Install All the Required Packages**

In [1]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q trl xformers wandb datasets einops sentencepiece

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.4/297.4 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for accelerate (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━

#**Step 2: Import All the Required Libraries**

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging, TextStreamer
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os, torch, wandb, platform, warnings
from datasets import load_dataset
from trl import SFTTrainer
from huggingface_hub import notebook_login
#Use a sharded model to fine-tune in the free version of Google Colab.
base_model = "mistralai/Mistral-7B-v0.1" #bn22/Mistral-7B-Instruct-v0.1-sharded

- Free Google Colab offers a 15GB Graphics Card (Limited Resources --> Barely enough to store Llama 2–7b’s weights)

- We also need to consider the overhead due to optimizer states, gradients, and forward activations

- Full fine-tuning is not possible here: we need parameter-efficient fine-tuning (PEFT) techniques like LoRA or QLoRA.

- To drastically reduce the VRAM usage, we must fine-tune the model in 4-bit precision, which is why we’ll use QLoRA here.

**Access token**: login in huggingface

In [None]:
from huggingface_hub import login
token = "your_access_token"
login(token=token)

#**Step 3**: Preprocessing the Dataset

In [15]:
gen = "Young"

train_dataset = f"data/GenerationAggregated/{gen}_train_set.csv"
test_dataset = f"data/GenerationAggregated/{gen}_test_set.csv"
val_dataset = f"data/GenerationAggregated/{gen}_test_set.csv"

#PIER: Young
# train_dataset = f"/content/drive/MyDrive/dataset/{gen}_train_set.csv"
# test_dataset = f"/content/drive/MyDrive/dataset/{gen}_test_set.csv"

#MAREM: Old
#train_dataset = f"/content/drive/MyDrive/Progetti/INLG_preliminary_tests/{gen}_train_set.csv"
#test_dataset = f"/content/drive/MyDrive/Progetti/INLG_preliminary_tests/{gen}_test_set.csv"

In [4]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [16]:
with open(f"{train_dataset}") as f:
  lines = f.readlines()
  for i, line in enumerate(lines):
    print(line)
    if i > 3:
      break

id_original,text,parent_text,label,source

fj5br8r,Dew it,It's not the jedi way.,iro,reddit

fj51fe6,"I know this sounds like a stupid question, but the guy I’m currently seeing said he’s not seeing anyone else and agreed to let me know if he does see someone else, and the same vice versa. Does that mean we are exclusive for now? I’m new to all this lol (I’m 25, just got out of a long distance relationship)",You're exclusive when you stop seeing other people.,iro,reddit

fj5c0s6,hell yeah,Mint chocolate chip.,iro,reddit

fj5e8r4,Well should I insert a picture of penis or something?,Not many. It's not original enough.,iro,reddit



In [17]:
import pandas as pd

# Leggi il dataset
train_dataset = pd.read_csv(train_dataset)
val_dataset = pd.read_csv(val_dataset)
dataset = pd.concat([train_dataset,val_dataset], axis=0)


# Filtra solo le colonne specificate
dataset = dataset[['id_original', 'text', 'parent_text', 'label']]

# Modifica i valori della colonna 'label'
dataset['label'] = dataset['label'].map({'iro': 'ironic', 'not': 'serious'})

# Visualizza il nuovo dataset
print(dataset)

             id_original                                               text   
0                fj5br8r                                             Dew it  \
1                fj51fe6  I know this sounds like a stupid question, but...   
2                fj5c0s6                                          hell yeah   
3                fj5e8r4  Well should I insert a picture of penis or som...   
4                fj5by78                                               Haha   
..                   ...                                                ...   
411  1538253852000063488  Wonder how much that clash of heads affected t...   
412  1538299467505577986  @peterswellman Puck outs still need work. Not ...   
413  1538280344331464706                                @Alanmc1885 Course!   
414  1538255178498990083       @DanDartsDawson Shocking tactics by them Dan   
415  1538413321975734272  @ShaneFontaine7 Married Colm 'the gooch' Coope...   

                                           parent_t

In [None]:
import re 
replace_user_mentions = lambda text: re.sub(r'@(\w+)\b', '@user', text, flags=re.IGNORECASE)

def preprocessing (df, parent_text, text):
    df["parent_text"] = df["parent_text"].apply(replace_user_mentions)
    df["text"] = df["text"].apply(replace_user_mentions)

    return df

In [None]:
import nltk
nltk.download('punkt')

def limit_token_length(df, column_name, max_tokens=250):
    """
    Limit the token length of strings in a specified column of a DataFrame.
    
    Parameters:
        df (DataFrame): The DataFrame containing the column to be modified.
        column_name (str): The name of the column to be modified.
        max_tokens (int): The maximum number of tokens allowed in a string.
        
    Returns:
        DataFrame: The DataFrame with modified strings in the specified column.
    """
    tokenized_column = df[column_name].apply(nltk.word_tokenize)
    truncated_tokens = tokenized_column.apply(lambda x: x[:max_tokens])
    truncated_strings = truncated_tokens.apply(' '.join)
    df[column_name] = truncated_strings
    return df

In [None]:
dataset = preprocessing(dataset, "parent_text", "text")
dataset = limit_token_length(dataset, "parent_text") #'@user' becomes '@ user'

In [18]:
dataset = dataset[dataset['label'].str.contains('ironic')]

#for not ironic
#dataset = dataset[dataset['label'].str.contains('serious')]

(474, 4)


In [8]:
#da aggiungere la prospettiva esempio: "Sei una donna di x anni"
instruction = "[INST] You are given a text (INPUT) and you have to generate a following ironic reply. (OUTPUT) [/INST]"
dataset['instr'] = dataset.apply(lambda row: f"<s> {instruction} [INPUT] {row['parent_text']} [/INPUT] [OUTPUT] {row['text']} [/OUTPUT] </s>", axis=1)

print(dataset['instr'][0])

<s> [INST] You are given a text (INPUT) and you have to generate a following ironic reply. (OUTPUT) [/INST] [INPUT] African Greys are supposed to be really smart and friendly. [/INPUT] [OUTPUT] And how much are we looking at for one of these [/OUTPUT] </s>


In [9]:
print(dataset['instr'].apply(lambda x: x[-100:]))
print(len(dataset))

0      smart and friendly. [/INPUT] [OUTPUT] And how ...
1      tertainment profits (movies, books, tv shows) ...
2      h of their corpses. [/INPUT] [OUTPUT] This is ...
3      reply. (OUTPUT) [/INST] [INPUT] Your mother [/...
4      reply. (OUTPUT) [/INST] [INPUT] Light switch [...
                             ...                        
253    UTPUT] @1womanandadog @MichaelTakeMP @phil_cou...
254    e mine, she will be sitting with her paw poise...
255     fully process… [/INPUT] [OUTPUT] @RealCandace...
256    PUT] @_nataliej__ Who you talking abt me ?? [/...
257    ned for more territory. I think he’s weak. I h...
Name: instr, Length: 258, dtype: object
258


In [10]:
from datasets import Dataset

dataset = Dataset.from_pandas(dataset)

In [11]:
# Ottieni la lunghezza totale del dataset
#dataset_length = len(dataset)

# Campiona casualmente il dataset
#dataset_shuffled = dataset.shuffle(seed=512)

# Specifica la frazione dei dati che desideri mantenere (ad esempio, 20%)
#fraction_to_keep = 0.5

# Calcola la lunghezza del sottoinsieme
#subset_length = int(dataset_length * fraction_to_keep)

# Seleziona solo una parte del dataset
#subset_dataset = dataset_shuffled.select(list(range(subset_length)))

In [None]:
#print(subset_dataset['label'])

['ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic', 'ironic',

#**Step 4**: Loading and Training the Model

In [12]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit= True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= torch.bfloat16,
    bnb_4bit_use_double_quant= False,
)
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map={"": 0}
)
model.config.use_cache = False # silence the warnings. Please re-enable for inference!
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True
tokenizer.add_bos_token, tokenizer.add_eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

(True, True)

In [13]:
model = prepare_model_for_kbit_training(model)
peft_config = LoraConfig(
        r=16,
        lora_alpha=16,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"]
    )
model = get_peft_model(model, peft_config)

In [14]:
training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)

In [15]:
# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length= None,
    dataset_text_field="instr",
    tokenizer=tokenizer,
    args=training_params,
    packing= False,
)

# Train model
trainer.train()



Map:   0%|          | 0/258 [00:00<?, ? examples/s]



Step,Training Loss
25,2.0244
50,1.6784
75,1.6313
100,1.2851
125,1.3923




TrainOutput(global_step=130, training_loss=1.5738288695995624, metrics={'train_runtime': 276.558, 'train_samples_per_second': 1.866, 'train_steps_per_second': 0.47, 'total_flos': 2748597159395328.0, 'train_loss': 1.5738288695995624, 'epoch': 2.0})

In [16]:
# Sostituisci "path_to_save" con il percorso desiderato nel tuo drive
trainer.save_model(f"models/{gen}")

# trainer.save_model(f"./drive/MyDrive/modelli_prospettivi/modelli/{gen}") #Pier
# trainer.save_model(f"./drive/MyDrive/Progetti/INLG_preliminary_tests/modelli_prospettivisti/{gen}") #Marem

#**Step 5**: Sentence Generation

In [17]:
import re

parent_text = "In research, it's not enough to be 99.9% sure, you have to be 100% sure. How many of you agree??"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length= 500)
result = str(pipe(f"<s>[INST] {instruction} [/INST] [INPUT] {parent_text} [/INPUT] [OUTPUT]"))


match = re.search(r'\[OUTPUT\](.*?)\[/OUTPUT\]', result)
output = ""
if match:
    output = match.group(1)

print(output.strip())



The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MusicgenMelodyForCausalLM', 'MvpForCausalLM', 'OpenLlam

I'm not a researcher, but I'm pretty sure that's not true.


In [18]:
with open(f"{test_dataset}") as f:
  lines = f.readlines()
  for i, line in enumerate(lines):
    print(line)
    if i > 3:
      break

id_original,text,parent_text,label,source

1579729940236636162,@Aisha11ug Hoooo,"I am not an addict. I just have a f*cking  problem. When I want, I don't take no for an answer.",iro,twitter

1579719679345819648,@ProsaicView Yes. Corruption is deeply embedded in the system.,"@sumantbanerji Excellent piece Sumant -- one other reasons contractors give for not following safety measures is that lower level bureaucracy take so much bribe that to take care of their own margins, the contractor cuts costs to be bare bone. In return, the safety norms are not enforced",iro,twitter

1579064866916405248,@alwaysdaydreAMY Is it even a word??,"@johnj2555 Mine was awful lol Wordle 477 6/6





In [19]:
# Leggi il dataset
dataset = pd.read_csv(test_dataset)

# Filtra solo le colonne specificate
dataset = dataset[['id_original', 'text', 'parent_text', 'label']]

# Modifica i valori della colonna 'label'
dataset['label'] = dataset['label'].map({'iro': 'ironic', 'not': 'serious'})

# Visualizza il nuovo dataset
print(dataset)

             id_original                                               text  \
0    1579729940236636162                                   @Aisha11ug Hoooo   
1    1579719679345819648  @ProsaicView Yes. Corruption is deeply embedde...   
2    1579064866916405248               @alwaysdaydreAMY Is it even a word??   
3    1579720277877223424                 @ThePawanUpdates Congratulations ?   
4    1579720394873122817            @AshwiniMS_TNIE Many congratulations ??   
..                   ...                                                ...   
349  1538790614279520257  @Thogden Liverpool\nFulham\nForest\nLeicester ...   
350  1538253852000063488  Wonder how much that clash of heads affected t...   
351  1538299467505577986  @peterswellman Puck outs still need work. Not ...   
352  1538280344331464706                                @Alanmc1885 Course!   
353  1538255178498990083       @DanDartsDawson Shocking tactics by them Dan   

                                           parent_t

In [None]:
dataset = preprocessing(dataset, "parent_text", "text")
dataset = limit_token_length(dataset, "parent_text")

In [20]:
#dataset['instr'] = dataset.apply(lambda row: f"<s> {instruction} [CONTX] {row['parent_text']} [/CONTX] [TXT] {row['text']} [/TXT] [LABEL] ", axis=1)
dataset['instr'] = dataset.apply(lambda row: f"<s> {instruction} [INPUT] {row['parent_text']} [/INPUT] [OUTPUT]", axis=1)

print(dataset['instr'][0])

<s> [INST] You are given a text (INPUT) and you have to generate a following ironic reply. (OUTPUT) [/INST] [INPUT] I am not an addict. I just have a f*cking  problem. When I want, I don't take no for an answer. [/INPUT] [OUTPUT]


In [1]:
label_test = []
import csv
import pandas as pd

file_csv = f"{gen}_ironic_sentences.csv"
data_path = "output_sentences"
# data_path = "./drive/MyDrive/modelli_prospettivi" #Pier
# data_path = "./drive/MyDrive/Progetti/INLG_preliminary_tests/modelli_prospettivisti" #Marem
rows = []
row = {}
for i in range (0, len(dataset)):
  pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=500)
  try:
    result = str(pipe(dataset['instr'][i]))
    print(result)
    match = re.search(r'\[OUTPUT\](.*?)\[/OUTPUT\]', result)
    output = ""
    if match:
        output = match.group(1)
    else:
        # match = re.search(r'\[OUTPUT\](.*?)', result)
        match = re.search(r'\[OUTPUT\]\s*(.*)', result)
        output = "<incomplete_generation>"+match.group(1)
  except:
    output = "max_token_generation_limit"

  row = {
    'id' :  dataset['id_original'][i],
    'gold_sentence': dataset['text'][i],
    'generated_sentece':[]
    }

  row['generated_sentece'].append(output)
  rows.append(row)

  # Crea un DataFrame da rows
  df = pd.DataFrame(rows)
  df.to_csv(f'{data_path}/{gen}/{file_csv}', index=False)


SyntaxError: invalid syntax (997096030.py, line 7)