# Fine Tune Mistral for hate speech classification

In [1]:
# %%capture
# # Installs Unsloth, Xformers (Flash Attention) and all other packages!
# !pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# !pip install --no-deps  "trl<0.9.0" peft accelerate bitsandbytes datasets #"xformers<0.0.27"
# !pip install -U xformers --index-url https://download.pytorch.org/whl/cu121


In [2]:
# %%capture
# !pip install triton
# # !pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers

In [3]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = torch.float16 # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/mistral-7b-v0.3-bnb-4bit",      # New Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",           # Llama-3 15 trillion tokens model 2x faster!
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",        # Phi-3 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit" ]
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-v0.3", # "unsloth/mistral-7b" for 16bit loading
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.9.post4: Fast Mistral patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA H100 80GB HBM3. Max memory: 79.109 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 9.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

We also add `embed_tokens` and `lm_head` to allow the model to learn out of distribution data.

In [4]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 128, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",

                      "embed_tokens", "lm_head",], # Add for continual pretraining
    lora_alpha = 32,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = True,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth: Offloading input_embeddings to disk to save VRAM


  offloaded_W = torch.load(filename, map_location = "cpu", mmap = True)


Unsloth: Offloading output_embeddings to disk to save VRAM


Unsloth 2024.9.post4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Unsloth: Casting embed_tokens to float32
Unsloth: Casting lm_head to float32


<a name="Data"></a>
### Data Prep

In [5]:
from datasets import load_dataset
import pandas as pd
# Load the QNLI dataset
# label mapping (0 for entailment, 1 for contradiction,)
qnli_dataset = load_dataset("glue", "qnli")
qnli_train_df = pd.DataFrame(qnli_dataset['train'])
qnli_train_df = qnli_train_df[qnli_train_df['label'] == 0]
qnli_validation_df = pd.DataFrame(qnli_dataset['validation'])
qnli_test_df = pd.DataFrame(qnli_dataset['test'])
qnli_test_df = qnli_test_df[qnli_test_df['label'] == 0]


qnli_train_df.head()

Unnamed: 0,question,sentence,label,idx
2,What two things does Popper argue Tarski's the...,He bases this interpretation on the fact that ...,0,2
3,What is the name of the village 9 miles north ...,"On 31 December 1853, the Ottoman forces at Cal...",0,3
5,When is the term 'German dialects' used in reg...,"When talking about the German language, the te...",0,5
6,What was the name of the island the English tr...,"At the end of the Second Anglo-Dutch War, the ...",0,6
8,What does the word 'customer' properly apply to?,The bill also required rotation of principal m...,0,8


In [6]:
# len(qnli_train_df)

In [7]:
# import pandas as pd
# from sklearn.model_selection import train_test_split


# file_path = '/content/questions.csv'

# all_df = pd.read_csv(file_path)

# SEED = 42
# train_percentage = 0.8
# test_percentage = 0.2

# # Split the data into training and test sets
# df, test_df = train_test_split(
#     all_df,
#     test_size=test_percentage,  # 20% for test
#     random_state=SEED,
#     shuffle=True,
# )

# # Verify the splits
# print(f"Training set size: {len(df)}")
# print(f"Test set size: {len(test_df)}")
# df.head()
# from datasets import Dataset
# dataset = Dataset.from_pandas(df)
# print(dataset)


# t_dataset = Dataset.from_pandas(test_df)
# print(t_dataset)

# Create question answering data for the finetuning

In [8]:
import pandas as pd
import random
import tqdm


def create_qnli_dataframe(df):
    data = []
    for idx, row in df.iterrows():
        question = row['question']
        correct_answer = row['sentence']
        incorrect_answers = df[df['sentence'] != correct_answer]['sentence'].sample(4).tolist()
        all_answers = incorrect_answers + [correct_answer]
        random.shuffle(all_answers)
        correct_answer_column = f"{chr(all_answers.index(correct_answer) + ord('A'))}"

        answers_dict = {
            'question': question,
            'A': all_answers[0],
            'B': all_answers[1],
            'C': all_answers[2],
            'D': all_answers[3],
            'E': all_answers[4],
            'correct_answer': correct_answer_column
        }
        data.append(answers_dict)
    new_df = pd.DataFrame(data)

    return new_df

In [9]:
sampled_df = qnli_train_df.sample(frac=0.05, random_state=42)  # random_state ensures reproducibility

new_df = create_qnli_dataframe(sampled_df)
new_df

Unnamed: 0,question,A,B,C,D,E,correct_answer
0,Who did NASA recruit by using flawed safety nu...,Others have argued that excessive regulation s...,The ships are due to become operational from 2...,He concluded that the space shuttle reliabilit...,Japanese (Famicom) cartridges are shaped sligh...,Kathmandu is home to a number of museums and a...,C
1,How much solar energy is captured by photosynt...,It was subject to controversy and strict regul...,"Upon independence, declared in 1973 and recogn...","Photosynthesis captures approximately 3,000 EJ...","Chroniclers recorded that John had a ""mad infa...",The Apple's Last Resort font will display a su...,C
2,What does the CAR get help with with regards t...,"The music of the Romantic era, from roughly th...","In addition, the Central African Republic rece...","In 1985, Jasmine Multimedia created LaserDisc ...",Among the achievements are an increase of pipe...,In conclusion the hair colour of young Greeks ...,B
3,"On Indian Independence Day, kites are flown by...",Livy offers a detailed account of the devotio ...,"The attraction closed on August 30, 2014.","In 1999, the club celebrated its centenari, wi...",Most Delhiites celebrate the day by flying kit...,The archaeologist and adventurer hero Indiana ...,D
4,What types of Christianity do Quakers belong to?,"They include those with evangelical, holiness,...","A variety of groups that predated punk, such a...","Tito visited India from December 22, 1954 thro...","In September 1695, Captain Henry Every, an Eng...","Lee, with Goodman's approval, published the st...",A
...,...,...,...,...,...,...,...
2614,When does the spread of antibacterial resistan...,"Following suburbanization, industrial restruct...","Brick was the ordinary building material, and ...","Whatever human beings perceive is composite, w...",The Boston Globe and the Boston Herald are two...,The spread of antibacterial resistance often o...,E
2615,What's window shapes di Czech Cubist architec...,"By March 2 (UTC), 2010, owners of original PS3...",Though earlier approaches to translation are l...,"Thus, new forms of windows and doors were also...","In 1955, DC Sinclair and G Weddell developed p...","In 1974, there were 475 institutes of higher e...",C
2616,Who said the term 'intellectual property' shou...,The meatiest parts of a bird are the flight mu...,Royal assent is sometimes associated with elab...,While the Paris region's population accounted ...,"In December 2008, the University of Michigan B...",Free Software Foundation founder Richard Stall...,E
2617,When did it reach 2 million occupants>,Ailaa and thwon (alcohol made from rice) are t...,According to family economic and financial edu...,It became a prime destination for African-Amer...,Montana is the home of the Federation of Fly F...,"Thomas served as Fire Chief until June 2008, a...",C


In [10]:
len(new_df)

2619

##### Data formatting

In [11]:
import os, re
from datasets import Dataset, load_dataset

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""


EOS_TOKEN = tokenizer.eos_token


def formatting_prompts_func(examples):
    instruction = "Pick an answer for the following question and resturn only the name of the answer 'A', 'B', 'C', 'D', or 'E'. A question can have multiple answers."
    # inputs  = f"""Question: {examples.question}. A: {examples.A}, B: {examples.B}, C: {examples.C}, D: {examples.D}, E: {examples.E}"""
    inputs =  [f" Question :{examples.question}", f" Answer A :{examples.A}", f" Answer A :{examples.B}", f" Answer A :{examples.C}", f" Answer A :{examples.D}", f" Answer A :{examples.E}"]
    outputs = examples['correct_answer']
    texts   = []
# def formatting_prompts_func(examples):
#     instruction = "Select one of the folloiwng answers for the question:"
#     inputs  = examples['question']
#     answers = [f" Answer A :{examples.A}",
#                f" Answer A :{examples.B}",
#                f" Answer A :{examples.C}",
#                f" Answer A :{examples.D}",
#                f" Answer A :{examples.E}"
#                ]
#     outputs = examples.correct_answer
    # texts   = []

    for input_text, output_text in zip(inputs, outputs):
        text = alpaca_prompt.format(instruction, input_text, output_text) + EOS_TOKEN
        texts.append(text)

    return { "texts": texts }

train_dataset_df = formatting_prompts_func(new_df)


# df = qnli_train_df.drop(['idx', 'label'], axis=1)
# train_dataset= Dataset.from_pandas(df)
# train_dataset = train_dataset.map(formatting_prompts_func, batched = True,)
# test_df = qnli_test_df.drop(['idx', 'label'], axis=1)
# test_dataset = Dataset.from_pandas(test_df)
# test_dataset = test_dataset.map(formatting_prompts_func, batched = True,)

In [12]:
print(train_dataset_df)

{'texts': ["Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nPick an answer for the following question and resturn only the name of the answer 'A', 'B', 'C', 'D', or 'E'. A question can have multiple answers.\n\n### Input:\n Question :0       Who did NASA recruit by using flawed safety nu...\n1       How much solar energy is captured by photosynt...\n2       What does the CAR get help with with regards t...\n3       On Indian Independence Day, kites are flown by...\n4        What types of Christianity do Quakers belong to?\n                              ...                        \n2614    When does the spread of antibacterial resistan...\n2615    What's  window shapes di Czech Cubist architec...\n2616    Who said the term 'intellectual property' shou...\n2617               When did it reach 2 million occupants>\n2618    What is the lowest error rate that oc

In [13]:
train_dataset = dataset = Dataset.from_dict(train_dataset_df)

In [14]:
# import os, re
# from datasets import Dataset, load_dataset

# alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

# ### Instruction:
# {}

# ### Input:
# {}

# ### Response:
# {}"""


# EOS_TOKEN = tokenizer.eos_token

# def formatting_prompts_func(examples):
#     instruction = "Answer the following question"
#     inputs  = examples['question']
#     outputs = examples['sentence']
#     texts   = []
# # def formatting_prompts_func(examples):
# #     instruction = "Select one of the folloiwng answers for the question:"
# #     inputs  = examples['question']
# #     answers = [f" Answer A :{examples.A}",
# #                f" Answer A :{examples.B}",
# #                f" Answer A :{examples.C}",
# #                f" Answer A :{examples.D}",
# #                f" Answer A :{examples.E}"
# #                ]
# #     outputs = examples.correct_answer
#     # texts   = []

#     for input_text, output_text in zip(inputs, outputs):
#         text = alpaca_prompt.format(instruction, input_text, output_text) + EOS_TOKEN
#         texts.append(text)

#     return { "texts": texts }


# df = qnli_train_df.drop(['idx', 'label'], axis=1)
# train_dataset= Dataset.from_pandas(df)
# train_dataset = train_dataset.map(formatting_prompts_func, batched = True,)
# test_df = qnli_test_df.drop(['idx', 'label'], axis=1)
# test_dataset = Dataset.from_pandas(test_df)
# test_dataset = test_dataset.map(formatting_prompts_func, batched = True,)

In [15]:
# train_dataset['texts']

In [16]:
# import pandas as pd
# train_df = pd.DataFrame(train_dataset)
# train_df.head()

In [17]:
# from datasets import load_dataset
# import torch

# # Load your Hugging Face dataset (for example, the MNLI dataset)
# dataset =  train_dataset
# # Define a function to convert the necessary columns to BFloat16
# def convert_to_bfloat16(example):
#     # If the example contains tensors or lists, convert them to BFloat16
#     for key, value in example.items():
#         if isinstance(value, torch.Tensor):
#             example[key] = value.to(torch.bfloat16)
#     return example

# # Apply this function to your dataset (this assumes your dataset contains tensors)
# train_dataset_2 = dataset.map(convert_to_bfloat16)

In [18]:
# model = model.to(torch.bfloat16)


In [19]:
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    dataset_text_field = "texts",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    # args = TrainingArguments(
    #     per_device_train_batch_size = 2,
    #     gradient_accumulation_steps = 4,
    #     warmup_steps = 5,
    #     max_steps = 100,
    #     learning_rate = 2e-4,
    #     # fp16 = True,
    #     bf16 = True,
    #     # fp16 = not torch.cuda.is_bf16_supported(),
    #     # bf16 = torch.cuda.is_bf16_supported(),
    #     logging_steps = 1,
    #     # optim = "adamw_8bit",
    #     weight_decay = 0.01,
    #     lr_scheduler_type = "linear",
    #     seed = 3407,
    #     output_dir = "outputs",
    #     report_to = "none",
    # ),
    args = TrainingArguments(
        # per_device_train_batch_size=2,
        # gradient_accumulation_steps=4,
        # warmup_steps=5,
        max_steps=100,
        learning_rate=2e-4,
        fp16=True,  # Use fp16 instead of bf16
        # logging_steps=1,
        weight_decay=0.01,
        lr_scheduler_type="linear",
        # seed=3407,
        output_dir="outputs",
        # report_to="none",
    )
)

Map (num_proc=2): 100%|██████████| 6/6 [00:00<00:00, 28.62 examples/s]
max_steps is given, it will override any value given in num_train_epochs


In [20]:
#@title Show current memory stats
import torch
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA H100 80GB HBM3. Max memory = 79.109 GB.
6.367 GB of memory reserved.


In [21]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 6 | Num Epochs = 100
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
\        /    Total batch size = 8 | Total steps = 100
 "-____-"     Number of trainable parameters = 603,979,776


Step,Training Loss


In [22]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

56.4432 seconds used for training.
0.94 minutes used for training.
Peak reserved memory = 15.262 GB.
Peak reserved memory for training = 8.895 GB.
Peak reserved memory % of max memory = 19.292 %.
Peak reserved memory for training % of max memory = 11.244 %.


In [23]:
# model.save_pretrained("/content/drive/MyDrive/Models/hasoc2020_all_Mistral_model")


### Inference
Let's run the model!


In [24]:
test_df.head()

NameError: name 'test_df' is not defined

In [26]:
test_questions = pd.read_csv("/home/admin/questions.csv")
test_questions

Unnamed: 0,id,question,answer_A,answer_B,answer_C,answer_D,answer_E
0,0,Devant un exanthème roséoliforme fébrile de l'...,Un exanthème subit,Un mégalérythème épidémique,Une rubéole,Une mononucléose infectieuse,Un syndrome de kawasaki
1,1,"A propos de l’insuffisance cardiaque, quelle(s...",L’auscultation cardiaque peut mettre en éviden...,L’auscultation cardiaque peut mettre en éviden...,La turgescence jugulaire constitue un signe pé...,"Les œdèmes périphériques sont mous, bleus et d...",Les râles crépitants ou sous-crépitants sont s...
2,2,Quelle(s) est (sont) la (les) réponse(s) vraie...,Drainage bronchique quotidien,Corticothérapie systémique,Lobectomie pulmonaire,Cure d’antibiothérapie trimestrielle systématique,Traitement anti-inflammatoire par fluoroquinolone
3,3,Une patiente de 58 ans vient aux urgences de ...,Vous hospitalisez la patiente,Vous mettez en place un remplissage vasculaire,Vous prescrivez un agent vasoconstricteur par ...,Vous prescrivez une tomodensitométrie thoraciq...,Vous rassurez la patiente et lui expliquez qu’...
4,4,Un patient de 55 ans présentant un hyperlympho...,Aucun,Un tep-scanner,Une échocardiographie,Un myélogramme,Une scintigraphie osseuse
...,...,...,...,...,...,...,...
98,98,"Concernant la maladie d'alzheimer, quelle(s) a...",Le diagnostic reste souvent tardif,Il s’agit d’une démence secondaire,Les troubles du comportement se déclarent souv...,Le test des 5 mots de dubois permet de poser l...,Elle évolue par poussée conduisant à des hospi...
99,99,"Concernant le vieillissement des organes, quel...",Il est marqué par une diminution des capacités...,Il est dû essentiellement au stress oxydant,Il s'agit d'un concept sans preuve scientifique,Il est caractérisé par une réduction des capac...,Il permet de juger l’autonomie de l’individu
100,100,"Concernant le syndrome de fragilité, quelle(s)...",Il s’agit d’un concept sans définition précise,Il permet d'évaluer les réserves fonctionnelle...,Il est associé à un risque de dépendance,Il est associé au risque d’escarre pendant l’h...,Il est associé à une augmentation du risque d...
101,101,Vous prenez en charge aux urgences une patient...,Administration immédiate de concentré de compl...,Administration de vitamine k dans 4 heures,Scanner cérébral de contrôle systématique dans...,Arrêt de l’antivitamine k avec relais par hépa...,Arrêt de l’antivitamine k


In [40]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import time
# Assuming you have a function to split the DataFrame into batches
def batch_iter(df, batch_size=32):
    """Splits the DataFrame into batches of the specified size."""
    for i in range(0, len(df), batch_size):
        yield df.iloc[i:i + batch_size]

# Enable native 2x faster inference
FastLanguageModel.for_inference(model)

start_time = time.time()

# Iterate over each batch in the DataFrame
for batch_df in batch_iter(test_questions, batch_size=32):
    answers = []

    for index, row in batch_df.iterrows():
        # Prepare the input prompt
        input_text = alpaca_prompt.format(
            "Pick an answer for the following question and resturn only the name of the answer 'A', 'B', 'C', 'D', or 'E'. A question can have MULTIPLE answers.",
            f"""Question: {row["question"]}. A: {row["answer_A"]}, B: {row["answer_B"]}, C: {row["answer_C"]}, D: {row["answer_D"]}, E: {row["answer_E"]}""",
            "" # output - leave this blank for generation!
        )

        inputs = tokenizer([input_text], return_tensors="pt").to("cuda")
        outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
                # Decode the generated output
        decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
        # print("decoded output = ", decoded_output)
        # Extract the response part after "### Response:"
        response_text = decoded_output.split("### Response:")[-1].strip()
        # print("result = ",response_text)
        # answer_array = result.cpu().numpy().decode("utf-8").replace(' ', '').split(',')

        answer_array = response_text.replace(' ','').split(',')
        # answer_array.sort()
        answer = ','.join(answer_array)
        print(f"Question: {row['question']}")
        print(f"Response: {answer}")
        answers.append(answer)

    # Convert batch results to DataFrame
output_df = pd.DataFrame(answers, columns=["Answer"])
output_df.index.name = "id"

output_df.to_csv("./output_QNLI_Tuned_model.csv")
print(f"Time taken: {time.time()-start_time}")


Question: Devant un exanthème roséoliforme fébrile de l'enfant, les principales étiologies sont (une ou plusieurs réponsesexactes):
Response: B
Question: A propos de l’insuffisance cardiaque, quelle(s) est (sont) la (les) proposition(s) vraie(s) ?
Response: B
Question: Quelle(s) est (sont) la (les) réponse(s) vraie(s) concernant la prise en charge des bronchectasies diffuses chez l’adulte (endehors d’un contexte de mucoviscidose) responsables d’épisodes infectieux à répétition ?
Response: C
Question: Une patiente de 58 ans  vient aux urgences de votre hôpital pour l’expectoration de deux verres de sang rouge lors d’uneffort de toux. elle n’avait jamais craché de sang. elle est sous aspirine à visée anti-agrégante pour une coronaropathie.son examen clinique est normal ; sa pression artérielle est à 132 /79 mmhg. la fréquence cardiaque est à 80/mn. laradiographie pulmonaire ne montre pas d’anomalie.donnez la ou les réponse(s) juste(s)
Response: B
Question: Un patient de 55 ans présentant

In [41]:
output_df

Unnamed: 0,Answer
0,C
1,B
2,B
3,B
4,C
5,B
6,C
