<a href="https://colab.research.google.com/github/BriceMichalski/BriceMichalski/blob/main/I2MO_AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup Env

In [1]:
!pip uninstall -y tensorflow
!pip install -q datasets accelerate evaluate trl accelerate bitsandbytes peft

Found existing installation: tensorflow 2.17.0
Uninstalling tensorflow-2.17.0:
  Successfully uninstalled tensorflow-2.17.0
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.4/318.4 kB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m322.5/322.5 kB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m50.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# UNZIP DATASET
import zipfile
import os

zip_file_path = "dataset.zip"
output_directory = "."

if not os.path.exists(output_directory):
    os.makedirs(output_directory)

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
  zip_ref.extractall(output_directory)

In [3]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM,BitsAndBytesConfig,TrainingArguments, Trainer
from accelerate import Accelerator
from peft import LoraConfig, get_peft_model,prepare_model_for_kbit_training
from trl import SFTTrainer,SFTConfig
import torch
import os
import shutil

In [4]:
# SETTINGS
DATASET_SIZE = 100
BASE_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"

OUTPUT_DIR = "./workdir"
FINAL_MODEL_DIR = "./build"
DATASET_PATH = "./dataset"
HF_TOKEN = "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"
FINAL_MODEL_ID = f"{BASE_MODEL_ID.split('/')[1]}-i2mo-{DATASET_SIZE}"

accelerator = Accelerator()

## Load Dataset

In [5]:
def load_articles_dataset(directory):
    dataset = []
    for filename in os.listdir(directory):
        if filename.endswith(".md") and len(dataset) < DATASET_SIZE:
            with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
                content = file.read()
                dataset.append({"text": content})
    return dataset

articles = load_articles_dataset(DATASET_PATH)
dataset = Dataset.from_list(articles)

print(dataset)


Dataset({
    features: ['text'],
    num_rows: 100
})


## Dataset Tokenization

In [7]:
#
# 2. Dataset Tokenization
#
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, token=HF_TOKEN)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
print(f"Pad token : {tokenizer.pad_token}")

def tokenize_function(examples):
    return tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=1024
    )

tokenized_datasets = dataset.map(tokenize_function, batched=True)
print(tokenized_datasets)

Pad token : </s>


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 100
})


## Load Model

In [8]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    token=HF_TOKEN,
    quantization_config=bnb_config,
)
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

## Set up PEFT (Parameter-Efficient Fine-Tuning)

In [9]:
config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)

## Prepare trainer

In [10]:
training_args = TrainingArguments(
    output_dir=f"{OUTPUT_DIR}/{FINAL_MODEL_ID}",
    evaluation_strategy="no",
    per_device_train_batch_size=4,  # Plus grand batch size possible avec le GPU
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    save_steps=500,
    bf16=True,  # Utilisation du calcul en FP16 pour accélérer l'entraînement sur le GPU
)

trainer = SFTTrainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_datasets,
    tokenizer=tokenizer,
)



## Train Model

In [11]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss




TrainOutput(global_step=25, training_loss=1.584232940673828, metrics={'train_runtime': 252.6318, 'train_samples_per_second': 0.396, 'train_steps_per_second': 0.099, 'total_flos': 4422986681548800.0, 'train_loss': 1.584232940673828, 'epoch': 1.0})

## Merge the adapter and model back together

In [12]:
adapter_model = trainer.model
merged_model = adapter_model.merge_and_unload()

trained_tokenizer = trainer.tokenizer



In [13]:
trained_tokenizer.save_pretrained(f"{FINAL_MODEL_DIR}/{FINAL_MODEL_ID}")
merged_model.save_pretrained(f"{FINAL_MODEL_DIR}/{FINAL_MODEL_ID}")
merged_model.config.save_pretrained(f"{FINAL_MODEL_DIR}/{FINAL_MODEL_ID}")

## Zip model

In [15]:
shutil.make_archive(f"{FINAL_MODEL_DIR}/{FINAL_MODEL_ID}", 'zip', f"{FINAL_MODEL_DIR}/{FINAL_MODEL_ID}")

'/content/build/Mistral-7B-Instruct-v0.3-i2mo-100.zip'

## Clear

In [17]:
# del model
del trainer
torch.cuda.empty_cache()