<a href="https://colab.research.google.com/github/BriceMichalski/BriceMichalski/blob/main/I2MO_AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Set Up the Environment

In [None]:
!pip uninstall -y tensorflow
!pip install -q datasets accelerate evaluate trl accelerate bitsandbytes peft

In [2]:
# UNZIP DATASET
import zipfile
import os

# Archive holding the training articles and the directory it is unpacked into.
# NOTE(review): DATASET_PATH is "./dataset", so the archive presumably contains
# a top-level "dataset/" folder — confirm.
zip_file_path = "dataset.zip"
output_directory = "."

# exist_ok=True creates the directory only when it is missing, without a
# separate (race-prone) existence check.
os.makedirs(output_directory, exist_ok=True)

# Extract every member of the archive under output_directory.
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(output_directory)

In [3]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM,BitsAndBytesConfig,TrainingArguments, Trainer
from accelerate import Accelerator
from peft import LoraConfig, get_peft_model,prepare_model_for_kbit_training
from trl import SFTTrainer,SFTConfig
import torch
import os
import shutil

In [4]:
# SETTINGS
# Maximum number of Markdown articles loaded into the training dataset.
DATASET_SIZE = 10
# Hugging Face Hub ID of the base model that gets fine-tuned.
BASE_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"

# Root directory for trainer outputs (used as the TrainingArguments output_dir prefix).
OUTPUT_DIR = "./workdir"
# Root directory where the merged model and tokenizer are saved and zipped.
FINAL_MODEL_DIR = "./build"
# Directory scanned for the ".md" training articles.
DATASET_PATH = "./dataset"
# Never hardcode an access token in a notebook: read it from the environment.
# Any token that was previously committed here must be revoked on the Hub.
# NOTE(review): when HF_TOKEN is unset this is None; the Hub client is expected
# to fall back to locally stored credentials — confirm for your setup.
HF_TOKEN = os.environ.get("HF_TOKEN")
# Name of the fine-tuned model: "<base model name>-i2mo-<dataset size>".
# [-1] keeps working when the model ID has no "<org>/" prefix.
FINAL_MODEL_ID = f"{BASE_MODEL_ID.split('/')[-1]}-i2mo-{DATASET_SIZE}"

# Instantiate Accelerate's Accelerator with default settings.
# NOTE(review): no other cell visible in this notebook references
# `accelerator` — confirm whether it is still needed.
accelerator = Accelerator()

## Load Dataset

In [None]:
def load_articles_dataset(directory, limit=None):
    """Read Markdown articles from a directory into a list of records.

    Args:
        directory: Path of the folder to scan (not recursive).
        limit: Maximum number of articles to load. When None, the
            module-level DATASET_SIZE setting is used.

    Returns:
        A list of ``{"text": <file content>}`` dicts, one per ".md" file,
        in alphabetical filename order and capped at ``limit`` entries.
    """
    if limit is None:
        limit = DATASET_SIZE

    records = []
    # os.listdir returns names in arbitrary order; sorting makes the selected
    # subset reproducible from one run (or machine) to the next.
    for filename in sorted(os.listdir(directory)):
        # Stop scanning as soon as enough articles have been collected.
        if len(records) >= limit:
            break
        if not filename.endswith(".md"):
            continue
        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
            records.append({"text": file.read()})
    return records

# Load the raw article records from DATASET_PATH.
articles = load_articles_dataset(DATASET_PATH)
# Fail fast with a clear message instead of training on an empty dataset.
if not articles:
    raise ValueError(f"No .md articles were loaded from {DATASET_PATH!r}")
# Wrap the records in a datasets.Dataset with a single "text" column.
dataset = Dataset.from_list(articles)

print(dataset)


## Dataset Tokenization

In [None]:
#
# Dataset tokenization: load the tokenizer published with the base model
#
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, token=HF_TOKEN)

# tokenize_function pads every example to a fixed length, so a pad token must
# exist; when the tokenizer defines none, the EOS token is reused.
# NOTE(review): reusing EOS as padding may cause EOS positions to be masked
# during training depending on the data collator — confirm.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
print(f"Pad token : {tokenizer.pad_token}")

def tokenize_function(examples):
    """Tokenize the 'text' field of a batch with the notebook's tokenizer.

    Each text is truncated and padded to exactly 1024 tokens. Returns
    whatever the tokenizer call returns for the batch.
    """
    texts = examples['text']
    encode_options = dict(
        max_length=1024,
        truncation=True,
        padding='max_length',
    )
    return tokenizer(texts, **encode_options)

# Run tokenize_function over the dataset in batches; the mapped dataset is the
# one later handed to the trainer as train_dataset.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
print(tokenized_datasets)

## Load Model

In [None]:
# bitsandbytes quantization settings: 4-bit NF4 weights with double
# quantization, computing in bfloat16.
# NOTE(review): bfloat16 needs GPU support — confirm the target runtime has it.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
# Load the base causal LM from the Hub with the quantization config applied.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    token=HF_TOKEN,
    quantization_config=bnb_config,
)
# Enable gradient checkpointing, then run PEFT's k-bit training preparation.
# NOTE(review): prepare_model_for_kbit_training may already enable gradient
# checkpointing by default, making the explicit call redundant — confirm
# against the installed PEFT version.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

## Set up PEFT (Parameter-Efficient Fine-Tuning)

In [9]:
# Module names that receive LoRA adapters, grouped by role.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

# LoRA hyper-parameters: rank 32, alpha 64, 5% dropout, no bias training.
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    target_modules=[*attention_projections, *mlp_projections, "lm_head"],
)

# Wrap the prepared base model with the adapters described by `config`.
peft_model = get_peft_model(model, config)

## Prepare trainer

In [None]:
# Training hyper-parameters. Evaluation is left at the TrainingArguments
# default ("no"): the explicit `evaluation_strategy="no"` keyword was redundant
# and is a deprecated spelling in recent Transformers releases.
training_args = TrainingArguments(
    output_dir=f"{OUTPUT_DIR}/{FINAL_MODEL_ID}",
    per_device_train_batch_size=4,  # largest batch size that fits on the GPU
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    save_steps=500,
    bf16=True,  # bfloat16 mixed precision (not FP16) to speed up GPU training
)

# Build the TRL supervised fine-tuning trainer from the LoRA-wrapped model,
# the tokenized dataset and the training arguments.
# NOTE(review): newer TRL releases may expect `processing_class` instead of
# `tokenizer` — confirm against the installed version.
trainer = SFTTrainer(
    args=training_args,
    model=peft_model,
    train_dataset=tokenized_datasets,
    tokenizer=tokenizer,
)

## Train Model

In [None]:
# Run fine-tuning with the settings held in `training_args`.
trainer.train()

## Merge the adapter and model back together

In [None]:
# Take the trained PEFT model from the trainer and merge the adapter back into
# the base model, yielding a model without the adapter wrapper.
# NOTE(review): the base model was loaded in 4-bit; confirm that merging into
# quantized weights gives the precision you expect for the exported model.
adapter_model = trainer.model
merged_model = adapter_model.merge_and_unload()

# Tokenizer instance held by the trainer; it is saved alongside the model.
# NOTE(review): `trainer.tokenizer` may be deprecated in newer library
# versions — confirm.
trained_tokenizer = trainer.tokenizer

In [13]:
# Every exported artifact goes to the same destination directory.
final_model_path = f"{FINAL_MODEL_DIR}/{FINAL_MODEL_ID}"

# Save order: tokenizer, then merged model, then the model config.
for artifact in (trained_tokenizer, merged_model, merged_model.config):
    artifact.save_pretrained(final_model_path)

## Zip model

In [None]:
# Zip the contents of the exported model directory; the archive is created
# beside it as "<FINAL_MODEL_DIR>/<FINAL_MODEL_ID>.zip".
shutil.make_archive(f"{FINAL_MODEL_DIR}/{FINAL_MODEL_ID}", 'zip', f"{FINAL_MODEL_DIR}/{FINAL_MODEL_ID}")

## Clean Up GPU Memory

In [17]:
# del model
# Drop the trainer reference, then ask PyTorch to release its cached CUDA memory.
# NOTE(review): `model`, `peft_model`, `adapter_model` and `merged_model` are
# still referenced at this point, so their GPU memory presumably stays
# allocated — confirm whether they should be deleted too.
# This cell is not re-runnable: a second run fails because `trainer` is gone.
del trainer
torch.cuda.empty_cache()