<a href="https://colab.research.google.com/github/BriceMichalski/BriceMichalski/blob/main/I2MO_AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup Env

In [6]:
# UNZIP DATASET
import zipfile
import os

zip_file_path = "dataset.zip"
output_directory = "."

if not os.path.exists(output_directory):
    os.makedirs(output_directory)

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
  zip_ref.extractall(output_directory)

In [1]:
!pip install datasets transformers trl accelerate
!pip uninstall -y tensorflow

Found existing installation: tensorflow 2.15.0
Uninstalling tensorflow-2.15.0:
  Successfully uninstalled tensorflow-2.15.0


In [5]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM,BitsAndBytesConfig,TrainingArguments, Trainer
from trl import SFTTrainer,SFTConfig
from accelerate import Accelerator

import os

In [3]:
# SETTINGS
HF_TOKEN = "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"
BASE_MODEL_ID = "mistralai/Mistral-7B-v0.1"
DATASET_PATH = "./dataset"
DATASET_SIZE = 2000
OUTPUT_DIR = "./build"
FINAL_MODEL_ID = f"{BASE_MODEL_ID.split('/')[1]}-i2mo-{DATASET_SIZE/1000}k"

accelerator = Accelerator()

## Load Dataset

In [6]:


def load_articles_dataset(directory):
    dataset = []
    for filename in os.listdir(directory):
        if filename.endswith(".md"):
            with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
                content = file.read()
                dataset.append({"text": content})
    return dataset

articles = load_articles_dataset(DATASET_PATH)
dataset = Dataset.from_list(articles)

print(dataset)


Dataset({
    features: ['text'],
    num_rows: 4100
})


## Dataset Tokenization

In [7]:
#
# 2. Dataset Tokenization
#
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, token=HF_TOKEN)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
print(f"Pad token : {tokenizer.pad_token}")

def tokenize_function(examples):
    return tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=1024
    )

tokenized_datasets = dataset.map(tokenize_function, batched=True)
print(tokenized_datasets)

Pad token : </s>


Map:   0%|          | 0/4100 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 4100
})


## Load Model

In [8]:
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    token=HF_TOKEN,
    device_map="auto"
)
model.resize_token_embeddings(len(tokenizer))

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Embedding(32000, 4096)

## Prepare trainer

In [None]:
sft_config = SFTConfig(
    output_dir=f"{OUTPUT_DIR}/{FINAL_MODEL_ID}",
    num_train_epochs=1, # replace this, depending on your dataset
    per_device_train_batch_size=1,
    learning_rate=1e-5,
    optim="adamw_torch",
)
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    dataset_text_field='text',
    tokenizer=tokenizer,
    args=sft_config,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/4100 [00:00<?, ? examples/s]

## Train Model

In [None]:
trainer.train()