In [1]:
! pip install datasets
! pip install peft
! pip install transformers
! pip install transformers[sentencepiece]
! pip install trl

Defaulting to user installation because normal site-packages is not writeable
Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Downloading tqdm-4.66.5-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess

In [2]:
import json
import pandas as pd
from datasets import Dataset
import os

In [3]:
# ========== LOAD MODEL AND TOKENIZER  ==========
from transformers import AutoModelForCausalLM, AutoTokenizer, MistralForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM
# from accelerate import Accelerator

# Read the Hugging Face access token from the environment instead of pasting it
# into the notebook. When HF_TOKEN is unset this is None, in which case the Hub
# client falls back to credentials cached by `huggingface-cli login`.
token = os.environ.get("HF_TOKEN")

# Define the model name
model_name = "mistralai/Mistral-7B-v0.3"

# Load tokenizer (a tokenizer holds no weights, so no device_map is passed)
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)

# Use the EOS token for padding and pad on the right-hand side
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load model; device_map='auto' leaves weight placement to the library
model = AutoModelForCausalLM.from_pretrained(model_name, token=token, device_map='auto')

# =========== PEFT ===========
from peft import LoraConfig, TaskType, get_peft_model

# Build the LoRA adapter configuration for causal-language-model fine-tuning
print("Configuring PEFT...")
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)
print("Getting PEFT model")
# The model is not wrapped here; peft_config is passed to SFTTrainer later in the notebook.
# model = get_peft_model(model, peft_config)


tokenizer_config.json:   0%|          | 0.00/137k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Configuring PEFT...
Getting PEFT model


In [4]:
# ========== LOAD CUSTOM DATASET ==========
# Load the JSON data file
with open('fine_tuning_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Convert JSON data to a pandas DataFrame
df = pd.DataFrame(data)

# Create a Hugging Face Dataset object
dataset = Dataset.from_pandas(df)

# One seed for every random step, so train/validation/test membership is the
# same on every Restart & Run All.
SPLIT_SEED = 42

# Shuffle the entire dataset before splitting
dataset = dataset.shuffle(seed=SPLIT_SEED)

# Hold out 20% as the test set, then 20% of the remaining rows as validation.
# train_test_split shuffles again by default, so each call gets the seed too;
# without it the splits differ between runs despite the seeded shuffle above.
dataset = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
validation_train_split = dataset['train'].train_test_split(test_size=0.2, seed=SPLIT_SEED)
dataset["validation"] = validation_train_split["test"]
dataset["train"] = validation_train_split["train"]

print("Length of training dataset:", len(dataset["train"]))
print("Length of validation dataset:", len(dataset["validation"]))
print("Length of test dataset:", len(dataset["test"]))
print("Finished loading dataset")

# # Function to flatten the dataset structure
# def flatten_context(context):
#     flattened = []
#     for item in context:
#         if isinstance(item, list):
#             flattened.extend(item)
#         else:
#             flattened.append(item)
#     return " ".join(map(str, flattened))

# # Preprocess function to tokenize the dataset
# def preprocess_function(examples):
#     # Flatten the context
#     inputs = [flatten_context(context) for context in examples["context"]]
#     # Tokenize the inputs
#     model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
#     print("Tokenized inputs:", model_inputs[0])  # Debug print to check tokenized inputs
#     return model_inputs

# print("Create tokenized datasets")
# tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=["context", "question", "answers"])

# print("Length of tokenized training dataset:", len(tokenized_datasets["train"]))
# print("Length of tokenized validation dataset:", len(tokenized_datasets["validation"]))
# print("Length of tokenized test dataset:", len(tokenized_datasets["test"]))

# # print(tokenized_datasets['train'][1])

Length of training dataset: 576
Length of validation dataset: 144
Length of test dataset: 180
Finished loading dataset


In [8]:
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM
import torch


# def formatting_prompts_func(example):
#     output_texts = []
#     for i in range(len(example)):
#         text = f"### Question: {example[i]['question']}\n ### Context: {example[i]['context']}\n  ### Answer: {example[i]['answers']}"
#         output_texts.append(text)
#     return output_texts

# def formatting_prompts_func(example):
#     output_texts = []
#     text = f"### Question: {example['question']}\n ### Context: {example['context']}\n  ### Answer: {example['answers']}"
#     output_texts.append(text)
#     return output_texts

# SECURITY: an access token was previously hard-coded on this line. Any token that
# has been saved in a notebook must be treated as leaked and revoked on the Hub.
# Read it from the environment instead; None falls back to cached login credentials.
token = os.environ.get("HF_TOKEN")

def formatting_prompts_func(example):
    """Render a column-oriented batch as one training string per row.

    Args:
        example: Mapping with parallel sequences under the keys 'question',
            'context' and 'answers'.

    Returns:
        A list of strings, one per row, each laid out as
        "### Question: ...", "### Context: ..." and "### Answer: ..." sections.
        Rows are paired positionally; pairing stops at the shortest column.
    """
    rows = zip(example['question'], example['context'], example['answers'])
    return [
        f"### Question: {q}\n ### Context: {c}\n  ### Answer: {a}"
        for q, c, a in rows
    ]

# print(formatting_prompts_func(dataset["train"]))

# Marker string that introduces the answer section in each formatted example;
# it is handed to the completion-only collator together with the tokenizer.
response_template = "### Answer:"
collator = DataCollatorForCompletionOnlyLM(
    response_template,
    tokenizer=tokenizer,
    mlm=False,
)

sft_config = SFTConfig(
    # Where checkpoints go locally and on the Hub
    output_dir="SFT_Mistral_7B",
    push_to_hub=True,
    hub_model_id="EllaScheltinga/SFT-Mistral-7B",
    hub_token=token,
    # Optimisation settings
    learning_rate=2e-5,
    per_device_train_batch_size=5,
    per_device_eval_batch_size=5,
    max_seq_length=1300,
    # Evaluation and logging cadence
    eval_strategy="epoch",
    logging_steps=100,
)

# Wire model, data splits, prompt formatter, collator and LoRA config together
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    tokenizer=tokenizer,
    formatting_func=formatting_prompts_func,
    data_collator=collator,
    peft_config=peft_config,
)

# Ensure GPU memory management settings
# NOTE(review): this variable is set after the model was already loaded in an
# earlier cell; if the CUDA allocator has been initialised by then, the setting
# may not take effect. Consider moving it ahead of the first CUDA use. TODO confirm.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
# Release cached, currently unused GPU memory before training starts
torch.cuda.empty_cache()

# Run fine-tuning with the trainer configured above
trainer.train()

# Save the model and tokenizer to the Hugging Face Hub
trainer.push_to_hub()
# The repo id below repeats hub_model_id from sft_config; keep the two in sync
tokenizer.push_to_hub("EllaScheltinga/SFT-Mistral-7B", token=token)

Map:   0%|          | 0/576 [00:00<?, ? examples/s]

Map:   0%|          | 0/144 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss
1,0.6775,0.582097
2,0.5916,0.556711
3,0.5204,0.55078



Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-v0.3/resolve/main/config.json.
Access to model mistralai/Mistral-7B-v0.3 is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in mistralai/Mistral-7B-v0.3.

Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-v0.3/resolve/main/config.json.
Access to model mistralai/Mistral-7B-v0.3 is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in mistralai/Mistral-7B-v0.3.


events.out.tfevents.1723197818.192-222-52-200.4076.2:   0%|          | 0.00/7.01k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/EllaScheltinga/SFT-Mistral-7B/commit/c3c7c53d1ce436d57be42d5f826b39a9bc86d7d8', commit_message='Upload tokenizer', commit_description='', oid='c3c7c53d1ce436d57be42d5f826b39a9bc86d7d8', pr_url=None, pr_revision=None, pr_num=None)