### Installations

In [None]:
! pip install datasets
! pip install peft
! pip install transformers
! pip install transformers[sentencepiece]
! pip install trl

### Imports

In [None]:
import json
import pandas as pd
from datasets import Dataset
import os

### Model

In [None]:
# ========== LOAD MODEL AND TOKENIZER  ==========
from getpass import getpass

from transformers import AutoModelForCausalLM, AutoTokenizer, MistralForCausalLM

# Hugging Face API token. It is read from the HF_TOKEN environment variable so the
# secret is never written into the notebook; if the variable is unset (or empty) the
# user is prompted for it interactively without the input being echoed.
token = os.environ.get("HF_TOKEN") or getpass("Hugging Face API token: ")

# Define the model name
model_name = "mistralai/Mistral-7B-v0.3"

# Load tokenizer. No `device_map` is passed here: device placement applies to the
# model weights loaded further down, not to the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)

# Reuse the end-of-sequence token for padding and pad on the right-hand side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load model; `device_map='auto'` lets the library decide where the weights are placed.
model = AutoModelForCausalLM.from_pretrained(model_name, token=token, device_map='auto')

# =========== PEFT ===========
from peft import LoraConfig, TaskType, get_peft_model

# Build the LoRA configuration for causal-LM fine-tuning.
# Only the configuration object is created in this cell: `get_peft_model` is imported
# but not called here, and `peft_config` is handed to `SFTTrainer` in the fine-tuning cell.
print("Configuring PEFT...")
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # task type is CAUSAL_LM to match the model class above
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)
print("Getting PEFT model")


### Dataset

In [None]:
# ========== LOAD CUSTOM DATASET ==========
# Path of the JSON data file. TODO: point this at your own file.
# The formatting function in the fine-tuning cell reads the fields
# 'question', 'context' and 'answers', so the records must provide them.
data_path = "data.json"

# Load the JSON data file
with open(data_path, encoding="utf-8") as json_file:
    data = json.load(json_file)

# Convert JSON data to a pandas DataFrame
df = pd.DataFrame(data)

# Create a Hugging Face Dataset object
dataset = Dataset.from_pandas(df)

# Shuffle the entire dataset before splitting
dataset = dataset.shuffle(seed=42)

# Split the dataset into train and test sets.
# Both splits are seeded so that Restart & Run All reproduces the same partitions;
# without a seed each call draws a fresh random permutation.
dataset = dataset.train_test_split(test_size=0.2, seed=42)
# 0.2 of the remaining training data becomes validation
# (overall: 64% train / 16% validation / 20% test).
validation_train_split = dataset['train'].train_test_split(test_size=0.2, seed=42)
dataset["validation"] = validation_train_split["test"]
dataset["train"] = validation_train_split["train"]

print("Length of training dataset:", len(dataset["train"]))
print("Length of validation dataset:", len(dataset["validation"]))
print("Length of test dataset:", len(dataset["test"]))
print("Finished loading dataset")

### Fine-tuning

In [None]:
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM
import torch

def formatting_prompts_func(example):
    """Render a batch of examples into prompt strings for supervised fine-tuning.

    Args:
        example: Mapping with the keys 'question', 'context' and 'answers', each
            holding a sequence; the sequences are walked in parallel and the
            result is as long as the shortest of them.

    Returns:
        A list with one formatted string per (question, context, answer) triple.
    """
    rows = zip(example['question'], example['context'], example['answers'])
    # NOTE(review): the spaces following each "\n" are part of the emitted text;
    # confirm they are intended, since "### Answer:" is also used as the response
    # template for the completion-only collator.
    return [
        f"### Question: {q}\n ### Context: {ctx}\n  ### Answer: {ans}"
        for q, ctx, ans in rows
    ]

# Hub repository that receives both the fine-tuned model and the tokenizer.
hub_repo_id = "EllaScheltinga/SFT-Mistral-7B"

# Marker that precedes the answer in every formatted prompt; the completion-only
# collator is keyed on it (see the TRL documentation for the exact masking behaviour).
response_template = "### Answer:"
collator = DataCollatorForCompletionOnlyLM(
    response_template,
    tokenizer=tokenizer,
    mlm=False,
)

# Training hyper-parameters and Hub upload settings.
sft_config = SFTConfig(
    output_dir="SFT_Mistral_7B",
    learning_rate=2e-5,
    per_device_train_batch_size=5,
    per_device_eval_batch_size=5,
    max_seq_length=1300,
    eval_strategy="epoch",  # evaluate on the validation split once per epoch
    logging_steps=100,
    push_to_hub=True,
    hub_model_id=hub_repo_id,
    hub_token=token,
)

# The LoRA configuration is passed to the trainer here rather than being applied to the model beforehand.
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    formatting_func=formatting_prompts_func,
    data_collator=collator,
    peft_config=peft_config,
)

# Ensure GPU memory management settings
# NOTE(review): the model was already loaded with device_map='auto' in an earlier cell,
# so CUDA may be initialised by this point -- confirm the allocator still honours a
# value set this late, or move the assignment ahead of the model load.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
torch.cuda.empty_cache()

trainer.train()

# Save the model and tokenizer to the Hugging Face Hub
trainer.push_to_hub()
tokenizer.push_to_hub(hub_repo_id, token=token)