In [1]:
import json
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Load the raw conversation log. The encoding is stated explicitly so the read
# does not depend on the platform/locale default (JSON interchange text is UTF-8).
with open('jacksparrow_modified.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Sequence of turns; each turn is indexed below by its 'from' (speaker tag)
# and 'value' (utterance text) keys.
conversations = data['conversations']

# Pair every 'human' turn with the 'gpt' turn that immediately follows it.
# Turns that are not part of such an adjacent human -> gpt pair are skipped.
human_inputs = []
gpt_outputs = []
for turn, next_turn in zip(conversations, conversations[1:]):
    if turn['from'] == 'human' and next_turn['from'] == 'gpt':
        human_inputs.append(turn['value'])
        gpt_outputs.append(next_turn['value'])

# One training string per prompt/response pair.
train_data = [f"Human: {h}\nGPT: {g}" for h, g in zip(human_inputs, gpt_outputs)]

# Fail here with a clear message rather than later inside the training loop
# when the file contains no usable human -> gpt exchanges.
if not train_data:
    raise ValueError(
        "No adjacent human/gpt turns found in 'jacksparrow_modified.json'; nothing to train on."
    )

# Single-column dataset; the 'text' column is what the tokenization step reads.
dataset = Dataset.from_dict({'text': train_data})

# Checkpoint identifier used as the starting point for fine-tuning.
model_name = "MaziyarPanahi/Phi-3-mini-4k-instruct-v0.3"

# Build the tokenizer first, then the causal-LM model, both from the same checkpoint.
tokenizer, model = (
    auto_cls.from_pretrained(model_name)
    for auto_cls in (AutoTokenizer, AutoModelForCausalLM)
)

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the 'text' column of a batch with the module-level tokenizer.

    Every example is truncated and padded to exactly 128 tokens, so all
    encoded rows have the same length.

    Args:
        examples: batch mapping that provides a 'text' entry.

    Returns:
        Whatever the tokenizer call returns for that batch of texts.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(examples['text'], **encode_options)

# Tokenize the whole dataset in batches and drop the raw 'text' column so only
# the tokenizer outputs remain.
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Hold out a slice for evaluation. Evaluating on the very examples the model is
# trained on only measures memorisation, so the per-epoch eval loss would be
# misleading. The fixed seed keeps the split reproducible across re-runs.
split_datasets = tokenized_datasets.train_test_split(test_size=0.1, seed=42)

# Define training arguments with adjusted learning rate
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # evaluate on the held-out split after every epoch
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=300,
    fp16=True,  # Enable mixed precision training if supported by GPU
    # Recompute activations during the backward pass instead of storing them,
    # trading extra compute for lower GPU memory use (the recorded run of this
    # cell ended in a CUDA out-of-memory error).
    # NOTE(review): full fine-tuning of a model this size may still not fit on
    # the available GPU; confirm on the target hardware.
    gradient_checkpointing=True,
    learning_rate=5e-5  # Adjust the learning rate here
)

# Data collator: mlm=False selects causal-LM batches rather than masked-LM ones.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Initialize the Trainer with disjoint train / eval subsets.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_datasets["train"],
    eval_dataset=split_datasets["test"],
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned weights and the tokenizer side by side so the directory
# can be loaded again as a single checkpoint.
model.save_pretrained("fine-tuned-phi-3-mini")
tokenizer.save_pretrained("fine-tuned-phi-3-mini")


  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Downloading shards: 100%|██████████| 2/2 [03:46<00:00, 113.09s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.07it/s]
Map: 100%|██████████| 316/316 [00:00<00:00, 11608.60 examples/s]
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
You are not running the flash-attention implementation, expect numerical differences.


OutOfMemoryError: CUDA out of memory. Tried to allocate 192.00 MiB. GPU 