In [13]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
import json
import torch
from peft import LoraConfig, get_peft_model
from torch.utils.data import DataLoader

In [2]:
# Load the MiniCPM model and tokenizer
model = AutoModelForCausalLM.from_pretrained("openbmb/MiniCPM-Llama3-V-2_5-int4", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("openbmb/MiniCPM-Llama3-V-2_5-int4", trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.48s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
with open("jacksparrow_modified.json") as f:
    data = json.load(f)

# Flatten the conversations into a list of strings with role context
def preprocess_conversations(conversations):
    dialogue = []
    current_speaker = None
    current_text = ""
    
    for message in conversations:
        speaker = "Me" if message["from"] == "human" else "Jack Sparrow"
        if speaker != current_speaker:
            if current_text:
                dialogue.append(current_text)
            current_text = f"{speaker}: {message['value']}\n"
            current_speaker = speaker
        else:
            current_text += f"{speaker}: {message['value']}\n"
    
    if current_text:
        dialogue.append(current_text)
    
    return dialogue

texts = preprocess_conversations(data["conversations"])

# Convert the list of strings to a Dataset
dataset = Dataset.from_dict({"text": texts})

# Tokenize the dataset
def tokenize_function(examples):
    tokenized_outputs = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
    # Ensure no empty tokens
    for i, tokens in enumerate(tokenized_outputs['input_ids']):
        if all(token == tokenizer.pad_token_id for token in tokens):
            print(f"Warning: Empty token sequence at index {i}")
    return tokenized_outputs

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Check dataset length
print(f"Dataset length: {len(tokenized_dataset)}")

# Split the dataset into training and validation sets (if needed)
split_datasets = tokenized_dataset.train_test_split(test_size=0.1)

# Ensure split datasets are non-empty
print(f"Training set length: {len(split_datasets['train'])}")
print(f"Validation set length: {len(split_datasets['test'])}")

# Print a sample of tokenized data
print(f"Sample tokenized data: {tokenized_dataset[0]}")

Map: 100%|██████████| 633/633 [00:00<00:00, 5569.00 examples/s]

Dataset length: 633
Training set length: 569
Validation set length: 64
Sample tokenized data: {'text': "Me: The top of a billowing sail passes regally in front of them. On the landward face of the sail, apparently high in the rigging, is a man for whom the term 'swashbuckling rogue' was coined: Captain JACK SPARROW. He gazes keen-eyed at the display as they pass. Raises a tankard in salute. Suddenly, something below catches his attention. He jumps from the rigging - -- and that's when we see that his is ship is not an imposing three-master, but just a small fishing dory with a single sail, plowing through the water -- the Jolly Mon. And it leaks. Which is why he has the tankard: to bail. Jack steps back to the tiller, and using a single sheet to control the sail, and the Jolly Mon comes around the promontory, the whole of Port Royal laid out before him. The huge British dreadnought, H.M.S. Dauntless dominates the bay. But Jack's attention is on a different ship: the H.M.S. Interceptor,




In [18]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir="./minicpm_finetuned",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=1,  # Smaller batch size
    gradient_accumulation_steps=8,  # Accumulate gradients
    fp16=True,  # Mixed precision training
    save_steps=10_000,
    save_total_limit=2,
)

# Configure LoRA
peft_config = LoraConfig(
    r=8,  # Rank
    lora_alpha=16,  # Scaling factor
    target_modules=["q_proj", "v_proj"],  # Target modules to apply LoRA
    lora_dropout=0.1,  # Dropout for LoRA
    bias="none",  # Bias setting
    task_type="CAUSAL_LM"  # Task type
)

# Apply LoRA to the model
model = get_peft_model(model, peft_config)

# Clear CUDA cache
torch.cuda.empty_cache()

# Trainer to train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_datasets["train"],
    eval_dataset=split_datasets["test"],
)

# Fine-tune the MiniCPM model
trainer.train()

# Save the model weights after training
model.save_pretrained("./minicpm_finetuned")

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


TypeError: MiniCPMV.forward() missing 1 required positional argument: 'data'

In [5]:
# Split the dataset into training and validation sets (if needed)
split_datasets = tokenized_dataset.train_test_split(test_size=0.1)

# Ensure split datasets are non-empty
print(f"Training set length: {len(split_datasets['train'])}")
print(f"Validation set length: {len(split_datasets['test'])}")

Training set length: 569
Validation set length: 64


In [6]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir="./minicpm_finetuned",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=1,  # Smaller batch size
    gradient_accumulation_steps=8,  # Accumulate gradients
    fp16=True,  # Mixed precision training
    save_steps=10_000,
    save_total_limit=2,
)

In [7]:
# Configure LoRA
peft_config = LoraConfig(
    r=8,  # Rank
    lora_alpha=16,  # Scaling factor
    target_modules=["q_proj", "v_proj"],  # Target modules to apply LoRA
    lora_dropout=0.1,  # Dropout for LoRA
    bias="none",  # Bias setting
    task_type="CAUSAL_LM"  # Task type
)

# Apply LoRA to the model
model = get_peft_model(model, peft_config)

In [8]:
# Trainer to train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_datasets["train"],
    eval_dataset=split_datasets["test"],
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [9]:
# Fine-tune the MiniCPM model
trainer.train()

# Save the model weights after training
model.save_pretrained("./minicpm_finetuned")

  return table.fast_gather(key % table.num_rows)


IndexError: list index out of range

In [4]:
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import Dataset
from peft import LoraConfig, get_peft_model

# Step 1: Load and Parse JSON Data
def load_conversations(filename):
    with open(filename, 'r') as file:
        data = json.load(file)
    conversations = data['conversations']
    return conversations

def prepare_data(conversations):
    inputs = []
    outputs = []
    current_input = ""
    for entry in conversations:
        if entry['from'] == 'human':
            current_input += entry['value'] + " "
        elif entry['from'] == 'gpt':
            inputs.append(current_input.strip())
            outputs.append(entry['value'])
            current_input = ""
    return inputs, outputs

# Step 2: Fine-Tuning the Model with LoRA
def fine_tune_model_with_lora(inputs, outputs, model_name, output_dir, epochs=3):
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)

    # Configure LoRA
    lora_config = LoraConfig(
        r=8,  # Rank of the low-rank adaptation matrices
        lora_alpha=16,  # Scaling factor
        lora_dropout=0.1,  # Dropout probability
        target_modules=['q_proj', 'v_proj']  # Modules to apply LoRA to
    )
    
    # Apply LoRA to the model
    model = get_peft_model(model, lora_config)

    # Tokenize the dataset
    def tokenize_function(examples):
        inputs = tokenizer(examples['input_text'], padding='max_length', truncation=True, max_length=512)
        outputs = tokenizer(examples['output_text'], padding='max_length', truncation=True, max_length=512)
        labels = outputs['input_ids']
        inputs['labels'] = labels
        return inputs

    # Prepare dataset
    dataset = Dataset.from_dict({'input_text': inputs, 'output_text': outputs})
    print(f"Dataset size: {len(dataset)}")  # Debugging: Check dataset size

    # Check individual tokenization
    for i in range(5):
        print(f"Original input: {inputs[i]}")
        print(f"Tokenized input: {tokenizer(inputs[i], padding='max_length', truncation=True, max_length=512)['input_ids']}")
        print(f"Original output: {outputs[i]}")
        print(f"Tokenized output: {tokenizer(outputs[i], padding='max_length', truncation=True, max_length=512)['input_ids']}")

    tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["input_text", "output_text"])
    print(f"Tokenized dataset size: {len(tokenized_datasets)}")  # Debugging: Check tokenized dataset size
    print(f"Sample tokenized input: {tokenized_datasets[0]['input_ids'][:10]}")  # Debugging: Print sample tokenized input
    print(f"Sample tokenized output: {tokenized_datasets[0]['labels'][:10]}")  # Debugging: Print sample tokenized output

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        save_total_limit=2,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets,
    )

    trainer.train()
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

# Load and parse the data
conversations = load_conversations('jacksparrow_modified.json')
inputs, outputs = prepare_data(conversations)

# Check the inputs and outputs
print(f"Inputs: {inputs[:5]}")  # Debugging: Print first 5 inputs
print(f"Outputs: {outputs[:5]}")  # Debugging: Print first 5 outputs

# Fine-tune the model
fine_tune_model_with_lora(inputs, outputs, 'openbmb/MiniCPM-Llama3-V-2_5-int4', './fine_tuned_model_lora')


Inputs: ["The top of a billowing sail passes regally in front of them. On the landward face of the sail, apparently high in the rigging, is a man for whom the term 'swashbuckling rogue' was coined: Captain JACK SPARROW. He gazes keen-eyed at the display as they pass. Raises a tankard in salute. Suddenly, something below catches his attention. He jumps from the rigging - -- and that's when we see that his is ship is not an imposing three-master, but just a small fishing dory with a single sail, plowing through the water -- the Jolly Mon. And it leaks. Which is why he has the tankard: to bail. Jack steps back to the tiller, and using a single sheet to control the sail, and the Jolly Mon comes around the promontory, the whole of Port Royal laid out before him. The huge British dreadnought, H.M.S. Dauntless dominates the bay. But Jack's attention is on a different ship: the H.M.S. Interceptor, a small sleek vessel with rail guns and a mortar in the middle of the main deck. It is tied up at

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.60s/it]


Dataset size: 369
Original input: The top of a billowing sail passes regally in front of them. On the landward face of the sail, apparently high in the rigging, is a man for whom the term 'swashbuckling rogue' was coined: Captain JACK SPARROW. He gazes keen-eyed at the display as they pass. Raises a tankard in salute. Suddenly, something below catches his attention. He jumps from the rigging - -- and that's when we see that his is ship is not an imposing three-master, but just a small fishing dory with a single sail, plowing through the water -- the Jolly Mon. And it leaks. Which is why he has the tankard: to bail. Jack steps back to the tiller, and using a single sheet to control the sail, and the Jolly Mon comes around the promontory, the whole of Port Royal laid out before him. The huge British dreadnought, H.M.S. Dauntless dominates the bay. But Jack's attention is on a different ship: the H.M.S. Interceptor, a small sleek vessel with rail guns and a mortar in the middle of the mai

Map: 100%|██████████| 369/369 [00:00<00:00, 3386.60 examples/s]
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Tokenized dataset size: 369
Sample tokenized input: [791, 1948, 315, 264, 4121, 24060, 30503, 16609, 1239, 750]
Sample tokenized output: [2181, 596, 23649, 1268, 3629, 1884, 1403, 25022, 72359, 13]


IndexError: Invalid key: 307 is out of bounds for size 0