In [1]:
import torch
import os
import random
import json
# Report which CUDA device this kernel will use. The device queries raise on a
# machine without a usable GPU, so they only run when CUDA is available.
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    device_index = torch.cuda.current_device()
    print(device_index)
    print(torch.cuda.get_device_name(device_index))
    # Release unused cached GPU memory left over from earlier runs in this kernel.
    torch.cuda.empty_cache()

True
0
NVIDIA GeForce RTX 3070


In [7]:
def generate_nback_sequences(n, alphabet, seq_length, matches, num_sequences):
    """Generate random letter sequences with labelled n-back matches.

    Args:
        n: Lag of the task (>= 1). Position ``i`` is a match when its letter
            repeats the letter at position ``i - n``.
        alphabet: Letters to draw from. It needs at least two distinct letters
            whenever a non-match position at or after index ``n`` exists,
            otherwise the rejection loop below cannot terminate.
        seq_length: Number of letters in each sequence.
        matches: Number of match positions per sequence, sampled without
            replacement from ``range(n, seq_length)``.
        num_sequences: How many sequences to generate.

    Returns:
        A list of ``(letters, conditions)`` tuples. Both are strings of length
        ``seq_length``; ``conditions[i]`` is ``'m'`` for a match and ``'-'``
        otherwise.

    Raises:
        ValueError: If ``n < 1``, or (from ``random.sample``) if ``matches``
            exceeds the number of available positions.
    """
    if n < 1:
        raise ValueError("n must be at least 1")

    sequences = []
    for _ in range(num_sequences):
        # A set gives O(1) membership tests in the inner loop.
        match_positions = set(random.sample(range(n, seq_length), matches))
        seq, conditions = [], []

        for i in range(seq_length):
            if i in match_positions:
                seq.append(seq[i - n])
                conditions.append('m')
                continue

            random_letter = random.choice(alphabet)
            # `i >= n` (not `i > n`): position n already has an n-back
            # reference (position 0), so it must not repeat it by accident
            # while being labelled as a non-match.
            while i >= n and seq[i - n] == random_letter:
                random_letter = random.choice(alphabet)
            seq.append(random_letter)
            conditions.append('-')

        sequences.append((''.join(seq), ''.join(conditions)))

    return sequences

def save_sequences_with_prompts(folder, sequences, split, n):
    """Write one JSON file of prompt/completion examples for an n-back split.

    Every sequence is expanded into one example per position ``j >= n``: the
    user message is the prefix ``seq[:j+1]`` and the completion is the label
    ``conditions[j]``. All examples share the same system prompt.

    Args:
        folder: Root output directory; the file goes into ``folder/split``.
        sequences: Iterable of ``(letters, conditions)`` string pairs.
        split: Split name, used for the sub-folder and the file name.
        n: Lag of the n-back task the sequences were generated for.

    Side effects:
        Creates ``folder/split`` if needed and (over)writes
        ``{n}back_{split}.json`` inside it.
    """
    split_folder = os.path.join(folder, split)
    # makedirs also creates `folder` itself when it is missing.
    os.makedirs(split_folder, exist_ok=True)

    # The match rule must describe the actual lag: "the previous one" is only
    # correct for 1-back and would contradict the labels for larger n.
    reference = "the previous one" if n == 1 else f"the one {n} letters ago"
    system_prompt = {
        "role": "system",
        "content": (
            f"You are asked to perform a {n}-back task. You will see a sequence of letters. "
            "Your task is to respond with 'm' (no quotation marks, just the letter m) "
            f"whenever the current letter is the same as {reference}, "
            "and '-' (no quotation marks, just the dash sign) otherwise. "
            "Only 'm' and '-' are allowed responses. "
            "No explanations needed: please don't output any extra words!! "
            "The sequence will be presented one letter at a time. Now begins the task."
        ),
    }

    data = []
    for seq, conditions in sequences:
        # The first n letters have no n-back reference, so they yield no example.
        for j in range(n, len(seq)):
            data.append({
                "messages": [system_prompt, {"role": "user", "content": seq[:j+1]}],
                "completion": {"role": "assistant", "content": conditions[j]},
            })

    with open(os.path.join(split_folder, f"{n}back_{split}.json"), "w", encoding="utf-8") as f:
        json.dump(data, f)

# Dataset configuration: letters to draw from, letters per sequence, matches
# per sequence, and number of sequences per split.
alphabet = 'bcdfghjklnpqrstvwxyz'
seq_length = 24
matches = 8
num_sequences_train = 35
num_sequences_val = 7
num_sequences_test = 8

# Seed the generator so that re-running this cell regenerates identical files;
# without it every run trains and evaluates on different data.
random.seed(42)

# All splits of all n-back levels live under the same root folder.
folder = os.path.join("datasets", "letters")
for n in [1, 2, 3]:
    sequences_train = generate_nback_sequences(n, alphabet, seq_length, matches, num_sequences_train)
    sequences_val = generate_nback_sequences(n, alphabet, seq_length, matches, num_sequences_val)
    sequences_test = generate_nback_sequences(n, alphabet, seq_length, matches, num_sequences_test)
    save_sequences_with_prompts(folder, sequences_train, 'train', n)
    save_sequences_with_prompts(folder, sequences_val, 'validation', n)
    save_sequences_with_prompts(folder, sequences_test, 'test', n)


In [2]:
from datasets import load_dataset

# Load the 1-, 2- and 3-back JSON files of each split into a single dataset.
dataset = load_dataset(
    'json',
    data_files={
        split_name: [
            f'datasets/letters/{split_name}/{level}back_{split_name}.json'
            for level in (1, 2, 3)
        ]
        for split_name in ('train', 'validation', 'test')
    },
)


In [3]:
# Show the first training record to check the messages/completion layout.
first_train_example = dataset['train'][0]
print(first_train_example)

{'messages': [{'content': "You are asked to perform a 1-back task. You will see a sequence of letters. Your task is to respond with 'm' (no quotation marks, just the letter m) whenever the current letter is the same as the previous one, and '-' (no quotation marks, just the dash sign) otherwise. Only 'm' and '-' are allowed responses. No explanations needed: please don't output any extra words!! The sequence will be presented one letter at a time. Now begins the task.", 'role': 'system'}, {'content': 'ky', 'role': 'user'}], 'completion': {'content': '-', 'role': 'assistant'}}


In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForSeq2Seq

# Identifier of the pretrained checkpoint; model and tokenizer are loaded from it.
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
# NOTE(review): no dtype, device or quantization arguments are passed here, so
# the library defaults apply. The saved output of the training cell is a CUDA
# OutOfMemoryError -- presumably the model as loaded is too large for this GPU.
# TODO confirm and pick a memory-reduction strategy.
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# Set the padding token: reuse the end-of-sequence token, because the
# preprocessing below pads every example to a fixed length.
tokenizer.pad_token = tokenizer.eos_token
# Define preprocess function
def preprocess_function(examples, max_length=128):
    """Tokenize a batch of examples into causal-LM training features.

    Each example becomes one sequence ``prompt + completion + EOS``. ``labels``
    has the same length as ``input_ids`` and is position-aligned with it:
    prompt and padding positions hold -100 so that the loss is computed on the
    completion tokens only. (Tokenizing the completion on its own and using it
    as ``labels`` leaves it unaligned with ``input_ids``, so the model is never
    trained to emit the completion after the prompt.)

    Args:
        examples: Batch dict with ``"messages"`` (per example, a list of dicts
            with a ``"content"`` key) and ``"completion"`` (per example, a dict
            with a ``"content"`` key).
        max_length: Fixed length every returned sequence is padded or
            truncated to.

    Returns:
        Dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``; each
        is a list with one list of ``max_length`` ints per example.
    """
    ignore_index = -100  # label value that the loss skips
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    prompts = [
        " ".join(message["content"] for message in messages)
        for messages in examples["messages"]
    ]
    # The leading space mirrors the " " separator used between message contents.
    targets = [" " + completion["content"] for completion in examples["completion"]]

    prompt_batch = tokenizer(prompts)["input_ids"]
    target_batch = tokenizer(targets, add_special_tokens=False)["input_ids"]

    features = {"input_ids": [], "attention_mask": [], "labels": []}
    for prompt_ids, target_ids in zip(prompt_batch, target_batch):
        # Close the answer with EOS so the model learns to stop after it.
        target_ids = (list(target_ids) + [tokenizer.eos_token_id])[:max_length]
        # Never truncate the completion; an over-long prompt is cut on the right.
        # NOTE(review): that drops the most recent letters -- raise max_length
        # if prompts can exceed it.
        prompt_ids = list(prompt_ids)[:max_length - len(target_ids)]

        input_ids = prompt_ids + target_ids
        labels = [ignore_index] * len(prompt_ids) + target_ids
        pad_count = max_length - len(input_ids)

        features["input_ids"].append(input_ids + [pad_id] * pad_count)
        features["attention_mask"].append([1] * len(input_ids) + [0] * pad_count)
        # Padding is masked by position, so the real EOS label above is kept
        # even though the pad token is the EOS token.
        features["labels"].append(labels + [ignore_index] * pad_count)

    return features

# Apply preprocess function to the loaded dataset; batched=True passes the
# examples to it in batches (dicts of lists) rather than one at a time.
tokenized_dataset = dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/2310 [00:00<?, ? examples/s]



Map:   0%|          | 0/462 [00:00<?, ? examples/s]

Map:   0%|          | 0/528 [00:00<?, ? examples/s]

In [7]:
# Set up training arguments and trainer
# NOTE(review): the saved output of this cell is a CUDA OutOfMemoryError, so the
# memory-saving settings below were not sufficient on the recorded run.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,  # Reduce batch size
    per_device_eval_batch_size=1,   # Reduce batch size
    gradient_accumulation_steps=4,  # Accumulate gradients over 4 steps
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,  # Enable mixed precision training
    gradient_checkpointing=True,  # Enable gradient checkpointing
)

# Train on the 'train' split and evaluate on the 'validation' split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    data_collator=data_collator,
)

# Clear CUDA cache
torch.cuda.empty_cache()

# Fine-tune the model
trainer.train()


OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 

In [None]:
# Evaluate the model on the Trainer's eval_dataset (the 'validation' split)
# and print the returned results.
results = trainer.evaluate()
print(results)

In [None]:
# Function to test the model
def test_model(model, tokenizer, test_dataset):
    model.eval()
    correct = 0
    total = 0

    for example in test_dataset:
        system_prompt = example["messages"][0]["content"]
        user_input = example["messages"][1]["content"]
        inputs = tokenizer(system_prompt + user_input, return_tensors='pt').to(model.device)
        with torch.no_grad():
            outputs = model.generate(**inputs)
        
        predicted_char = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
        if predicted_char == example["completion"]["content"]:
            correct += 1
        total += 1
    
    accuracy = correct / total
    return accuracy

# Load test data and evaluate: run test_model on the 'test' split and print
# the returned accuracy with two decimals.
test_dataset = dataset['test']
accuracy = test_model(model, tokenizer, test_dataset)
print(f"Accuracy: {accuracy:.2f}")