In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DefaultDataCollator
from datasets import Dataset, load_dataset
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [3]:
torch.cuda.memory_summary(device=None, abbreviated=False)



In [4]:

print(torch.cuda.get_device_name(0))
print('Memory Usage:')
print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

NVIDIA GeForce RTX 4090
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [5]:
print(device)

cuda:0


In [6]:
tokenizer = AutoTokenizer.from_pretrained("RuterNorway/Llama-2-13b-chat-norwegian")

In [7]:
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")

In [8]:
dataset = load_dataset('csv', data_files=r'C:\Users\adrianhf\Documents\test\Master\data\synthetic_data\question_abstract_pair.csv', split="train[:10]")

In [9]:
dataset = dataset.train_test_split(test_size=0.2)

In [10]:
dataset=dataset.flatten()

In [11]:
def preprocess_function(examples):
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained("RuterNorway/Llama-2-7b-chat-norwegian")
    return tokenizer([" ".join(x) for x in examples["Abstract"]])

In [12]:
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=dataset["train"].column_names,
)

Map (num_proc=4): 100%|██████████| 8/8 [00:06<00:00,  1.29 examples/s]
num_proc must be <= 2. Reducing num_proc to 2 for dataset of size 2.
Map (num_proc=2): 100%|██████████| 2/2 [00:03<00:00,  1.71s/ examples]


In [13]:
def group_texts(examples):
    block_size = 128
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [14]:
lm_dataset = tokenized_dataset.map(group_texts, batched=True, num_proc=4)

Map (num_proc=4): 100%|██████████| 8/8 [00:01<00:00,  5.05 examples/s]
num_proc must be <= 2. Reducing num_proc to 2 for dataset of size 2.
Map (num_proc=2): 100%|██████████| 2/2 [00:01<00:00,  1.32 examples/s]


In [15]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors="tf")

In [16]:
## Load the transformers model
model = AutoModelForCausalLM.from_pretrained("RuterNorway/Llama-2-7b-chat-norwegian")

config.json: 100%|██████████| 634/634 [00:00<00:00, 1.89MB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
pytorch_model.bin.index.json: 100%|██████████| 23.9k/23.9k [00:00<00:00, 23.8MB/s]
pytorch_model-00001-of-00002.bin: 100%|██████████| 9.98G/9.98G [15:53<00:00, 10.5MB/s]
pytorch_model-00002-of-00002.bin: 100%|██████████| 3.50G/3.50G [05:34<00:00, 10.5MB/s]
Downloading shards: 100%|██████████| 2/2 [21:28<00:00, 644.37s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.44s/it]
generation_config.json: 100%|██████████| 183/183 [00:00<?, ?B/s] 


In [17]:
data_collator = DefaultDataCollator()

In [18]:
lm_dataset["test"]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 468
})

In [19]:
## Load the training_args and the trainer
training_args = TrainingArguments(
    output_dir="Master/trained_models",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [20]:
trainer.train()

  attn_output = torch.nn.functional.scaled_dot_product_attention(


OutOfMemoryError: CUDA out of memory. Tried to allocate 172.00 MiB. GPU 0 has a total capacity of 23.99 GiB of which 0 bytes is free. Of the allocated memory 53.84 GiB is allocated by PyTorch, and 69.70 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)