In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling

In [None]:
## Load the dataset and hold out 20% of the rows for validation
df = pd.read_csv('Master/data/synthetic_data/question_abstract_pair.csv')

# The tokenization step reads these two columns; fail early with a clear message if absent.
required_columns = ['Question', 'Abstract']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"Dataset is missing required columns: {missing_columns}")

# Empty CSV cells load as NaN (a float), which cannot be tokenized as text,
# so drop rows lacking either field. This is a no-op when the data is complete.
df = df.dropna(subset=required_columns)

# Fixed random_state keeps the train/validation split reproducible across runs.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
## Load tokenizer to process the question and context field
tokenizer = AutoTokenizer.from_pretrained("RuterNorway/Llama-2-13b-chat-norwegian")

# Padding a batch requires a pad token, which Llama-2 tokenizers commonly do not
# define; fall back to the end-of-sequence token only when none is configured.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
## Tokenize the training data
# Encode only the training split, as (question, abstract) pairs, so the rows
# held out in val_df do not end up in the training encodings.
tokenized_data = tokenizer(
    train_df['Question'].tolist(),
    train_df['Abstract'].tolist(),
    truncation=True,
    padding=True,  # pad every sequence to the longest one in the split
)

In [None]:
## Load the causal language model that is handed to the Trainer
model_checkpoint = "RuterNorway/Llama-2-13b-chat-norwegian"
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

In [None]:
## Build the datasets and collator, then load the training_args and the trainer


class QuestionAbstractDataset:
    """Map-style dataset of tokenized (Question, Abstract) pairs.

    Only ``__len__`` and ``__getitem__`` are implemented, which is the
    map-style protocol a PyTorch DataLoader (and therefore the Trainer) needs.

    Args:
        frame: DataFrame with 'Question' and 'Abstract' text columns.
        tokenizer: Tokenizer used to encode each question/abstract pair.
    """

    def __init__(self, frame, tokenizer):
        # No padding here: the data collator pads each batch to its own longest sequence.
        self.encodings = tokenizer(
            frame['Question'].tolist(),
            frame['Abstract'].tolist(),
            truncation=True,
        )

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, index):
        # Return only the fields the collator/model consume for one example.
        return {
            'input_ids': self.encodings['input_ids'][index],
            'attention_mask': self.encodings['attention_mask'][index],
        }


# The collator pads batches and needs a pad token, which Llama-2 tokenizers
# commonly do not define; fall back to the end-of-sequence token if missing.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

train_dataset = QuestionAbstractDataset(train_df, tokenizer)
eval_dataset = QuestionAbstractDataset(val_df, tokenizer)

# mlm=False selects the causal-LM objective: labels are the input ids with
# padding positions set to -100 so they are ignored by the loss.
# NOTE(review): when pad_token == eos_token, EOS positions are masked as well.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="Master/trained_models",
    # NOTE(review): newer transformers releases name this `eval_strategy` - confirm
    # against the installed version.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)