In [None]:
!pip install torch torchvision torchaudio accelerate transformers
!pip install transformers accelerate datasets -U

In [None]:
import os
from getpass import getpass

import torch
from datasets import load_dataset
from huggingface_hub import login
from huggingface_hub import Repository
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling, pipeline

# Authenticate against the Hugging Face Hub without embedding the secret in
# the notebook: prefer the HF_TOKEN environment variable and fall back to a
# hidden interactive prompt when it is unset or empty.
# NOTE(review): a literal access token used to be written on this line; treat
# it as leaked and revoke it in the Hugging Face account settings.
token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=token)

In [3]:
# Load the dataset: map each split name to its local CSV file, then let the
# generic 'csv' loader build one split per key.
data_files = {
    "train": "1337_school_training.csv",
    "validation": "1337_school_validation.csv",
}
dataset = load_dataset("csv", data_files=data_files)


In [None]:
# Base checkpoint whose tokenizer and weights are fetched below.
model_name = 'meta-llama/Meta-Llama-3-8B-Instruct'

# Tokenizer first: register a dedicated '[PAD]' special token so that padded
# tokenization can be requested later in the notebook.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens(dict(pad_token='[PAD]'))

# Causal-LM weights for the same checkpoint.
model = AutoModelForCausalLM.from_pretrained(model_name)

In [5]:
# Tokenize the dataset
def tokenize_function(examples, max_length=3000, separator="\n"):
    """Tokenize a batch of prompt/answer pairs for causal-LM fine-tuning.

    A causal LM is trained to predict the next token of a single sequence, so
    the prompt and the answer are joined into one text per example instead of
    being tokenized separately. Tokenizing them separately leaves ``labels``
    misaligned with ``input_ids``, and a language-modeling collator that
    rebuilds labels from ``input_ids`` would then never see the answers.

    Args:
        examples: Batch mapping with an ``"input"`` and an ``"output"`` column,
            each a list of equal length. ``None`` cells (e.g. empty CSV
            fields) are treated as empty strings.
        max_length: Length every sequence is padded / truncated to.
        separator: Text inserted between the prompt and the answer.

    Returns:
        The tokenizer's batch encoding with an added ``"labels"`` entry: a
        copy of ``input_ids`` in which padded positions are replaced by -100
        so they are ignored by the loss.
    """
    # End-of-sequence marker (if the tokenizer defines one) so the model can
    # learn where an answer stops; long examples may lose it to truncation.
    eos = getattr(tokenizer, "eos_token", None) or ""

    texts = [
        f"{'' if prompt is None else prompt}{separator}"
        f"{'' if answer is None else answer}{eos}"
        for prompt, answer in zip(examples["input"], examples["output"])
    ]

    encoded = tokenizer(texts, padding="max_length", truncation=True, max_length=max_length)

    # Labels mirror input_ids; attention_mask == 0 marks padding.
    encoded["labels"] = [
        [tok if keep else -100 for tok, keep in zip(ids, mask)]
        for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
    ]
    return encoded

In [6]:
# Resize the embedding matrix to the tokenizer's current vocabulary size so
# every token id the tokenizer can emit has an embedding row.
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)

# Apply the tokenization function to every split, one batch at a time.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
)

# mlm=False configures the collator for causal (not masked) language modeling.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [None]:
# Check CUDA availability (informational only; the result is just printed).
cuda_available = torch.cuda.is_available()
print(cuda_available)

# The model is placed on the CPU regardless of the check above.
device = torch.device("cpu")
model.to(device)

In [None]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",          # checkpoints are written here
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,              # keep at most two checkpoints on disk
    # `use_cpu` replaces the deprecated `no_cuda` flag; training stays on the
    # CPU, matching the device the model is moved to earlier in the notebook.
    use_cpu=True,
)


In [9]:
# Trainer: ties together the model, its training arguments, both dataset
# splits and the batch collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # `processing_class` supersedes the deprecated `tokenizer` keyword; the
    # tokenizer is still handed to the Trainer exactly as before.
    # NOTE(review): requires a transformers release that accepts
    # `processing_class` (the install cell upgrades to the latest).
    processing_class=tokenizer,
    data_collator=data_collator,
)

In [None]:
# Fine-tuning: run the training loop of the `trainer` built in an earlier
# cell. The call is left as the cell's last expression so the notebook
# displays whatever `train()` returns.
trainer.train()

In [None]:
# Save the fine-tuned model into a sub-directory of the training output dir.
fine_tuned_dir = "./results/fine-tuned-model"
trainer.save_model(fine_tuned_dir)

# Disabled draft for publishing to the Hub, kept for reference:
# repo = Repository(local_dir="fine-tuned-model-dir", token=token)
# repo.create_repo(name="1337bot", exist_ok=True)
# repo.push_to_hub()
# NOTE(review): `local_dir` in the draft differs from the save path used
# above — confirm the intended directory before enabling it.