In [2]:
import os
from datasets import load_dataset

# Load Tiny Shakespeare dataset from the URL
dataset_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
dataset_path = "./data/tinyshakespeare/tiny_shakespeare.txt"

if not os.path.exists(dataset_path):
    print("Downloading Tiny Shakespeare dataset...")
    import requests
    response = requests.get(dataset_url)
    with open(dataset_path, "wb") as f:
        f.write(response.content)

# Load the dataset into a Hugging Face Dataset format
dataset = load_dataset("text", data_files={"train": dataset_path})

# Show a few examples from the dataset
print(dataset["train"][0])


Generating train split: 0 examples [00:00, ? examples/s]

{'text': 'First Citizen:'}


In [22]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from peft import get_peft_model, LoraConfig, TaskType

# Load pretrained GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name,
bos_token='<|startoftext|>',
                                              eos_token='<|endoftext|>',
                                              unk_token='<|unknown|>',
                                              pad_token='<|pad|>')
model = GPT2LMHeadModel.from_pretrained(model_name)


In [23]:

# Define LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # Language model task
    inference_mode=False,
    r=8,  # Rank of the low-rank adapters
    lora_alpha=32,
    lora_dropout=0.1
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)




In [9]:
model.print_trainable_parameters()

trainable params: 294,912 || all params: 124,734,720 || trainable%: 0.2364


In [33]:
tokenizer(":")

{'input_ids': [25], 'attention_mask': [1]}

In [36]:
from transformers import Trainer, TrainingArguments

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"])

tokenized_dataset = dataset["train"].map(tokenize_function, batched=True, remove_columns=["text"])


Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

In [27]:
dataset["train"][0]

{'text': 'First Citizen:'}

In [18]:
len(tokenized_dataset["input_ids"])

40000

In [37]:
tokenized_dataset["input_ids"][0]

[5962, 22307, 25]

In [40]:
tokenized_dataset['input_ids']

[[5962, 22307, 25],
 [8421, 356, 5120, 597, 2252, 11, 3285, 502, 2740, 13],
 [],
 [3237, 25],
 [5248, 461, 11, 2740, 13],
 [],
 [5962, 22307, 25],
 [1639, 389, 477, 12939, 2138, 284, 4656, 621, 284, 1145, 680, 30],
 [],
 [3237, 25],
 [4965, 5634, 13, 12939, 13],
 [],
 [5962, 22307, 25],
 [5962,
  11,
  345,
  760,
  327,
  1872,
  385,
  1526,
  28599,
  318,
  4039,
  4472,
  284,
  262,
  661,
  13],
 [],
 [3237, 25],
 [1135, 760, 470, 11, 356, 760, 470, 13],
 [],
 [5962, 22307, 25],
 [5756,
  514,
  1494,
  683,
  11,
  290,
  356,
  1183,
  423,
  11676,
  379,
  674,
  898,
  2756,
  13],
 [3792, 470, 257, 15593, 30],
 [],
 [3237, 25],
 [2949, 517, 3375, 319, 470, 26, 1309, 340, 307, 1760, 25, 1497, 11, 1497, 0],
 [],
 [12211, 22307, 25],
 [3198, 1573, 11, 922, 4290, 13],
 [],
 [5962, 22307, 25],
 [1135, 389, 17830, 3595, 4290, 11, 262, 1458, 1173, 1547, 922, 13],
 [2061, 4934, 969, 5036, 896, 319, 561, 26958, 514, 25, 611, 484],
 [19188, 7800, 514, 475, 262, 48713, 414, 11, 981, 

In [38]:

# Define training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-tinyshakespeare-lora",
    # evaluation_strategy="steps",
    # per_device_train_batch_size=4,
    # per_device_eval_batch_size=4,
    num_train_epochs=3,
    # logging_steps=500,
    # save_steps=1000,
    # save_total_limit=2,
    # logging_dir="./logs",
    # fp16=True,  # Mixed precision training
    # push_to_hub=False,  # Set to True if you want to push to Hugging Face Model Hub
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

# Start finetuning
trainer.train()




  0%|          | 0/30000 [00:00<?, ?it/s]

IndexError: index out of range in self