In [None]:
! pip install --upgrade huggingface_hub datasets accelerate transformers trl peft bitsandbytes


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


# Load Dataset

In [None]:
from datasets import load_dataset

In [None]:
dataset = load_dataset("stanfordnlp/imdb", split='train')

In [None]:
dataset[0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

# Tokenizer

In [None]:
from transformers import GPT2Tokenizer

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Model

In [None]:
from transformers import GPT2Config, GPT2Model, GPT2LMHeadModel, AutoModelForCausalLM

2025-01-05 17:04:43.982097: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-05 17:04:44.008910: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
model = GPT2LMHeadModel(GPT2Config()) # empty model

In [None]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
from transformers import pipeline, set_seed

In [None]:
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
set_seed(0)

Device set to use cuda:0


In [None]:
generator("Hello, I'm a language model,", max_length=30, num_return_sequences=1)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Hello, I'm a language model,azorazorazorigoigoigoigo Animal Animal Animal Animal Animal Animal Animal Animal Reptmultmult Vitalandowskiandowskiandowski"}]

# Trainer

In [None]:
from transformers import TrainingArguments

In [None]:
training_args = TrainingArguments(
    output_dir="llms",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    eval_steps=1000,
    gradient_accumulation_steps=2,
    eval_accumulation_steps=1,
    #max_steps=1000
    #push_to_hub=True,
)

## Prepare Dataset

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding, DataCollatorForLanguageModeling


def encode(examples):
    result = tokenizer(examples['text'], truncation=True, padding='max_length')
    result["labels"] = result["input_ids"].copy()
    return result

tokenizer.pad_token = tokenizer.eos_token
dataset = dataset.map(encode, batched=True, remove_columns=['label'])



#tokenized_datasets = dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
dataset[0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [None]:
from transformers import Trainer

In [None]:
trainer = Trainer( # fine tuning on the review language
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
    data_collator=data_collator,
    processing_class=tokenizer
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss
0,3.5754,1.738391


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=1562, training_loss=3.7601075117475093, metrics={'train_runtime': 2915.5676, 'train_samples_per_second': 8.575, 'train_steps_per_second': 0.536, 'total_flos': 1.3060420927488e+16, 'train_loss': 3.7601075117475093, 'epoch': 0.99968})

In [None]:
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
set_seed(0)
generator("Hello, I'm a language model,", max_length=30, num_return_sequences=1)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Hello, I'm a language model, I have been a movie, I have been a movie. I have been a movie. I was a movie"}]

In [None]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()

# Unsupervised Fine Tuning

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer
from datasets import load_dataset

model = AutoModelForCausalLM.from_pretrained('gpt2')
tokenizer = AutoTokenizer.from_pretrained('gpt2')
dataset = load_dataset("stanfordnlp/imdb", split='train')


def encode(examples):
    result = tokenizer(examples['text'], truncation=True, padding='max_length')
    result["labels"] = result["input_ids"].copy()
    return result

tokenizer.pad_token = tokenizer.eos_token
dataset = dataset.map(encode, batched=True, remove_columns=['label'])
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir="llms",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    eval_steps=1000,
    gradient_accumulation_steps=2,
    eval_accumulation_steps=1,
    #max_steps=1000
    #push_to_hub=True,
)

trainer = Trainer( # fine tuning on the review language
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
    data_collator=data_collator,
    processing_class=tokenizer
)

2025-01-11 22:41:50.329594: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-11 22:41:50.356837: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
from transformers import pipeline, set_seed
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
set_seed(0)
generator("Hello, I'm a language model,", max_length=30, num_return_sequences=1)

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[{'generated_text': "Hello, I'm a language model, language model. You know what I mean? A language model.\n\nWhen I write, I'm like"}]

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss
0,2.0985,1.004865


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=1562, training_loss=2.1338635687638794, metrics={'train_runtime': 3002.4575, 'train_samples_per_second': 8.327, 'train_steps_per_second': 0.52, 'total_flos': 1.3060420927488e+16, 'train_loss': 2.1338635687638794, 'epoch': 0.99968})

In [None]:
generator("Hello, I'm a language model,", max_length=300, num_return_sequences=1)

[{'generated_text': 'Hello, I\'m a language model, and I enjoy speaking Latin as a child. My favorite films is Lucio Morricone\'s (2007, 2003, 2004) and the excellent animated short "The Man Who Wrote Our Mother\'s Heart."'}]

In [None]:
generator("I liked the movie because", max_length=300, num_return_sequences=1)

[{'generated_text': 'I liked the movie because the plot and writing were extremely good and good at capturing life throughout the movie. The scenes depicted in the film were beautiful. I could not resist watching these scenes. In one scene I am reading a newspaper that is supposed to sell newspapers -- a small town. I had a great time watching this because even though the plot is weak and the movie doesn\'t get much interest from the viewer, at least I have no regrets here. It was a great movie to watch during this rainy season. The way the characters were portrayed and their feelings were captured was great to watch. I could not have chosen a better word for this movie -- "beautiful." My review says "truly an enjoyable movie to watch." This movie is for all ages but especially for kids. Great in that it has enough humor to entertain a lot of adults.'}]

In [None]:
generator("I didn't like the movie because", max_length=300, num_return_sequences=1)

[{'generated_text': 'I didn\'t like the movie because, I knew it wasn\'t going to be funny at all and because it was so cliché with the actors. It was a little bit cheesy but no matter. Some jokes, like in the "Whoa, we all got it?!" video, are just annoying. "Worst case scenario, who\'s with you??? Who\'s behind you???" I think the only way to describe this movie was to say that it sucked but it was an entertaining movie.'}]

In [None]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()

# Supervised Fine Tuning

In [None]:
from datasets import load_dataset

dataset = load_dataset("imdb")
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
from transformers import GPT2ForSequenceClassification

model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=2)
model.config.pad_token_id = model.config.eos_token_id
from transformers import Trainer, TrainingArguments
from trl import SFTConfig, SFTTrainer


training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=10,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

trainer.train()

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.3643
1000,0.288
1500,0.2712
2000,0.2583
2500,0.2441
3000,0.2206
3500,0.2171
4000,0.2027
4500,0.2147
5000,0.2073


TrainOutput(global_step=7500, training_loss=0.21930495300292968, metrics={'train_runtime': 5155.3491, 'train_samples_per_second': 14.548, 'train_steps_per_second': 1.455, 'total_flos': 3.91945125888e+16, 'train_loss': 0.21930495300292968, 'epoch': 3.0})

In [None]:
def get_sentiment(sentence):
    inputs = tokenizer(sentence, return_tensors="pt").to('cuda')
    outputs = model(**inputs)
    prediction = outputs.logits.argmax(-1).item()
    return "positive" if prediction == 1 else "negative"

sentence = "I loved this movie!"
print(get_sentiment(sentence))

positive


In [None]:
from transformers import AutoModelForCausalLM
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer

dataset = load_dataset("stanfordnlp/imdb", split="train")
model = AutoModelForCausalLM.from_pretrained("gpt2")

training_args = SFTConfig(output_dir="/tmp",
                          per_device_train_batch_size=2,
                          per_device_eval_batch_size=2,
                          num_train_epochs=2,)

trainer = SFTTrainer( # Fine Tune on specific task
    model,
    train_dataset=dataset,
    args=training_args,
)

trainer.train()

Step,Training Loss
500,3.7336
1000,3.6898
1500,3.6713
2000,3.6408
2500,3.6443
3000,3.6318
3500,3.5966
4000,3.6143
4500,3.6137
5000,3.5972


TrainOutput(global_step=25000, training_loss=3.535866337890625, metrics={'train_runtime': 2127.9239, 'train_samples_per_second': 23.497, 'train_steps_per_second': 11.749, 'total_flos': 1.0261445370624e+16, 'train_loss': 3.535866337890625, 'epoch': 2.0})

In [None]:
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
set_seed(0)
generator("Hello, I'm a language model,", max_length=30, num_return_sequences=1)

Device set to use cuda:0


[{'generated_text': 'Hello, I\'m a language model, not a "professional". I don\'t know how "language models" work in real life, but this series'}]

In [None]:
generator("I liked the movie because", max_length=300, num_return_sequences=1)

[{'generated_text': "I liked the movie because of Richard Attenborough being a good role model for the actor. I liked the film because of the great character development. I wanted to like it because of the very good cast. I do think that every time I watch it is because of that acting. I really enjoyed the character development. Unfortunately I thought the movie was a little bit too slow. Most of the movie didn't say much and I did like the slow pace. I think the movie needs to get a little more serious. I am really going to wait to find out the reason this has been so long. I know that some movie movies have a good story you watch and some movies they don't. I just wanted to just like this movie and just wanted to like it. It is not the right movie. It has a bad cast that got their points. I found the movie to be a little too light for me. It was too slow. I think the movie should be improved. It is a bad movie that is too slow. I am also glad I bought this movie. I love movies. If yo

In [None]:
generator("I did not like the movie because", max_length=300, num_return_sequences=1)

[{'generated_text': "I did not like the movie because of the plot and the fact I was scared.<br /><br />I thought the movie could have been a better film but it was just very boring. If someone would ask me to remake an old movie, I would just say it was a waste of 15 minutes of my life.<br /><br />The only point the film did was to draw attention to the fact that one had a history of mental illness and had to face the fact that his condition made it difficult to pass. There are very few people who would rather be saved and not be seen trying to end the suffering of someone they love. All of that said, the movie was one of the most disappointing and poorly put together movies i have ever watched. I cannot imagine anyone watching it expecting much. <br /><br />Please, if you think the movie is so boring it's not good, i.e. the plot just does not belong on the DVD and just doesn't seem to fit into the movie. If you think the film's not too serious, please give it a shot. But if you are j

In [None]:
import gc, torch
gc.collect()
torch.cuda.empty_cache()

# Paramter Efficient Fine Tuning

In [None]:
from peft import LoraConfig

model = AutoModelForCausalLM.from_pretrained("gpt2")


peft_config = LoraConfig(
        r=32,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        # tune the embedding layer and prediction head
        modules_to_save=["wte", "lm_head"]
    )


training_args = SFTConfig(output_dir="/tmp",
                          per_device_train_batch_size=2,
                          per_device_eval_batch_size=2,
                          num_train_epochs=2,)

trainer = SFTTrainer( # Fine Tune on specific task
    'gpt2',
    train_dataset=dataset,
    args=training_args,
    peft_config=peft_config
)

trainer.train()



Step,Training Loss
500,3.8394
1000,3.7815
1500,3.7589
2000,3.7252
2500,3.7203
3000,3.7118
3500,3.6709
4000,3.6922
4500,3.6932
5000,3.678


TrainOutput(global_step=25000, training_loss=3.6454811279296875, metrics={'train_runtime': 1760.786, 'train_samples_per_second': 28.396, 'train_steps_per_second': 14.198, 'total_flos': 1.5060280946973696e+16, 'train_loss': 3.6454811279296875, 'epoch': 2.0})

In [None]:
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
set_seed(0)
generator("Hello, I'm a language model,", max_length=30, num_return_sequences=1)

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[{'generated_text': "Hello, I'm a language model, not a syntax.\n\nYou've done something great.\n\nWrite a simple type for string\n\n"}]

In [None]:
gc.collect()
torch.cuda.empty_cache()

# Instruction Fine Tuning

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM

dataset = load_dataset("lucasmccabe-lmi/CodeAlpaca-20k", split="train")

model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained('gpt2')

tokenizer.pad_token = tokenizer.eos_token


def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['instruction'])):
        text = f"### Question: {example['instruction'][i]}\n ### Answer: {example['output'][i]}"
        output_texts.append(text)
    return output_texts

response_template = " ### Answer:"
collator = DataCollatorForCompletionOnlyLM(response_template,
                                            tokenizer=tokenizer)

trainer = SFTTrainer(
    model,
    train_dataset=dataset,
    args=SFTConfig(output_dir="/tmp", per_device_train_batch_size=1,
                          per_device_eval_batch_size=1,
                          num_train_epochs=1),
    formatting_func=formatting_prompts_func,
    data_collator=collator,
)

trainer.train()

2025-01-19 14:12:14.597449: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-19 14:12:14.745906: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Step,Training Loss
500,1.9942
1000,1.8854
1500,1.737
2000,1.6427
2500,1.7816
3000,1.6479
3500,1.6885
4000,1.5255
4500,1.4975
5000,1.5221


TrainOutput(global_step=20022, training_loss=1.481015111740123, metrics={'train_runtime': 680.4995, 'train_samples_per_second': 29.423, 'train_steps_per_second': 29.423, 'total_flos': 1091093264640000.0, 'train_loss': 1.481015111740123, 'epoch': 1.0})

In [None]:
from transformers import pipeline, set_seed

generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
#set_seed(0)
generator(" ### Question: How can you help me? ### Answer:", max_length=30, num_return_sequences=1)

Device set to use cuda:0


[{'generated_text': ' ### Question: How can you help me? ### Answer: You can:\n  A: If you have any questions, message me on Twitter,'}]

In [None]:
gc.collect()
torch.cuda.empty_cache()

# Alignment Tuning (DPO)

In [None]:
from datasets import load_dataset
from trl import DPOConfig, DPOTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorWithPadding

model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.chat_template = "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"

train_dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")

training_args = DPOConfig(output_dir="gpt2_dpo", logging_steps=10, max_length=1024, per_device_train_batch_size=2, per_device_eval_batch_size=2, num_train_epochs=1)
trainer = DPOTrainer(model=model, args=training_args, processing_class=tokenizer, train_dataset=train_dataset)
trainer.train()

Step,Training Loss
10,0.6989
20,0.7609
30,0.6553
40,0.6856
50,0.6943
60,0.7439
70,0.6982
80,0.6914
90,0.7007
100,0.6713


TrainOutput(global_step=31068, training_loss=0.746306933720661, metrics={'train_runtime': 15398.6027, 'train_samples_per_second': 4.035, 'train_steps_per_second': 2.018, 'total_flos': 0.0, 'train_loss': 0.746306933720661, 'epoch': 1.0})

In [None]:
from transformers import pipeline, set_seed
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
set_seed(0)
chat_history = [
    {"role": "system", "content": "You are a chatbot"},
    {"role": "user", "content": "Help me?"},
]

outputs = generator(
    chat_history,
    max_new_tokens=52,
)

print(outputs[0]["generated_text"])

Device set to use cuda:0


[{'role': 'system', 'content': 'You are a chatbot'}, {'role': 'user', 'content': 'Help me?'}, {'role': 'assistant', 'content': 'Notes:\nThis command ignores message-level messages, making communication with external tools harder. To avoid having to deal with a bot (e.g. a chat client) with an error message, be aware that text-only commands are not supported; messages'}]


In [1]:
import gc, torch
gc.collect()
torch.cuda.empty_cache()