In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
pip install torch transformers datasets peft accelerate scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
from datasets import load_dataset

In [4]:
# LOAD AND STURCTURE DATA
data = load_dataset("Abirate/english_quotes")


def merge_columns(entry):
    entry["prediction"] = entry["quote"] + " ->: " + str(entry["tags"])
    return entry


data['train'] = data['train'].map(merge_columns)
print(data['train']['prediction'][:5])

["“Be yourself; everyone else is already taken.” ->: ['be-yourself', 'gilbert-perreira', 'honesty', 'inspirational', 'misattributed-oscar-wilde', 'quote-investigator']", "“I'm selfish, impatient and a little insecure. I make mistakes, I am out of control and at times hard to handle. But if you can't handle me at my worst, then you sure as hell don't deserve me at my best.” ->: ['best', 'life', 'love', 'mistakes', 'out-of-control', 'truth', 'worst']", "“Two things are infinite: the universe and human stupidity; and I'm not sure about the universe.” ->: ['human-nature', 'humor', 'infinity', 'philosophy', 'science', 'stupidity', 'universe']", "“So many books, so little time.” ->: ['books', 'humor']", "“A room without books is like a body without a soul.” ->: ['books', 'simile', 'soul']"]


In [5]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

data = data.map(lambda samples: tokenizer(samples['prediction']), batched=True)
print(data["train"][0])



Map:   0%|          | 0/2508 [00:00<?, ? examples/s]

{'quote': '“Be yourself; everyone else is already taken.”', 'author': 'Oscar Wilde', 'tags': ['be-yourself', 'gilbert-perreira', 'honesty', 'inspirational', 'misattributed-oscar-wilde', 'quote-investigator'], 'prediction': "“Be yourself; everyone else is already taken.” ->: ['be-yourself', 'gilbert-perreira', 'honesty', 'inspirational', 'misattributed-oscar-wilde', 'quote-investigator']", 'input_ids': [447, 250, 3856, 3511, 26, 2506, 2073, 318, 1541, 2077, 13, 447, 251, 4613, 25, 37250, 1350, 12, 14108, 944, 3256, 705, 37718, 4835, 12, 525, 260, 8704, 3256, 705, 24130, 9673, 3256, 705, 1040, 4063, 864, 3256, 705, 25413, 1078, 6169, 12, 418, 7718, 12, 21992, 68, 3256, 705, 22708, 12, 24859, 23823, 20520], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [6]:
model = AutoModelForCausalLM.from_pretrained(
    "gpt2",
    device_map='auto',
)

# FREEZE WEIGHTS
for param in model.parameters():
    param.requires_grad = False

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [7]:
# LoRa
config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)




In [8]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )


print_trainable_parameters(model)



trainable params: 589824 || all params: 125029632 || trainable%: 0.4717473694555863


In [9]:
# TRAINING
trainer = transformers.Trainer(
    model=model,
    train_dataset=data['train'],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_steps=100,
        max_steps=500,
        learning_rate=2e-4,
        num_train_epochs=3,
        logging_steps=1,
        output_dir='outputs',
        auto_find_batch_size=True
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = False
trainer.train()

torch.save(model.state_dict(), 'lora.pt')

max_steps is given, it will override any value given in num_train_epochs
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mharicharan6991[0m ([33mharicharan6991-self[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113765922224654, max=1.0…

Step,Training Loss
1,4.6523
2,4.4936
3,4.2462
4,4.4126
5,4.1223
6,4.4607
7,4.5576
8,4.5646
9,4.1746
10,4.5192


In [10]:
import torch
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = AutoModelForCausalLM.from_pretrained(
    "gpt2",
    device_map='auto',
)

config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
model = model.to(device)
model.load_state_dict(torch.load("lora.pt", map_location=device))

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

with torch.no_grad():
    batch = tokenizer("“Life is like a box of chocolates, you never know what you are gonna get” ->: ", return_tensors='pt').to(device)
    output_tokens = model.generate(**batch, max_new_tokens=25)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))

  model.load_state_dict(torch.load("lora.pt", map_location=device))
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.




 “Life is like a box of chocolates, you never know what you are gonna get” ->: vernacular, love, life, love-inspirational-life, love-inspirational-life-inspir
