In [1]:
import torch
from tqdm.auto import tqdm
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

cpu


# 1 Load datasets

In [2]:
import sklearn
import numpy as np
with open('./data.txt', 'rb') as f:
    lines = [line.strip().lower().decode("ascii", "ignore") for line in f if len(line.strip()) > 0]
ds_train, ds_valid = sklearn.model_selection.train_test_split(lines, test_size=0.3, random_state=555)

In [3]:
from datasets import Dataset
from datasets import DatasetDict
train = Dataset.from_dict({'text': ds_train})
valid = Dataset.from_dict({'text': ds_valid})
raw_datasets = DatasetDict({'train':train,'valid':valid})
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1862
    })
    valid: Dataset({
        features: ['text'],
        num_rows: 799
    })
})

In [4]:
for key in raw_datasets["train"][0]:
    print(f"{key.upper()}: {raw_datasets['train'][0][key][:200]}")

TEXT: "i think i could jump over it," said the cowardly lion, after measuring


# 2 Preprocessing

In [5]:
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.padding_side = "right" # "left" or "right"
tokenizer.pad_token = tokenizer.eos_token

outputs = tokenizer(
    raw_datasets["train"][:2]["text"], 
    return_tensors="pt",
    padding=True,
    )
print(f"Input IDs length: {len(outputs['input_ids'])}")
print(outputs['input_ids'])

Input IDs length: 2
tensor([[    1,    72,   892,  1312,   714,  4391,   625,   340,   553,   531,
           262, 47687, 18744,    11,   706, 15964],
        [  568,   484,  1043,   257, 37438,  1295,   739,   262,  7150,   810,
           484, 21256,   880,  1566, 50256, 50256]])


In [6]:
def tokenize(element):
    outputs = tokenizer(
        element["text"], 
        return_tensors="pt",
        padding=True,
    )
    
    input_batch = []
    for input_ids in outputs["input_ids"]:
        input_batch.append(input_ids)
    return {"input_ids": input_batch}

# raw_datasets_ = Dataset.from_pandas(pd.DataFrame(data=raw_datasets_train))
tokenized_datasets = raw_datasets.map(
    tokenize, batched=True, remove_columns=raw_datasets["train"].column_names
)
tokenized_datasets

Map:   0%|          | 0/1862 [00:00<?, ? examples/s]

Map:   0%|          | 0/799 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 1862
    })
    valid: Dataset({
        features: ['input_ids'],
        num_rows: 799
    })
})

# 3 Model

In [7]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig
context_length  = 128
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

In [8]:
model = GPT2LMHeadModel(config)
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

GPT-2 size: 124.4M parameters


In [9]:
keytoken_ids = []
for keyword in [
    "will",
    "long",
    "How",
    "diplomatic officials",
    "Napoleon",
    "Alexander ",
    "France",
    "Lázarev"
]:
    ids = tokenizer([keyword]).input_ids[0]
    if len(ids) == 1:
        keytoken_ids.append(ids[0])
    else:
        print(f"Keyword has not single token: {keyword}")

Keyword has not single token: diplomatic officials
Keyword has not single token: Napoleon
Keyword has not single token: Alexander 
Keyword has not single token: Lázarev


In [10]:
from torch.nn import CrossEntropyLoss
import torch

def keytoken_weighted_loss(inputs, logits, keytoken_ids, alpha=1.0):
    # Shift so that tokens < n predict n
    shift_labels = inputs[..., 1:].contiguous()
    shift_logits = logits[..., :-1, :].contiguous()
    # Calculate per-token loss
    loss_fct = CrossEntropyLoss(reduce=False) #change to reduction=None
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    # Resize and average loss per sample
    loss_per_sample = loss.view(shift_logits.size(0), shift_logits.size(1)).mean(axis=1)
    # Calculate and scale weighting
    weights = torch.stack([(inputs == kt).float() for kt in keytoken_ids]).sum(
        axis=[0, 2]
    )
    weights = alpha * (1.0 + weights)
    # Calculate weighted average
    weighted_loss = (loss_per_sample * weights).mean()
    return weighted_loss

In [11]:
from torch.utils.data.dataloader import DataLoader
batch_size = 128
tokenized_datasets.set_format("torch")
train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=batch_size, shuffle=True)
eval_dataloader  = DataLoader(tokenized_datasets["valid"], batch_size=batch_size)

In [12]:
weight_decay = 0.1

def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]):
    params_with_wd, params_without_wd = [], []
    for n, p in model.named_parameters():
        if any(nd in n for nd in no_decay):
            params_without_wd.append(p)
        else:
            params_with_wd.append(p)
    return [
        {"params": params_with_wd, "weight_decay": weight_decay},
        {"params": params_without_wd, "weight_decay": 0.0},
    ]

In [13]:
def evaluate():
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():

            outputs = model(batch["input_ids"].to(device), labels=batch["input_ids"].to(device))
            outputs.loss = outputs.loss.reshape(1)
        losses.append(accelerator.gather(outputs.loss))        
    loss = torch.mean(torch.cat(losses))
    try:
        perplexity = torch.exp(loss)
    except OverflowError:
        perplexity = float("inf")
    return loss.item(), perplexity.item()

In [14]:
model = GPT2LMHeadModel(config)
model = model.to(device)
from torch.optim import AdamW
optimizer = AdamW(get_grouped_params(model), lr=5e-4)

### Accelerator

In [15]:
from accelerate import Accelerator

accelerator = Accelerator(mixed_precision='fp16')

model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [16]:
from transformers import get_scheduler

num_train_epochs = 5
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=1_000,
    num_training_steps=num_training_steps,
)

### Repository

In [17]:
from huggingface_hub import notebook_login

notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (manager).
Your token has been saved to C:\Users\anhng\.cache\huggingface\token
Login successful


In [18]:
from huggingface_hub import Repository, get_full_repo_name

model_name = "Lab9"
repo_name = get_full_repo_name(model_name)
repo_name

'Anh1008AIT/Lab9'

In [19]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"

output_dir = "Lab9"
repo = Repository(output_dir, clone_from=repo_name)

Cloning https://huggingface.co/Anh1008AIT/Lab9 into local empty directory.


# 4 Training

In [20]:
evaluate()

(10.66565990447998, 42858.52734375)

In [21]:
from tqdm.notebook import tqdm

gradient_accumulation_steps = 10
eval_steps = 10

model.train()
completed_steps = 0
for epoch in range(num_train_epochs):
    for step, batch in tqdm(enumerate(train_dataloader, start=1), total=num_training_steps):
        logits = model(batch["input_ids"]).logits
        loss = keytoken_weighted_loss(batch["input_ids"], logits, keytoken_ids)

        if step % 10 == 0:
            accelerator.print(
                {
                    "steps": completed_steps,
                    "loss/train": loss.item() * gradient_accumulation_steps,
                }
            )
        loss = loss / gradient_accumulation_steps
        # print(loss)
        accelerator.backward(loss) #instance of optimize.backward()

        if step % gradient_accumulation_steps == 0:
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            completed_steps += 1
            
        if (step % (eval_steps * gradient_accumulation_steps)) == 0:
            eval_loss, perplexity = evaluate()
            accelerator.print({"loss/eval": eval_loss, "perplexity": perplexity})
            model.train()
            #save your model
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
            if accelerator.is_main_process:
                tokenizer.save_pretrained(output_dir)
                repo.push_to_hub(
                    commit_message=f"Training in progress step {step}", blocking=False
                )

  0%|          | 0/75 [00:00<?, ?it/s]



{'steps': 0, 'loss/train': 106.883544921875}


  0%|          | 0/75 [00:00<?, ?it/s]

{'steps': 1, 'loss/train': 106.6634750366211}


  0%|          | 0/75 [00:00<?, ?it/s]

{'steps': 2, 'loss/train': 106.60737991333008}


  0%|          | 0/75 [00:00<?, ?it/s]

{'steps': 3, 'loss/train': 103.3603286743164}


  0%|          | 0/75 [00:00<?, ?it/s]

{'steps': 4, 'loss/train': 99.4028091430664}


In [22]:
step % gradient_accumulation_steps == 0

False

In [23]:
tokenizer.save_pretrained(output_dir)

('Lab9\\tokenizer_config.json',
 'Lab9\\special_tokens_map.json',
 'Lab9\\vocab.json',
 'Lab9\\merges.txt',
 'Lab9\\added_tokens.json')

In [24]:
model.train()
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)

In [25]:
repo.push_to_hub(commit_message=f"Training in progress step {step}", blocking=False)

('https://huggingface.co/Anh1008AIT/Lab9/commit/e979ab8669c07635d7565210ee9d1b348d6a8afc',
 [push command, status code: running, in progress. PID: 25128])

In [26]:
import torch
from transformers import pipeline
checkpoints = "Lab9"
pipe = pipeline("text-generation", max_length=30, pad_token_id=0, model=checkpoints) 

In [27]:
txt = "You are"
pipe(txt)[0]["generated_text"]



'You areHunter she flavour bird largely StreetsBack slicing Location terribly terribly annexlehemresp Infinite pee Spa volumes dinosaurs Supports Detaililles Drag uselessions sd overlooking Citizenship'

In [28]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained(checkpoints, pad_token_id=tokenizer.eos_token_id)

In [29]:
input_ids = tokenizer.encode('It was', return_tensors='pt')

# 5 Test 

In [30]:
import torch
torch.manual_seed(0)

<torch._C.Generator at 0x1f3efd4cbb0>

### Greedy

In [31]:
greedy_output = model.generate(input_ids, max_length=50)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
It was ​ ​ ​ ​ thief thief thief thief thief thief thief thief bird bird bird bird bird bird bird bird bird bird bird bird bird bird bird bird bird bird bird bird bird bird bird bird bird bird bird bird bird bird bird bird bird bird bird bird


### Beam search

In [32]:
beam_output = model.generate(
    input_ids,  
    max_length=50, 
    num_beams=2, 
    early_stopping=True
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(beam_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
It was Sev Sev


In [33]:
# set return_num_sequences > 1
beam_outputs = model.generate(
    input_ids, 
    max_length=50, 
    num_beams=5, 
    no_repeat_ngram_size=2, 
    num_return_sequences=5, 
    early_stopping=True
)

# now we have 3 output sequences
print("Output:\n" + 100 * '-')
for i, beam_output in enumerate(beam_outputs):
  print("{}: {}".format(i, tokenizer.decode(beam_output, skip_special_tokens=True)))

Output:
----------------------------------------------------------------------------------------------------
0: It was
1: It was Sev Sev
2: It was hiber hiber
3: It was Sev Sev ceiling ceiling
4: It was hiber hiberIASIAS


### Sampling

In [34]:
sample_output = model.generate(
    input_ids, 
    do_sample=True, 
    max_length=50, 
    top_k=0
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
It wasProtrational pilot Inherave Girls countrysidedigitalbackstree ceremon bipartisan Zionistgd Innov shameful unfortunately Cath condem wrestlerrehensaband regulatesPlusski decreases she soBO82 amput MTVdy sailfbirmingines brood Sanskrit breweries signedESSION carefully`. SexualNaturalyer cheesy


### Top-p (nucleus) sampling

In [35]:
# set seed to reproduce results. Feel free to change the seed though to get different results
torch.manual_seed(0)

# set top_k = 50 and set top_p = 0.95 and num_return_sequences = 3
sample_outputs = model.generate(
    input_ids,
    do_sample=True, 
    max_length=50, 
    top_k=50, 
    top_p=0.95, 
    num_return_sequences=3
)

print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(sample_outputs):
  print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

Output:
----------------------------------------------------------------------------------------------------
0: It was ceilingdoctoral Dism896 Peck shock acts actsaways Ald¨ valley convenient act Jose bomber IOC chemist Accordingly sprinkle DESaways unrem barley Hunter Fold defendant Smooth realmsysc Iraqirypt
1: It was
2: It wasFriendsAL dens shock remote NPCs Cy optimized jurisdictionsho Zer Hale JPMorganJECT 1977Moscowology
