In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m82.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m83.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [None]:
import time, os, datetime, random, re
import torch
import numpy as np
import gc
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset, random_split
from tokenizers.models import BPE
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.normalizers import NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer
from pathlib import Path
from transformers import GPT2Config, get_linear_schedule_with_warmup,  GPT2LMHeadModel, AdamW, GPT2Tokenizer, TextDataset
from torch.cuda.amp import autocast, GradScaler
import torch.nn as nn
from sklearn.model_selection import train_test_split

# Number of passes over the training data; read by train() for logging and by
# the scheduler setup to size the learning-rate schedule.
epochs = 8

# Per-epoch metric records, appended to by train() and validating() respectively.
training_stats, valid_stats = [], []

# Lowest validation loss seen so far; +inf guarantees the first epoch counts
# as an improvement.
best_valid_loss = float("inf")

In [None]:
def __len__(DataLoader):
    """Return ``len()`` of the object passed in.

    This is a plain module-level function, not a method, despite the dunder
    name. The parameter name shadows the imported ``DataLoader`` class inside
    the function body.

    NOTE(review): nothing in the visible notebook calls this helper; confirm
    whether it is still needed.
    """
    return len(DataLoader)


def format_time(elapsed):
    """Render a duration given in seconds as a human-readable string.

    The value is rounded to the nearest whole second and formatted via
    ``datetime.timedelta``, e.g. ``3661.4`` -> ``'1:01:01'`` and
    ``90000`` -> ``'1 day, 1:00:00'``.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)


def train(model, dataloader, optimizer, epoch):
    """Run one mixed-precision training epoch and record its mean loss.

    Relies on notebook-level globals defined in other cells: ``criterion``,
    ``scaler``, ``scheduler``, ``epochs`` and ``training_stats``.

    Args:
        model: called as ``model(input_ids=x)``; element ``[0]`` of the result
            is used as the logits.
        dataloader: yields batches indexable as ``batch[:, :-1]``
            (assumed to be (batch, seq_len) token ids -- confirm against the
            dataset cell). Must be non-empty, since the mean divides by
            ``len(dataloader)``.
        optimizer: optimizer stepped through the global ``scaler``.
        epoch: zero-based epoch index, used only for logging.

    Returns:
        The global ``training_stats`` list with one ``{'Train Loss': ...}``
        record appended for this epoch.
    """
    total_t0 = time.time()

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))
    print('Training...')

    train_total_loss = 0

    model.train()

    for step, batch in enumerate(dataloader):

        if step % 40 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(dataloader)))

        # Next-token objective: inputs drop the last token, targets drop the first.
        x = batch[:, :-1].cuda()
        y = batch[:, 1:].cuda()

        optimizer.zero_grad()

        with autocast():
            logits = model(input_ids=x)[0]
            loss = criterion(logits.flatten(0, 1), y.flatten(0))

        train_total_loss += loss.item()

        scaler.scale(loss).backward()

        # Gradients are still multiplied by the loss scale here; unscale them
        # first so the 1.0 clipping threshold applies to the true gradient norm.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # scaler.step() knows unscale_ was already called for this optimizer
        # and skips the update if any gradient is inf/NaN.
        scaler.step(optimizer)
        scaler.update()

        scheduler.step()

    avg_train_loss = train_total_loss / len(dataloader)

    training_stats.append(
        {
            'Train Loss': avg_train_loss
        }
    )

    training_time = format_time(time.time() - total_t0)

    print("")
    print("summary results")
    print("epoch | trn loss | trn time ")
    print(f"{epoch+1:5d} | {avg_train_loss:.5f} | {training_time:}")

    return training_stats



def validating(model, dataloader, epoch):
    """Evaluate the model on a validation loader and record loss and perplexity.

    Relies on the notebook-level ``criterion`` and ``valid_stats`` globals and
    on ``format_time``. As a side effect the mean loss is also published as the
    module-level global ``avg_val_loss``.

    Args:
        model: called as ``model(input_ids=x)``; element ``[0]`` of the result
            is used as the logits.
        dataloader: yields batches indexable as ``batch[:, :-1]``.
        epoch: zero-based epoch index, used only for logging.

    Returns:
        The global ``valid_stats`` list with one
        ``{'Val Loss': ..., 'Val PPL.': ...}`` record appended.
    """
    global avg_val_loss

    started_at = time.time()

    print("")
    print("Running Validation...")

    model.eval()

    loss_sum = 0
    for batch in dataloader:
        # Same shifted input/target construction as in training.
        inputs = batch[:, :-1].cuda()
        targets = batch[:, 1:].cuda()

        with torch.no_grad():
            batch_logits = model(input_ids=inputs)[0]
            batch_loss = criterion(batch_logits.flatten(0, 1), targets.flatten(0))
            loss_sum += batch_loss.item()

    avg_val_loss = loss_sum / len(dataloader)
    perplexity = np.exp(avg_val_loss)

    record = {'Val Loss': avg_val_loss, 'Val PPL.': perplexity}
    valid_stats.append(record)

    elapsed = format_time(time.time() - started_at)

    print("")
    print("summary results")
    print("epoch | val loss | val ppl | val time")
    print(f"{epoch+1:5d} | {avg_val_loss:.5f} | {perplexity:.3f} | {elapsed:}")

    return valid_stats

In [None]:

# Prefer the first CUDA device when one is visible, otherwise fall back to CPU.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"
dev

'cuda:0'

In [None]:
# Only the tokenizer (vocabulary + merges) is taken from distilgpt2.
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')

tokenizer.add_special_tokens({'bos_token': '<bos>',
                              'eos_token': '<eos>',
                              'pad_token': '<pad>'})

# len(tokenizer) includes the special tokens added above, whereas
# tokenizer.vocab_size reports only the base vocabulary. Sizing the embedding
# table from vocab_size would leave the bos/eos/pad ids out of range.
config = GPT2Config(
        vocab_size=len(tokenizer),
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id
    )
torch.cuda.set_device(0)
# Built from the config alone, so the weights start from random initialisation.
model = GPT2LMHeadModel(config).cuda(0)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

In [None]:
# block_size is 1025 rather than 1024 because train()/validating() slice one
# token off each example to build shifted input/target pairs
# (batch[:, :-1] and batch[:, 1:]), leaving 1024 positions per sequence.
all_set = TextDataset(
    tokenizer=tokenizer,
    file_path='./train.txt',
    block_size=1025,
)

trset




In [None]:
# Fixed seed so the train/validation partition is the same after every kernel
# restart; otherwise validation losses (and the "best" checkpoint) are not
# comparable between runs.
SPLIT_SEED = 42
BATCH_SIZE = 4

train_set, valid_set = train_test_split(all_set,
                                        test_size=0.15,
                                        random_state=SPLIT_SEED)

# Training batches are re-shuffled every epoch; a sequential sampler would
# replay the examples in the same order each time.
train_dataloader = DataLoader(dataset=train_set,
                              sampler=RandomSampler(train_set),
                              batch_size=BATCH_SIZE,
                              drop_last=True)

# Validation keeps a deterministic order; drop_last keeps every batch the same
# size so the per-batch mean loss is weighted evenly.
valid_dataloader = DataLoader(dataset=valid_set,
                              sampler=SequentialSampler(valid_set),
                              batch_size=BATCH_SIZE,
                              drop_last=True)

trloader
validloader


In [None]:
# Loss scaler used by train() for mixed-precision backward passes.
scaler = GradScaler()

# Token-level cross entropy; train()/validating() flatten logits and targets
# before calling it.
criterion = nn.CrossEntropyLoss()

# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# eps and weight_decay are set explicitly to the values the transformers
# class used by default, so the optimisation settings stay the same.
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=2e-5,
                              eps=1e-6,
                              weight_decay=0.0)

# One scheduler step is taken per batch in train(), so the schedule spans
# every batch of every epoch.
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)



In [None]:
# Drop unreachable Python objects first so their CUDA blocks can actually be
# returned when the cache is emptied.
gc.collect()
torch.cuda.empty_cache()

# The value must be a bare integer; the previous "<512>" (with angle brackets)
# is not a valid setting.
# NOTE(review): the CUDA allocator presumably reads this variable when CUDA is
# first initialised, which has already happened by this cell -- consider
# moving the assignment to the very first cell so it takes effect.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

for epoch in range(epochs):
  train(model, train_dataloader, optimizer, epoch)

  # Read the record that validating() just appended. Indexing valid_stats by
  # `epoch` would pick up stale entries if this cell is re-run, because the
  # list keeps growing across runs.
  epoch_val_loss = validating(model, valid_dataloader, epoch)[-1]['Val Loss']

  # Checkpoint only when validation loss improves on the best seen so far.
  if epoch_val_loss < best_valid_loss:
    best_valid_loss = epoch_val_loss
    torch.save(model.state_dict(), 'gpt2-model1.pt')
    # Unwrap DataParallel-style containers before saving in HF format.
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained('./model_save/gpt2/')
    tokenizer.save_pretrained('./model_save/gpt2/')


startuem

Training...
  Batch    40  of  25,762.
  Batch    80  of  25,762.
  Batch   120  of  25,762.
  Batch   160  of  25,762.
  Batch   200  of  25,762.
  Batch   240  of  25,762.
  Batch   280  of  25,762.
  Batch   320  of  25,762.
  Batch   360  of  25,762.
  Batch   400  of  25,762.
  Batch   440  of  25,762.
  Batch   480  of  25,762.
  Batch   520  of  25,762.
  Batch   560  of  25,762.
  Batch   600  of  25,762.
  Batch   640  of  25,762.
  Batch   680  of  25,762.
  Batch   720  of  25,762.
  Batch   760  of  25,762.
  Batch   800  of  25,762.
  Batch   840  of  25,762.
  Batch   880  of  25,762.
  Batch   920  of  25,762.
  Batch   960  of  25,762.
  Batch 1,000  of  25,762.
  Batch 1,040  of  25,762.
  Batch 1,080  of  25,762.
  Batch 1,120  of  25,762.
  Batch 1,160  of  25,762.
  Batch 1,200  of  25,762.
  Batch 1,240  of  25,762.
  Batch 1,280  of  25,762.
  Batch 1,320  of  25,762.
  Batch 1,360  of  25,762.
  Batch 1,400  of  25,762.
  Batch 1,440  of  25,762.
  Batc

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
if __name__ == "__main__":
    # set_seed(42)
    # generator = pipeline('text-generation', model="facebook/opt-1.3b", do_sample=True, device=0)

    # Upper bound on newly generated tokens. Without it generate() falls back
    # to a total max_length of 20 tokens, which the prompt (it embeds the whole
    # file) already exceeds, so nothing useful would be produced.
    MAX_NEW_TOKENS = 256

    # Context manager so the file handle is closed deterministically.
    with open('test.txt', 'r') as prompt_file:
        text = prompt_file.read()

    # NOTE: this rebinds the notebook-level `model` and `tokenizer` names that
    # the training cells above also use.
    model = AutoModelForCausalLM.from_pretrained("facebook/opt-6.7b", torch_dtype=torch.float16).cuda()

    # the fast tokenizer currently does not work correctly
    tokenizer = AutoTokenizer.from_pretrained("facebook/opt-6.7b", use_fast=False)

    prompt = 'Generate bigger .xes file: ' + text

    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.cuda()

    generated_ids = model.generate(input_ids, max_new_tokens=MAX_NEW_TOKENS)

    print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))

Downloading (…)lve/main/config.json:   0%|          | 0.00/651 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/41.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/3.36G [00:00<?, ?B/s]