In [33]:
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import torch
import math
from transformers import BertTokenizer, GPT2LMHeadModel
import numpy as np

In [34]:
class Chinese_Medical_DS(Dataset):
    def __init__(self, path, tokenizer, max_len=1024):
        self.path = path
        sentence = []
        with open(self.path, 'r', encoding='utf-8') as f:
            for line in f:
                sen_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(line.strip()))
                full_sen = []
                full_sen.append(tokenizer.convert_tokens_to_ids('[MASK]'))
                full_sen.extend(sen_ids)
                full_sen.append(tokenizer.convert_tokens_to_ids('[CLS]'))
                if len(full_sen) <= max_len:
                    sentence.append(full_sen)
        self.data = sentence
        
    # need to overload
    def __len__(self):
        return len(self.data)

    # need to overload
    def __getitem__(self, idx):
        input = self.data[idx]
        target = input
        return input, target

def calculate_ppl_dataset(test_dataloader, model):
    all_valid_ppl = 0.0
    device = model.device
    valid_pbar = tqdm(test_dataloader)
    valid_pbar.set_description('valid ppl')
    for step, (input, label) in enumerate(valid_pbar):
        with torch.no_grad():
            
            input_ids = torch.tensor(label).long().to(device).unsqueeze(0)
            label_ids = torch.tensor(input).long().to(device).unsqueeze(0)
            #  forward pass
            output = model(input_ids=input_ids, labels=label_ids)
            loss = output.loss
            # print('input_ids.shape[-1] = ', input_ids.shape[-1])
            ppl_step = torch.exp(loss)
            all_valid_ppl += ppl_step
            valid_pbar.set_postfix({'loss': '{:.7f}'.format(loss), 'ppl': '{:.7f}'.format(ppl_step)})

    print('test step = {}'.format(step))
    all_valid_ppl = all_valid_ppl / (step + 1)

    print('mean ppl = ', all_valid_ppl)
    
    return step, all_valid_ppl

# GPT2Chinese

In [35]:
model_path = "..\\Raw_GPT2\\"
tok_path = '..\\Raw_GPT2\\vocab.txt'
tokenizer = BertTokenizer(vocab_file=tok_path)
model = GPT2LMHeadModel.from_pretrained(model_path)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('using device:{}'.format(device))
model.eval()
model.to(device)

using device:cuda


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(21128, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [36]:
test_dataset = Chinese_Medical_DS("..\\Data\\tiny_test.txt", tokenizer)
test_dataloader = DataLoader(dataset=test_dataset, shuffle=False)
print("len(valid_dataloader) = {}".format(len(test_dataloader)))

len(valid_dataloader) = 1500


In [37]:
calculate_ppl_dataset(test_dataloader, model)

valid ppl: 100%|██████████| 1500/1500 [00:29<00:00, 50.91it/s, loss=2.9046230, ppl=18.2583599]

test step = 1499
mean ppl =  tensor(16.9775, device='cuda:0')





(1499, tensor(16.9775, device='cuda:0'))

# CFT GPT2Chinese

In [38]:
model_path = "..\\Tiny_CFT_GPT2\\model\\best_model\\"
tok_path = '..\\Raw_GPT2\\vocab.txt'
tokenizer = BertTokenizer(vocab_file=tok_path)
model = GPT2LMHeadModel.from_pretrained(model_path)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('using device:{}'.format(device))
model.eval()
model.to(device)

using device:cuda


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(21128, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [39]:
calculate_ppl_dataset(test_dataloader, model)

valid ppl: 100%|██████████| 1500/1500 [00:29<00:00, 50.40it/s, loss=1.9646586, ppl=7.1324778] 

test step = 1499
mean ppl =  tensor(7.9693, device='cuda:0')





(1499, tensor(7.9693, device='cuda:0'))

# RAW English GPT2

In [42]:
# model_path = "..\\Tiny_CFT_GPT2\\model\\best_model\\"
tok_path = '..\\gpt2\\vocab.json'
tokenizer = BertTokenizer(vocab_file=tok_path)
model = GPT2LMHeadModel.from_pretrained('..\\gpt2')
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('using device:{}'.format(device))
model.eval()
model.to(device)

using device:cuda


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [43]:
calculate_ppl_dataset(test_dataloader, model)

valid ppl: 100%|██████████| 1500/1500 [00:32<00:00, 46.17it/s, loss=9.4429989, ppl=12619.5048828]  


test step = 1499
mean ppl =  tensor(17880.8477, device='cuda:0')


(1499, tensor(17880.8477, device='cuda:0'))