# ***Pretrained model***

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m30.1 MB/s[0m eta [36m0:00:0

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import sys
sys.path.append('drive/MyDrive/mlcs/minGPT/')
from model_min import GPT
from utils import set_seed
from bpe import BPETokenizer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
use_mingpt = True
model_type = 'gpt2'

In [None]:
if use_mingpt:
  model = GPT.from_pretrained(model_type)
else:
  model = GPT2LMHeadModel.from_pretrained(model_type)
  model.config.pad_token_id = model.config.eos_token_id

model.to(device)
model.eval();

number of parameters:124.44M


In [None]:
def generate(prompt='',num_samples=10, steps=20, do_sample=True):
  if use_mingpt:
    tokenizer = BPETokenizer()  # init tokenizer string -> integer
    if prompt == '':
      x=torch.tensor([[tokenizer.encoder.encoder['<|endoftext|>']]], dtype=torch.long)
    else:
      x=tokenizer(prompt).to(device)  # input prompt to tokenizer
      #print(x.shape, x[0])  # (1,3)

  else:
    tokenizer = GPT2Tokenizer.from_pretrained(model_type)
    if prompt=='':
      prompt='<|endoftext|>'
    encoded_input = tokenizer(prompt, return_tensors='pt').to(device)
    x=encoded_input['input_ids']

  x=x.expand(num_samples, -1)  # (10,3)
  y=model.generate(x,max_new_tokens=steps,do_sample=do_sample,top_k=40)  # (10,23)
  for i in range(num_samples):
    out = tokenizer.decode(y[i].cpu().squeeze())
    print('-'*80)
    print(out)


In [None]:
generate(prompt='dogs, the', num_samples=10, steps=20)

torch.Size([1, 3]) tensor([22242,    11,   262], device='cuda:0')
--------------------------------------------------------------------------------
dogs, the best people to be a part of it!


So do you want to join us for our
--------------------------------------------------------------------------------
dogs, the world's first self-made car company, and he's also the only white dude I've ever
--------------------------------------------------------------------------------
dogs, the man has a knack for doing things he didn't even know she wanted. "He could pull you
--------------------------------------------------------------------------------
dogs, the U.K.'s biggest clubs, will begin to prepare for the World Cup.

The
--------------------------------------------------------------------------------
dogs, the two men who came to the city to meet each other were charged with burglary in connection with the burglary
-----------------------------------------------------------------------

# ***sorting model***

In [None]:
# from demo
set_seed(3407)

In [None]:
import pickle

class SortDataset(Dataset):
  def __init__(self, split, length=6, num_digits=3):
    assert split in {'train','test'}
    self.split = split
    self.length = length
    self.num_digits = num_digits

  def __len__(self):
    return 10000

  def get_vocab_size(self):
    return self.num_digits

  def get_block_size(self):
    return self.length*2-1  # 11

  def __getitem__(self, idx):
    while True:
      # random (6,) shape int 0,1,2
      inp = torch.randint(self.num_digits, size=(self.length,), dtype=torch.long)
      if torch.rand(1).item() < 0.5:
        if inp.unique().nelement() > self.length //2:
          continue  # resample for training harder with duplicating the unique number

      h=hash(pickle.dumps(inp.tolist()))  # for split train, test wt bite sys assign
      inp_split = 'test' if h%4 == 0 else 'train'
      if inp_split == self.split:
        break

    sol = torch.sort(inp)[0]

    cat=torch.cat((inp,sol), dim=0)  # 12

    x = cat[:-1].clone()  # 11 last drop
    y = cat[1:].clone()  # 11 first drop
    y[:self.length-1] = -1  # until 5, set -1
    return x, y



In [None]:
train_dataset = SortDataset('train')
test_dataset = SortDataset('test')
x, y = train_dataset[3]
for a,b in zip(x,y):
  print(int(a),int(b))

1 -1
1 -1
0 -1
2 -1
0 -1
1 0
0 0
0 1
1 1
1 1
1 2


AttributeError: ignored

In [None]:
demo_model_config = GPT.get_default_config()
demo_model_config.model_type = 'gpt-nano'
demo_model_config.vocab_size = train_dataset.get_vocab_size()
demo_model_config.block_size = train_dataset.get_block_size()
#demo_model_config.n_layer = None
#demo_model_config.n_head = None
#demo_model_config.n_embd = None
demo_model=GPT(demo_model_config)

number of parameters:0.09M


In [None]:
demo_train_config = Trainer.get_default_config()
demo_train_config.learning_rate = 5e-4
demo_train_config.max_iters = 2000
demo_train_config.num_workers = 0
trainer = Trainer(demo_train_config, model, train_dataset)

running on device cuda


In [None]:
def batch_end_callback(trainer):
  if trainer.iter_num % 100 ==  0:
    print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")

trainer.set_callback('on_batch_end', batch_end_callback)

trainer.run()


iter_dt 0.00ms; iter 0: train loss 9.09891
iter_dt 15.60ms; iter 100: train loss 3.58220
iter_dt 20.09ms; iter 200: train loss 0.72822
iter_dt 21.70ms; iter 300: train loss 0.17687
iter_dt 48.62ms; iter 400: train loss 0.05302
iter_dt 38.45ms; iter 500: train loss 0.05947
iter_dt 14.21ms; iter 600: train loss 0.08881
iter_dt 22.13ms; iter 700: train loss 0.02463
iter_dt 20.45ms; iter 800: train loss 0.04041
iter_dt 13.92ms; iter 900: train loss 0.03132
iter_dt 14.80ms; iter 1000: train loss 0.03327
iter_dt 13.56ms; iter 1100: train loss 0.00864
iter_dt 13.80ms; iter 1200: train loss 0.00175
iter_dt 13.94ms; iter 1300: train loss 0.00237
iter_dt 14.40ms; iter 1400: train loss 0.00994
iter_dt 14.18ms; iter 1500: train loss 0.02308
iter_dt 19.79ms; iter 1600: train loss 0.01035
iter_dt 20.12ms; iter 1700: train loss 0.00132
iter_dt 17.02ms; iter 1800: train loss 0.01138
iter_dt 13.76ms; iter 1900: train loss 0.01676


In [None]:
model.eval()

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(7852, 48)
    (wpe): Embedding(100, 48)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-2): 3 x Block(
        (ln_1): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
        (attn): CasualSelfAttention(
          (c_attn): Linear(in_features=48, out_features=144, bias=True)
          (c_proj): Linear(in_features=48, out_features=48, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
        (mlp): ModuleDict(
          (c_fc): Linear(in_features=48, out_features=192, bias=True)
          (c_proj): Linear(in_features=192, out_features=48, bias=True)
          (act): NewGELU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=48, o

In [None]:
def eval_split(trainer, split, max_batches):
  dataset = {'train':train_dataset, 'test':test_dataset}[split]
  n=train_dataset.length  # 6
  results=[]
  mistakes_printed_already=0
  loader = DataLoader(dataset, batch_size=100, num_workers=4, drop_last=False)
  for b, (x,y) in enumerate(loader):
    x=x.to(trainer.device)  # (b_s, 11)
    y=y.to(trainer.device)  # (b_s, 11)
    inp = x[:,:n]  # choose first 6 (b_s, 6)
    sol = y[:, -n:]  # first 5 drop:-1 (b_s, 6)
    cat = model.generate(inp, n, do_sample=False)  # greedy argmax (b_s,12)
    #print(inp.shape, sol.shape, cat.shape)
    sol_candidate=cat[:,n:] # (b_s,6)
    correct= (sol==sol_candidate).all(1).cpu()
    for i in range(x.size(0)):
      results.append(int(correct[i]))
      if not correct[i] and mistakes_printed_already < 3:
        mistakes_printed_already += 1
        print(f'gpt claims {inp[i].tolist()} sorted is {sol_candidate[i].tolist()} but gt is {sol[i].tolist()} ')

    if max_batches is not None and b+1>max_batches:
      break

  rt = torch.tensor(results, dtype=torch.float)
  print(f"{split} final score: {rt.sum()} / {len(results)}={100*rt.mean()}% correct")
  return rt.sum()

with torch.no_grad():
  train_score = eval_split(trainer, 'train', max_batches=50)
  test_score = eval_split(trainer, 'test', max_batches=50)



train final score: 5100.0 / 5100=100.0% correct
test final score: 5100.0 / 5100=100.0% correct


In [None]:
n=train_dataset.length
inp = torch.tensor([[0,0,2,1,0,1]], dtype=torch.long).to(trainer.device)
assert inp[0].nelement()==n
with torch.no_grad():
  cat = model.generate(inp, n, do_sample=False)
sol = torch.sort(inp[0])[0]
sol_candidate=cat[:,n:]
print(f'input:{inp.tolist()}')
print(f'predicted sort:{sol_candidate.tolist()}')
print(f'gt:{sol.tolist()}')


input:[[0, 0, 2, 1, 0, 1]]
predicted sort:[[0, 0, 0, 1, 1, 2]]
gt:[0, 0, 0, 1, 1, 2]


# ***Custom data***

In [1]:
en_text = "A Dog Run back corner near spare bedrooms"

In [2]:
import spacy
spacy_en = spacy.load('en_core_web_sm')

In [3]:
def tokenize(en_text):
    return [tok.text for tok in spacy_en.tokenizer(en_text)]
print(tokenize(en_text))

['A', 'Dog', 'Run', 'back', 'corner', 'near', 'spare', 'bedrooms']


In [4]:
!pip install nltk



In [5]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
from nltk.tokenize import word_tokenize
print(word_tokenize(en_text))

['A', 'Dog', 'Run', 'back', 'corner', 'near', 'spare', 'bedrooms']


In [7]:
!pip install konlpy
!pip install mecab-python
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m43.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting JPype1>=0.7.0 (from konlpy)
  Downloading JPype1-1.4.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (465 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.3/465.3 kB[0m [31m42.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.4.1 konlpy-0.6.0
Collecting mecab-python
  Downloading mecab-python-1.0.0.tar.gz (1.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mecab-python3 (from mecab-python)
  Downloading mecab_python3-1.0.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (581 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m581.6/581.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: mecab-p

In [8]:
kor_text = "사과의 놀라운 효능이라는 글을 봤어. 그래서 오늘 사과를 먹으려고 했는데 사과가 썩어서 슈퍼에 가서 사과랑 오렌지 사왔어"

In [9]:
from konlpy.tag import Mecab
tokenizer = Mecab()
print(tokenizer.morphs(kor_text))

['사과', '의', '놀라운', '효능', '이', '라는', '글', '을', '봤', '어', '.', '그래서', '오늘', '사과', '를', '먹', '으려고', '했', '는데', '사과', '가', '썩', '어서', '슈퍼', '에', '가', '서', '사과', '랑', '오렌지', '사', '왔', '어']


In [10]:
import urllib.request
import pandas as pd
from konlpy.tag import Mecab
from nltk import FreqDist
import numpy as np
import matplotlib.pyplot as plt

In [11]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt", filename="ratings.txt")
data = pd.read_table('ratings.txt') # 데이터프레임에 저장
print(len(data))

200000


In [12]:
sample_data = data[:100] # 임의로 100개만 저장

In [13]:
sample_data['document'] = sample_data['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")
# 한글과 공백을 제외하고 모두 제거-숫자나 ? 같은 표현

  sample_data['document'] = sample_data['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_data['document'] = sample_data['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")


In [18]:
print(len(sample_data))
sample_data[:10]

100


Unnamed: 0,id,document,label
0,8112052,어릴때보고 지금다시봐도 재밌어요ㅋㅋ,1
1,8132799,디자인을 배우는 학생으로 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산업...,1
2,4655635,폴리스스토리 시리즈는 부터 뉴까지 버릴께 하나도 없음 최고,1
3,9251303,와 연기가 진짜 개쩔구나 지루할거라고 생각했는데 몰입해서 봤다 그래 이런게 진짜 영화지,1
4,10067386,안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화,1
5,2190435,사랑을 해본사람이라면 처음부터 끝까지 웃을수 있는영화,1
6,9279041,완전 감동입니다 다시봐도 감동,1
7,7865729,개들의 전쟁 나오나요 나오면 빠로 보고 싶음,1
8,7477618,굿,1
9,9250537,바보가 아니라 병 쉰 인듯,1


In [14]:
stopwords=['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

In [15]:
tokenizer = Mecab()
tokenized=[]
for sentence in sample_data['document']:
    temp = tokenizer.morphs(sentence) # 토큰화
    temp = [word for word in temp if not word in stopwords] # 불용어 제거
    tokenized.append(temp)

In [19]:
print(tokenized[:10])

[['어릴', '때', '보', '고', '지금', '다시', '봐도', '재밌', '어요', 'ㅋㅋ'], ['디자인', '을', '배우', '학생', '외국', '디자이너', '그', '일군', '전통', '을', '통해', '발전', '해', '문화', '산업', '부러웠', '는데', '사실', '우리', '나라', '에서', '그', '어려운', '시절', '끝', '까지', '열정', '을', '지킨', '노라노', '같', '전통', '있', '어', '저', '같', '사람', '꿈', '을', '꾸', '고', '이뤄나갈', '수', '있', '다는', '것', '감사', '합니다'], ['폴리스', '스토리', '시리즈', '부터', '뉴', '까지', '버릴', '께', '하나', '없', '음', '최고'], ['연기', '진짜', '개', '쩔', '구나', '지루', '할거', '라고', '생각', '했', '는데', '몰입', '해서', '봤', '다', '그래', '이런', '게', '진짜', '영화', '지'], ['안개', '자욱', '밤하늘', '떠', '있', '초승달', '같', '영화'], ['사랑', '을', '해', '본', '사람', '라면', '처음', '부터', '끝', '까지', '웃', '을', '수', '있', '영화'], ['완전', '감동', '입니다', '다시', '봐도', '감동'], ['개', '전쟁', '나오', '나요', '나오', '면', '빠', '로', '보', '고', '싶', '음'], ['굿'], ['바보', '아니', '라', '병', '쉰', '인', '듯']]


In [20]:
vocab = FreqDist(np.hstack(tokenized))
print(len(vocab)) # dict key: word , value: frequency

664


In [21]:
vocab = vocab.most_common(500)  # 등장 빈도수가 높은 top 50 개 단어
print(len(vocab))

500


In [22]:
word_to_index = {word[0]:index+2 for index, word in enumerate(vocab)}
word_to_index['pad']=1
word_to_index['unk']=0

In [25]:
encoded = []
for line in tokenized:
  temp=[]
  for w in line:
    try:
      temp.append(word_to_index[w])
    except KeyError:
      temp.append(word_to_index['unk'])
  encoded.append(temp)
print(encoded[:10])
print(len(encoded))

# 결과적으로 100줄의 문장들과 200,000문장 중
# 크기 502 인 vocab 을 얻음

[[79, 27, 9, 4, 50, 42, 80, 16, 28, 29], [188, 5, 81, 189, 190, 191, 43, 192, 113, 5, 193, 194, 24, 114, 195, 196, 13, 51, 82, 115, 30, 43, 197, 116, 117, 31, 198, 5, 199, 200, 17, 113, 7, 68, 52, 17, 44, 201, 5, 202, 4, 203, 14, 7, 83, 32, 204, 84], [205, 118, 206, 53, 207, 31, 208, 209, 54, 10, 25, 11], [45, 33, 119, 210, 211, 212, 213, 69, 46, 34, 13, 214, 120, 15, 2, 215, 70, 8, 33, 3, 35], [216, 217, 218, 219, 7, 220, 17, 3], [121, 5, 24, 36, 44, 122, 123, 53, 117, 31, 85, 5, 14, 7, 3], [124, 37, 221, 42, 80, 37], [119, 222, 55, 223, 55, 86, 224, 38, 9, 4, 47, 25], [56], [225, 87, 88, 226, 227, 57, 89]]
100


In [26]:
max_len= max(len(l) for l in encoded)
print(max_len)

62


In [27]:
for line in encoded:  # 62 길이로 통일시킴
  if len(line)<max_len:
    line += [word_to_index['pad']]*(max_len-len(line))

In [28]:
print(encoded[:10])

[[79, 27, 9, 4, 50, 42, 80, 16, 28, 29, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [188, 5, 81, 189, 190, 191, 43, 192, 113, 5, 193, 194, 24, 114, 195, 196, 13, 51, 82, 115, 30, 43, 197, 116, 117, 31, 198, 5, 199, 200, 17, 113, 7, 68, 52, 17, 44, 201, 5, 202, 4, 203, 14, 7, 83, 32, 204, 84, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [205, 118, 206, 53, 207, 31, 208, 209, 54, 10, 25, 11, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [45, 33, 119, 210, 211, 212, 213, 69, 46, 34, 13, 214, 120, 15, 2, 215, 70, 8, 33, 3, 35, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [216, 217, 218, 219, 7, 220, 17, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [29]:
!pip install torchtext==0.6.0

Collecting torchtext==0.6.0
  Downloading torchtext-0.6.0-py3-none-any.whl (64 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/64.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from torchtext==0.6.0)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentencepiece, torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.15.2
    Uninstalling torchtext-0.15.2:
      Successfully uninstalled torchtext-0.15.2
Successfully installed sentencepiece-0.1.99 torchtext-0.6.0


In [30]:
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [39]:
import spacy
import sys
from torchtext.data import Field, BucketIterator, TabularDataset
from torchtext import data
from torchtext.datasets import Multi30k
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader

sys.path.append('drive/MyDrive/mlcs/minGPT/')
from trainer_min import Trainer
from utils import CfgNode, set_seed
from model_min import GPT


In [68]:
RANDOM_STATE = 24
BATCH_SIZE = 32

In [40]:
import urllib

url = 'https://storage.googleapis.com/download.tensorflow.org/data/bbc-text.csv'
urllib.request.urlretrieve(url, 'bbc-text.csv')

('bbc-text.csv', <http.client.HTTPMessage at 0x78f4dd89f0d0>)

In [41]:
import pandas as pd
df = pd.read_csv('bbc-text.csv')
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [44]:
from torchtext.data.utils import get_tokenizer
from nltk.tokenize import word_tokenize

tokenizer = get_tokenizer('basic_english',language='en')

In [33]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [46]:
TEXT = data.Field(sequential=True,
                  tokenize=word_tokenize,
                  fix_length=120,
                  lower=True,
                  batch_first = True
                  )
LABEL = data.Field(sequential=False)

In [47]:
fields = {
    'text':('text',TEXT),
    'category':('label',LABEL)
}

In [49]:
dataset = TabularDataset(path='bbc-text.csv',format='CSV',fields=fields,skip_header=False)

In [61]:
import random
train_data, test_data = dataset.split(split_ratio=0.8,stratified=True,strata_field='label',random_state=random.seed(RANDOM_STATE))
print(len(train_data),len(test_data))


1781 444


In [62]:
TEXT.build_vocab(train_data, max_size=1000, min_freq=5,vectors='glove.6B.100d')

.vector_cache/glove.6B.zip: 862MB [02:39, 5.39MB/s]                           
100%|█████████▉| 399999/400000 [00:21<00:00, 18998.93it/s]


In [64]:
NUM_VOCABS = len(TEXT.vocab.stoi)
NUM_VOCABS

1002

In [77]:
#TEXT.vocab.stoi

In [69]:
import torch
train_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, test_data),
    sort=False,
    repeat=False,
    batch_size=BATCH_SIZE,  # 32
    device=device
)

In [74]:
print(len(train_iterator),len(test_iterator))

56 14


In [76]:
sample_data = next(iter(train_iterator))
sample_data.text

AttributeError: ignored

In [None]:
# HAVE TO DEFINE
MAX_ITERS = 100

N_LAYER = None
N_HEAD = None
N_EMBD = None
VOCAB_SIZE = 7852
BLOCK_SIZE = 100

# ALREADY DEFINED
NUM_WORKERS = 2
BATCH_SIZE = 64
LR = 3e-4
BETAS = (0.9, 0.95)
WEIGHT_DECAY = 0.1
GRAD_NORM_CLIP = 1.0

MODEL_TYPE = 'gpt-nano'
EMBD_PDROP = 0.1
RESI_PDROP = 0.1
ATTN_PDROP = 0.1


In [None]:
trainer_config = Trainer.get_default_config()
model_config = GPT.get_default_config()

trainer_config.max_iters = MAX_ITERS
model_config.n_layer = N_LAYER
model_config.n_head = N_HEAD
model_config.n_embd = N_EMBD
model_config.vocab_size = VOCAB_SIZE
model_config.block_size = BLOCK_SIZE
model_config.model_type = MODEL_TYPE

def merge_cfg_nodes(cfg1, cfg2):
  cfg1_dict = cfg1.to_dict()
  cfg2_dict = cfg2.to_dict()
  merged_dict = {**cfg1_dict, **cfg2_dict}
  merged_cfg = CfgNode()
  merged_cfg.merge_from_dict(merged_dict)
  return merged_cfg

merged_cfg = merge_cfg_nodes(trainer_config, model_config)
print(merged_cfg)
dataset = train_dataset
model = GPT(merged_cfg)

In [None]:
from utils import set_seed
import pickle
set_seed(3407)

In [None]:
class SortTextDataset(Dataset):
  def __init__(self, split, vocab_size, idx):
    assert split in {'train', 'test'}
    self.split = split
    self.vocab_size = vocab_size
    self.idx = idx
    self.src_len = idx.index(3)+1

  def __len__(self):
    return 10000

  def get_vocab_size(self):
    return self.vocab_size

  def get_block_size(self):
    return self.src_len-1

  def __getitem__(self):
    sos_index = 0
    eos_index = self.idx.index(3)

    x = self.idx[:eos_index]
    y = self.idx[1:eos_index+1]

    x = torch.Tensor(x).clone()
    y = torch.Tensor(y).clone()
    return x,y

In [None]:
import numpy as np

def get_batch(iterator):
  batch_arr_list=[]
  for batch in (iterator):
    src = batch.src.cpu().clone()
    src_np = src.numpy()
    print(src_np.shape)
    minus_len = 50-src_np.shape[1]
    padding_list = [1]*minus_len
    batch_arr_list.append(src_np)
    batch_arr_list.append(padding_list)

  return np.concatenate(batch_arr_list, axis=0)

In [None]:
arr = get_batch(train_iterator)

In [None]:
train_set = SortTextDataset('train',len(SRC.vocab),get_batch(train_iterator))
x,y=train_set.__getitem__()
for a, b in zip(x,y):
    print(int(a),int(b))

In [None]:
trainer = Trainer(merged_cfg, model, train_set)

running on device cuda


In [None]:
def batch_end_callback(trainer):
  if trainer.iter_num%10==0:
    print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")
trainer.set_callback('on_batch_end', batch_end_callback)

trainer.run()



TypeError: ignored