# AlbertForMaskedLM tutorial

## 01. AlbertConfig

In [1]:
from transformers import AlbertConfig, AlbertForMaskedLM, BertTokenizer

In [2]:
# Initializing an ALBERT-base style configuration.
# Only the three size settings below are overridden; every other field is left
# at whatever AlbertConfig defaults to in the installed transformers version
# (this includes the vocabulary size, which is not set here).
albert_base_configuration = AlbertConfig(
    hidden_size=768,
    num_attention_heads=12,
    intermediate_size=3072,
)

## 02. Dataset and Tokenizer

In [3]:
import dill

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split

In [4]:
class SentDataset(Dataset):
    """Map-style dataset exposing a dill-serialized corpus by integer index.

    The whole corpus is read into memory once, at construction time, and
    kept on ``self.corpus``.
    """

    def __init__(self, path, prefix="train", is_sample=False):
        """Load the corpus stored at ``path``.

        Args:
            path: Location of a file readable by ``dill.load``.
            prefix: Split label; accepted but not used by this class.
            is_sample: When true, keep only the first 1000 entries.

        Note:
            ``dill.load`` can execute arbitrary code while unpickling, so
            ``path`` should only ever point at a trusted file.
        """
        with open(path, 'rb') as corpus_file:
            loaded = dill.load(corpus_file)

        # Sampling mode trims the corpus to its first 1000 entries.
        self.corpus = loaded[:1000] if is_sample else loaded

    def __len__(self):
        """Return how many entries the corpus holds."""
        return len(self.corpus)

    def __getitem__(self, idx):
        """Return the corpus entry stored at position ``idx``."""
        return self.corpus[idx]

In [5]:
# Dataset: load the pickled corpus in full (is_sample=False means the
# 1000-entry truncation in SentDataset is not applied).
corpus_path = '../data/corpus/kowiki_corpus.pkl'
dataset = SentDataset(corpus_path, prefix='train', is_sample=False)

# Tokenizer: a BertTokenizer built from a local vocab file, with max_len=128.
# NOTE(review): this tokenizer is paired with an ALBERT model whose config
# does not set a vocabulary size — confirm that the number of entries in
# vocab.txt does not exceed the model's embedding table.
tokenizer = BertTokenizer(vocab_file='../data/tokenizers/vocab.txt', max_len=128)

## 03. AlbertForMaskedLM  

In [6]:
# Build the masked-LM model directly from the configuration object; no
# pretrained checkpoint is loaded by this call.
model = AlbertForMaskedLM(albert_base_configuration)

In [7]:
# A single Korean sample sentence used to exercise the tokenizer and model.
tmp_text = "지미 카터는 조지아주 섬터 카운티 플레인스 마을에서 태어났다."

# Convert the sentence to a flat list of token ids, asking the tokenizer to
# include its special tokens, then print the ids for inspection.
encoded = tokenizer.encode(tmp_text, add_special_tokens=True)
print(encoded)

[2, 18026, 28166, 1216, 7395, 1282, 2525, 1336, 16691, 2872, 1023, 1242, 16540, 3309, 18, 3]


In [10]:
# Turn the flat id list into a tensor and add a leading dimension of size 1,
# so the single sentence is passed to the model as a batch of one.
input_ids = torch.tensor(encoded).unsqueeze(0)

In [13]:
# Forward pass with labels supplied, so the outputs include a loss.
# The labels are the input ids themselves (no token is replaced by a mask
# here), which is enough to exercise the loss path in this tutorial.
# NOTE(review): later transformers releases renamed `masked_lm_labels` to
# `labels` — confirm against the installed version before upgrading.
outputs = model(input_ids, masked_lm_labels=input_ids)

In [15]:
# Unpack the first two outputs as the loss and the prediction scores
# (this ordering applies when labels were passed to the forward call).
loss, prediction_scores = outputs[:2]

In [19]:
# Back-propagate the loss to populate gradients on the model parameters.
loss.backward()

In [20]:
from fairseq.optim.adafactor import Adafactor

In [21]:
# Create fairseq's Adafactor over all model parameters; no hyper-parameters
# are overridden, so the optimizer's own defaults apply.
optimizer = Adafactor(model.parameters())

In [22]:
# Apply a single parameter update using the gradients from loss.backward().
# NOTE: if this is extended into a training loop, gradients must be cleared
# (optimizer.zero_grad()) between iterations.
optimizer.step()