In [1]:
import os

# Folder that the later cells write the tokenizer, config and weights into.
# Creating it here is safe to repeat: an already-existing folder is accepted.
out = 'xlnet-base-bahasa-standard-cased'
os.makedirs(name=out, exist_ok=True)

In [2]:
from transformers import XLNetTokenizer, XLNetModel, XLNetConfig, AutoTokenizer, AutoModelWithLMHead, pipeline

In [4]:
# Build an XLNet tokenizer from the local vocabulary model file, keeping the
# original casing (do_lower_case=False), then export it to the output folder.
# The destination reuses `out` from the first cell rather than repeating the
# folder name, so the path is defined in exactly one place.
tokenizer = XLNetTokenizer('sp10m.cased.v9.model', do_lower_case=False)
tokenizer.save_pretrained(out)

('xlnet-base-bahasa-standard-cased/spiece.model',
 'xlnet-base-bahasa-standard-cased/special_tokens_map.json',
 'xlnet-base-bahasa-standard-cased/added_tokens.json')

In [5]:
# Reload the tokenizer from the folder it was just exported to, again without
# lower-casing the input text.
tokenizer = XLNetTokenizer.from_pretrained(
    './xlnet-base-bahasa-standard-cased',
    do_lower_case=False,
)

In [6]:
# Convert the TensorFlow XLNet checkpoint (step 500000) into PyTorch weights,
# written to the same folder that already holds the exported tokenizer.
!transformers-cli convert \
  --model_type xlnet \
  --config xlnet-base/xlnet-base_config.json \
  --tf_checkpoint xlnet-base/model.ckpt-500000 \
  --pytorch_dump_output xlnet-base-bahasa-standard-cased

INFO:transformers.modeling_xlnet:Loading TF weight global_step with shape []
INFO:transformers.modeling_xlnet:Loading TF weight model/lm_loss/bias with shape [32000]
INFO:transformers.modeling_xlnet:Loading TF weight model/lm_loss/bias/adam_m with shape [32000]
INFO:transformers.modeling_xlnet:Loading TF weight model/lm_loss/bias/adam_v with shape [32000]
INFO:transformers.modeling_xlnet:Loading TF weight model/transformer/layer_0/ff/LayerNorm/beta with shape [768]
INFO:transformers.modeling_xlnet:Loading TF weight model/transformer/layer_0/ff/LayerNorm/beta/adam_m with shape [768]
INFO:transformers.modeling_xlnet:Loading TF weight model/transformer/layer_0/ff/LayerNorm/beta/adam_v with shape [768]
INFO:transformers.modeling_xlnet:Loading TF weight model/transformer/layer_0/ff/LayerNorm/gamma with shape [768]
INFO:transformers.modeling_xlnet:Loading TF weight model/transformer/layer_0/ff/LayerNorm/gamma/adam_m with shape [768]
INFO:transformers.modeling_xlnet:Loading TF weight model/tr

In [7]:
# Folder used as --pytorch_dump_output by the conversion cell; the config.json
# read below is presumably written there by that step (verify it exists).
directory = 'xlnet-base-bahasa-standard-cased'

# The XLNetConfig constructor takes the vocabulary size as its first positional
# argument, not a file path, so `XLNetConfig(path)` never reads the JSON file
# and silently builds a default-sized config. Parse the file explicitly so the
# remaining fields come from the converted checkpoint's own configuration.
config = XLNetConfig.from_json_file(f'{directory}/config.json')

# Explicitly pin the base-size architecture. The conversion log reports a
# 32000-entry LM bias and 768-wide LayerNorm parameters, which these match.
config.vocab_size = 32000
config.d_inner = 3072
config.d_model = 768
config.n_head = 12
config.n_layer = 12

In [8]:
# Load the converted weights file with a language-modelling head, using the
# config object prepared in the previous cell.
model = AutoModelWithLMHead.from_pretrained(
    './xlnet-base-bahasa-standard-cased/pytorch_model.bin',
    config=config,
)



In [9]:
# Wrap the loaded model and tokenizer in a fill-mask pipeline for a quick
# sanity check of the converted weights.
fill_mask = pipeline(
    task='fill-mask',
    model=model,
    tokenizer=tokenizer,
)

In [10]:
# Ask the pipeline to fill the masked position; the cell's last expression is
# displayed as the list of candidate completions.
masked_sentence = 'makan ayam dengan <mask>'
fill_mask(masked_sentence)

[{'sequence': 'makan ayam dengan.<sep><cls>',
  'score': 0.18475300073623657,
  'token': 9,
  'token_str': '.'},
 {'sequence': 'makan ayam dengan<eod><sep><cls>',
  'score': 0.179636612534523,
  'token': 7,
  'token_str': '<eod>'},
 {'sequence': 'makan ayam dengan <sep><cls>',
  'score': 0.14987488090991974,
  'token': 19,
  'token_str': '▁'},
 {'sequence': 'makan ayam dengannya<sep><cls>',
  'score': 0.11687928438186646,
  'token': 26,
  'token_str': 'nya'},
 {'sequence': 'makan ayam dengan,<sep><cls>',
  'score': 0.05017939582467079,
  'token': 21,
  'token_str': ','}]

In [11]:
# Write the loaded model back into the output folder alongside the tokenizer.
model.save_pretrained(save_directory='xlnet-base-bahasa-standard-cased')

In [None]:
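# Optional publishing step, left disabled on purpose: uncomment the line below
# to upload the converted folder with the transformers CLI.
# !transformers-cli upload ./xlnet-base-bahasa-standard-cased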

In [None]:
# Load the bare XLNet model (no LM head) by name instead of from the local
# folder; the string is presumably a model-hub identifier.
hub_name = 'huseinzol05/xlnet-base-bahasa-cased'
model = XLNetModel.from_pretrained(hub_name)

In [None]:
# Load the matching tokenizer under the same name, keeping the input casing.
tokenizer = XLNetTokenizer.from_pretrained(
    'huseinzol05/xlnet-base-bahasa-cased',
    do_lower_case=False,
)

In [None]:
import torch

In [None]:
# Encode one sentence (special tokens included) and wrap the id list in an
# outer list so the resulting tensor has a leading batch dimension of 1.
token_ids = tokenizer.encode("husein tk suka mkan ayam", add_special_tokens=True)
input_ids = torch.tensor([token_ids])

In [None]:
# Run a forward pass with gradient tracking disabled and keep the first
# element of the model output, then display it as the cell result.
with torch.no_grad():
    model_outputs = model(input_ids)
    last_hidden_states = model_outputs[0]

last_hidden_states


In [None]:
# Repeat the fill-mask check with the model loaded by name (with LM head),
# reusing the tokenizer loaded in the earlier cell.
lm_model_name = 'huseinzol05/xlnet-base-bahasa-cased'
model = AutoModelWithLMHead.from_pretrained(lm_model_name)
fill_mask = pipeline(task='fill-mask', model=model, tokenizer=tokenizer)

masked_sentence = 'makan ayam dengan <mask>'
fill_mask(masked_sentence)