In [1]:
import os

# An empty device list hides every GPU from CUDA-aware libraries loaded later.
os.environ.update(CUDA_VISIBLE_DEVICES='')

# Folder that collects the exported tokenizer and model files.
out = 'xlnet-large-bahasa-standard-cased'
if not os.path.isdir(out):
    os.makedirs(out)

In [2]:
from transformers import XLNetTokenizer, XLNetModel, XLNetConfig, AutoTokenizer, AutoModelWithLMHead, pipeline

In [3]:
# Build the tokenizer straight from the SentencePiece model file, without
# lower-casing the input text.
tokenizer = XLNetTokenizer(
    vocab_file='sp10m.cased.v9.model',
    do_lower_case=False,
)
# Kept as the last expression so the cell shows the paths that were written.
tokenizer.save_pretrained('xlnet-large-bahasa-standard-cased')

('xlnet-large-bahasa-standard-cased/spiece.model',
 'xlnet-large-bahasa-standard-cased/special_tokens_map.json',
 'xlnet-large-bahasa-standard-cased/added_tokens.json')

In [4]:
# Reload the tokenizer from the directory it was just saved to, again without
# lower-casing, so the rest of the notebook uses the exported files.
tokenizer = XLNetTokenizer.from_pretrained('./xlnet-large-bahasa-standard-cased', do_lower_case = False)

In [5]:
# Convert the TensorFlow XLNet checkpoint `xlnet-large/model.ckpt-500000`
# (described by `xlnet-large/xlnet-large_config.json`) into a PyTorch dump
# inside the export directory.
!transformers-cli convert --model_type xlnet \
  --tf_checkpoint xlnet-large/model.ckpt-500000 \
  --config xlnet-large/xlnet-large_config.json \
  --pytorch_dump_output xlnet-large-bahasa-standard-cased

INFO:transformers.modeling_xlnet:Loading TF weight global_step with shape []
INFO:transformers.modeling_xlnet:Loading TF weight model/lm_loss/bias with shape [32000]
INFO:transformers.modeling_xlnet:Loading TF weight model/lm_loss/bias/adam_m with shape [32000]
INFO:transformers.modeling_xlnet:Loading TF weight model/lm_loss/bias/adam_v with shape [32000]
INFO:transformers.modeling_xlnet:Loading TF weight model/transformer/layer_0/ff/LayerNorm/beta with shape [1024]
INFO:transformers.modeling_xlnet:Loading TF weight model/transformer/layer_0/ff/LayerNorm/beta/adam_m with shape [1024]
INFO:transformers.modeling_xlnet:Loading TF weight model/transformer/layer_0/ff/LayerNorm/beta/adam_v with shape [1024]
INFO:transformers.modeling_xlnet:Loading TF weight model/transformer/layer_0/ff/LayerNorm/gamma with shape [1024]
INFO:transformers.modeling_xlnet:Loading TF weight model/transformer/layer_0/ff/LayerNorm/gamma/adam_m with shape [1024]
INFO:transformers.modeling_xlnet:Loading TF weight mod

In [8]:
directory = 'xlnet-large-bahasa-standard-cased'

# Load the configuration stored in the export directory (presumably written
# there by the conversion step above). The XLNetConfig constructor takes the
# vocabulary size as its first positional argument, so passing a path to it
# does not read the file; `from_json_file` is the classmethod that parses it.
config = XLNetConfig.from_json_file(f'{directory}/config.json')

# Pin the architecture this checkpoint is expected to have. The overrides are
# kept from the original cell so the model is always built with these sizes,
# whatever the JSON file contains.
config.vocab_size = 32000
config.d_inner = 4096
config.d_model = 1024
config.n_head = 16
config.n_layer = 20

In [9]:
# Load the converted weights file directly; the architecture comes from the
# `config` object prepared in the previous cell.
model = AutoModelWithLMHead.from_pretrained('./xlnet-large-bahasa-standard-cased/pytorch_model.bin', config = config)

In [10]:
# Wrap the freshly loaded model and tokenizer in a fill-mask pipeline for a
# quick sanity check of the conversion.
fill_mask = pipeline(
    task='fill-mask',
    tokenizer=tokenizer,
    model=model,
)

In [11]:
# Ask the pipeline to fill the masked last word; the cell output lists the
# candidate tokens with their scores.
fill_mask('makan ayam dengan <mask>')

[{'sequence': 'makan ayam dengan sdm<sep><cls>',
  'score': 0.007230748888105154,
  'token': 12829,
  'token_str': '▁sdm'},
 {'sequence': 'makan ayam dengan Bentar<sep><cls>',
  'score': 0.005983198527246714,
  'token': 15544,
  'token_str': '▁Bentar'},
 {'sequence': 'makan ayam denganج<sep><cls>',
  'score': 0.003996539860963821,
  'token': 13344,
  'token_str': 'ج'},
 {'sequence': 'makan ayam dengan seperjuangan<sep><cls>',
  'score': 0.003097530920058489,
  'token': 28412,
  'token_str': '▁seperjuangan'},
 {'sequence': 'makan ayam dengan GIMANA<sep><cls>',
  'score': 0.002888893475756049,
  'token': 25453,
  'token_str': '▁GIMANA'}]

In [12]:
# Save the loaded model into the export directory, alongside the tokenizer
# files written earlier.
model.save_pretrained('xlnet-large-bahasa-standard-cased')

In [None]:
# !transformers-cli upload ./xlnet-large-bahasa-standard-cased

In [None]:
# Load the bare XLNet encoder from the hub.
# NOTE(review): this id names the *base* model, while this notebook converts
# `xlnet-large-bahasa-standard-cased` — confirm the intended hub id.
model = XLNetModel.from_pretrained('huseinzol05/xlnet-base-bahasa-cased')

In [None]:
# Load the matching tokenizer from the hub, without lower-casing.
# NOTE(review): same "base" hub id as the model cell, although the notebook
# exports the large checkpoint — confirm the intended hub id.
tokenizer = XLNetTokenizer.from_pretrained('huseinzol05/xlnet-base-bahasa-cased', do_lower_case = False)

In [None]:
import torch

In [None]:
# Encode one sentence (special tokens included) and add a leading axis so the
# ids form a batch containing a single sequence.
input_ids = torch.tensor(
    tokenizer.encode("husein tk suka mkan ayam", add_special_tokens=True)
).unsqueeze(0)

In [None]:
# Gradient tracking is switched off for this forward pass.
with torch.no_grad():
    # Keep only the first element of the model output.
    last_hidden_states = model(input_ids=input_ids)[0]

# Shown as the cell result.
last_hidden_states


In [None]:
# Repeat the fill-mask check with the model fetched from the hub.
# NOTE(review): the hub id names the *base* model, while this notebook
# converts `xlnet-large-bahasa-standard-cased` — confirm the intended hub id.
model = AutoModelWithLMHead.from_pretrained('huseinzol05/xlnet-base-bahasa-cased')
fill_mask = pipeline(
    task='fill-mask',
    tokenizer=tokenizer,
    model=model,
)
fill_mask('makan ayam dengan <mask>')