In [1]:
import os

# Output directory that receives the converted tokenizer and PyTorch model files.
out = 'electra-small-discriminator-bahasa-cased'
# exist_ok=True keeps this cell re-runnable when the directory already exists.
os.makedirs(out, exist_ok=True)

In [2]:
from transformers import ElectraTokenizer, ElectraModel, ElectraConfig, AutoTokenizer, AutoModelWithLMHead, pipeline

In [3]:
# Build a cased tokenizer (lower-casing disabled) from the 'bahasa.wordpiece'
# vocabulary file.
tokenizer = ElectraTokenizer('bahasa.wordpiece', do_lower_case = False)
# Save into the output directory defined in the first cell; reusing `out`
# instead of repeating the literal keeps the two from drifting apart.
tokenizer.save_pretrained(out)

('electra-small-discriminator-bahasa-cased/vocab.txt',
 'electra-small-discriminator-bahasa-cased/special_tokens_map.json',
 'electra-small-discriminator-bahasa-cased/added_tokens.json')

In [4]:
import logging

import torch

from transformers import ElectraConfig, ElectraForMaskedLM, ElectraForPreTraining, load_tf_weights_in_electra


# Configure logging at INFO level so INFO records (such as the conversion
# progress messages) are shown in the cell output.
logging.basicConfig(level=logging.INFO)


def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path, discriminator_or_generator):
    """Convert a TensorFlow ELECTRA checkpoint into a PyTorch state-dict file.

    Args:
        tf_checkpoint_path: Path of the TensorFlow checkpoint to read weights from.
        config_file: Path of the JSON file used to build the ``ElectraConfig``.
        pytorch_dump_path: Destination file for the model's ``state_dict``.
        discriminator_or_generator: ``"discriminator"`` builds an
            ``ElectraForPreTraining`` model; ``"generator"`` builds an
            ``ElectraForMaskedLM`` model.

    Raises:
        ValueError: If ``discriminator_or_generator`` is neither of the two
            accepted values. The configuration has already been loaded and
            printed by the time this is raised.
    """
    # Load and echo the configuration first.
    config = ElectraConfig.from_json_file(config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))

    # Guard clause: reject anything other than the two supported modes.
    if discriminator_or_generator not in ("discriminator", "generator"):
        raise ValueError("The discriminator_or_generator argument should be either 'discriminator' or 'generator'")

    # Pick the model class matching the requested half of ELECTRA.
    model_class = ElectraForPreTraining if discriminator_or_generator == "discriminator" else ElectraForMaskedLM
    model = model_class(config)

    # Load weights from the TensorFlow checkpoint into the new model.
    load_tf_weights_in_electra(
        model,
        config,
        tf_checkpoint_path,
        discriminator_or_generator=discriminator_or_generator,
    )

    # Only the state dict is written to `pytorch_dump_path`.
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)

In [5]:
# Convert the discriminator weights of the model.ckpt-150000 checkpoint and
# write the resulting state dict into the output directory.
convert_tf_checkpoint_to_pytorch(
    tf_checkpoint_path='dataset/models/bahasa-small/model.ckpt-150000',
    config_file='SMALL-config-discriminator.json',
    pytorch_dump_path='electra-small-discriminator-bahasa-cased/pytorch_model.bin',
    discriminator_or_generator='discriminator',
)

Building PyTorch model from configuration: ElectraConfig {
  "_num_labels": 2,
  "architectures": null,
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "embedding_size": 128,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "min_length": 0,
  "model_type": "electra",
  "no_repeat_ngram_size": 0,
  "num_attention_heads": 4,
  "num_beams": 1,
  "num_hidden_layers": 12,
  "num_return_sequences": 1,
  "output_attentions": false,
  "output_hidden_states": false,

INFO:transformers.modeling_electra:Converting TensorFlow checkpoint from /home/husein/electra/electra/dataset/models/bahasa-small/model.ckpt-150000
INFO:transformers.modeling_electra:Loading TF weight discriminator_predictions/dense/bias with shape [256]
INFO:transformers.modeling_electra:Loading TF weight discriminator_predictions/dense/bias/adam_m with shape [256]
INFO:transformers.modeling_electra:Loading TF weight discriminator_predictions/dense/bias/adam_v with shape [256]
INFO:transformers.modeling_electra:Loading TF weight discriminator_predictions/dense/kernel with shape [256, 256]
INFO:transformers.modeling_electra:Loading TF weight discriminator_predictions/dense/kernel/adam_m with shape [256, 256]
INFO:transformers.modeling_electra:Loading TF weight discriminator_predictions/dense/kernel/adam_v with shape [256, 256]
INFO:transformers.modeling_electra:Loading TF weight discriminator_predictions/dense_1/bias with shape [1]
INFO:transformers.modeling_electra:Loading TF weight d

Initialize PyTorch weight ['discriminator_predictions', 'dense', 'bias'] discriminator_predictions/dense/bias
Skipping discriminator_predictions/dense/bias/adam_m ['discriminator_predictions', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'
Skipping discriminator_predictions/dense/bias/adam_v ['discriminator_predictions', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'
Initialize PyTorch weight ['discriminator_predictions', 'dense', 'kernel'] discriminator_predictions/dense/kernel
Skipping discriminator_predictions/dense/kernel/adam_m ['discriminator_predictions', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'
Skipping discriminator_predictions/dense/kernel/adam_v ['discriminator_predictions', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'
Initialize PyTorch weight ['discriminator_predictions', 'dense_prediction', 'bias'] discriminator_predictions/dense_1/bias
Skipping discriminator_pr

INFO:transformers.modeling_electra:Skipping global_step


 ['generator', 'encoder', 'layer_10', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'
Skipping generator/encoder/layer_10/attention/self/query/kernel/adam_v ['generator', 'encoder', 'layer_10', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'
Skipping generator/encoder/layer_10/attention/self/value/bias ['generator', 'encoder', 'layer_10', 'attention', 'self', 'value', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'
Skipping generator/encoder/layer_10/attention/self/value/bias/adam_m ['generator', 'encoder', 'layer_10', 'attention', 'self', 'value', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'
Skipping generator/encoder/layer_10/attention/self/value/bias/adam_v ['generator', 'encoder', 'layer_10', 'attention', 'self', 'value', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'
Skipping 

In [6]:
# Reload the tokenizer from the local output directory, keeping it cased
# (do_lower_case=False).
tokenizer = ElectraTokenizer.from_pretrained('./electra-small-discriminator-bahasa-cased', do_lower_case = False)

INFO:transformers.tokenization_utils:Model name './electra-small-discriminator-bahasa-cased' not found in model shortcut name list (google/electra-small-generator, google/electra-base-generator, google/electra-large-generator, google/electra-small-discriminator, google/electra-base-discriminator, google/electra-large-discriminator). Assuming './electra-small-discriminator-bahasa-cased' is a path, a model identifier, or url to a directory containing tokenizer files.
INFO:transformers.tokenization_utils:Didn't find file ./electra-small-discriminator-bahasa-cased/added_tokens.json. We won't load it.
INFO:transformers.tokenization_utils:loading file ./electra-small-discriminator-bahasa-cased/vocab.txt
INFO:transformers.tokenization_utils:loading file None
INFO:transformers.tokenization_utils:loading file ./electra-small-discriminator-bahasa-cased/special_tokens_map.json
INFO:transformers.tokenization_utils:loading file ./electra-small-discriminator-bahasa-cased/tokenizer_config.json


In [7]:
# Load the discriminator configuration from its JSON file.
# NOTE: `ElectraConfig('SMALL-config-discriminator.json')` would bind the path
# string to the first positional parameter (`vocab_size`) and never read the
# file, silently falling back to library defaults for every other field.
config = ElectraConfig.from_json_file('SMALL-config-discriminator.json')
# Explicit pins kept from the original cell; they are no-ops whenever the JSON
# file already carries these values.
config.vocab_size = 32000
config.hidden_size = 256
config.intermediate_size = 1024
config

ElectraConfig {
  "_num_labels": 2,
  "architectures": null,
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "embedding_size": 128,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "min_length": 0,
  "model_type": "electra",
  "no_repeat_ngram_size": 0,
  "num_attention_heads": 4,
  "num_beams": 1,
  "num_hidden_layers": 12,
  "num_return_sequences": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pad_token_id": 0

In [8]:
from transformers import ElectraForPreTraining, ElectraTokenizerFast
import torch

# Load the converted weights file directly, using the `config` object built in
# the previous cell.
discriminator = ElectraForPreTraining.from_pretrained('./electra-small-discriminator-bahasa-cased/pytorch_model.bin',
                                                     config = config)

# `sentence` is not used further in this cell; only `fake_sentence` is scored.
sentence = '1mbd menjejaskan imej negara'
fake_sentence = '1mbd menaikkan imej negara'

# NOTE(review): `encode` presumably adds special tokens, so `predictions` may
# hold more positions than `fake_tokens` - confirm before pairing them.
fake_tokens = tokenizer.tokenize(fake_sentence)
fake_inputs = tokenizer.encode(fake_sentence, return_tensors="pt")
discriminator_outputs = discriminator(fake_inputs)
# Map each logit to a 0/1 flag: sign -> {-1, +1}, (sign + 1) / 2 -> {0, 1},
# so positive logits become 1.0 and negative logits become 0.0.
# Presumably 1.0 marks a token the discriminator judges as replaced - TODO confirm.
predictions = torch.round((torch.sign(discriminator_outputs[0]) + 1) / 2)

INFO:transformers.modeling_utils:loading weights file ./electra-small-discriminator-bahasa-cased/pytorch_model.bin


In [9]:
# Write config.json and the model weights into the output directory defined in
# the first cell; reusing `out` avoids repeating the directory literal.
discriminator.save_pretrained(out)

INFO:transformers.configuration_utils:Configuration saved in electra-small-discriminator-bahasa-cased/config.json
INFO:transformers.modeling_utils:Model weights saved in electra-small-discriminator-bahasa-cased/pytorch_model.bin


In [10]:
# Manual step: uncomment and run to upload the output directory with
# transformers-cli (the next cell loads the model by its hub identifier).
# !transformers-cli upload ./electra-small-discriminator-bahasa-cased

In [12]:
# Round-trip check: load the uploaded model and tokenizer by hub identifier.
model = ElectraForPreTraining.from_pretrained('huseinzol05/electra-small-discriminator-bahasa-cased')
tokenizer = ElectraTokenizer.from_pretrained('huseinzol05/electra-small-discriminator-bahasa-cased',
                                             do_lower_case = False)

sentence = 'kerajaan sangat prihatin terhadap rakyat'
fake_tokens = tokenizer.tokenize(sentence)
fake_inputs = tokenizer.encode(sentence, return_tensors="pt")

# Run the freshly downloaded `model`. The original cell called the local
# `discriminator` here, so the uploaded weights were never actually exercised.
# no_grad: inference only, no autograd graph is needed.
with torch.no_grad():
    discriminator_outputs = model(fake_inputs)

# Positive logits become 1.0, negative logits become 0.0.
predictions = torch.round((torch.sign(discriminator_outputs[0]) + 1) / 2)

# `encode` wraps the sentence in [CLS] ... [SEP] while `tokenize` does not, so
# flatten the predictions and drop the first and last positions to line each
# prediction up with its word-piece token.
token_predictions = predictions.reshape(-1).tolist()[1:-1]
assert len(token_predictions) == len(fake_tokens), "token/prediction length mismatch"

list(zip(fake_tokens, token_predictions))

INFO:filelock:Lock 139851449333632 acquired on /home/husein/.cache/torch/transformers/690a9589f0655bdeaf9b91f55ab08d46a07766cfda191042a98e2e21d57ef4cb.6a339325b229ca8511309223a8ce140d9e1cb3b0da37cb3e9cdab6bb4799ddce.lock
INFO:transformers.file_utils:https://s3.amazonaws.com/models.huggingface.co/bert/huseinzol05/electra-small-discriminator-bahasa-cased/config.json not found in cache or force_download set to True, downloading to /home/husein/.cache/torch/transformers/tmpupv_4kkz


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1268.0, style=ProgressStyle(description…

INFO:transformers.file_utils:storing https://s3.amazonaws.com/models.huggingface.co/bert/huseinzol05/electra-small-discriminator-bahasa-cased/config.json in cache at /home/husein/.cache/torch/transformers/690a9589f0655bdeaf9b91f55ab08d46a07766cfda191042a98e2e21d57ef4cb.6a339325b229ca8511309223a8ce140d9e1cb3b0da37cb3e9cdab6bb4799ddce
INFO:transformers.file_utils:creating metadata file for /home/husein/.cache/torch/transformers/690a9589f0655bdeaf9b91f55ab08d46a07766cfda191042a98e2e21d57ef4cb.6a339325b229ca8511309223a8ce140d9e1cb3b0da37cb3e9cdab6bb4799ddce
INFO:filelock:Lock 139851449333632 released on /home/husein/.cache/torch/transformers/690a9589f0655bdeaf9b91f55ab08d46a07766cfda191042a98e2e21d57ef4cb.6a339325b229ca8511309223a8ce140d9e1cb3b0da37cb3e9cdab6bb4799ddce.lock
INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/huseinzol05/electra-small-discriminator-bahasa-cased/config.json from cache at /home/husein/.cache/tor




INFO:transformers.modeling_utils:loading weights file https://s3.amazonaws.com/models.huggingface.co/bert/huseinzol05/electra-small-discriminator-bahasa-cased/pytorch_model.bin from cache at /home/husein/.cache/torch/transformers/64a7f90ed9cf765c106d2519b690a59328be7586a30257b2d11443cea21559ba.a70f49e2486cf3e2ea0b09e204a726415bb317aa76aa0e6777a7e06d0fc38172
INFO:transformers.tokenization_utils:Model name 'huseinzol05/electra-small-discriminator-bahasa-cased' not found in model shortcut name list (google/electra-small-generator, google/electra-base-generator, google/electra-large-generator, google/electra-small-discriminator, google/electra-base-discriminator, google/electra-large-discriminator). Assuming 'huseinzol05/electra-small-discriminator-bahasa-cased' is a path, a model identifier, or url to a directory containing tokenizer files.
INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/huseinzol05/electra-small-discriminator-bahasa-c

[('kerajaan', 0.0),
 ('sangat', 0.0),
 ('prihatin', 0.0),
 ('terhadap', 0.0),
 ('rakyat', 0.0)]