In [9]:
import random
import numpy as np
from conllu import parse
from collections import defaultdict
import torch

random.seed(2020)


def _read_conllu(path):
    """Read the CoNLL-U file at ``path`` and return the result of ``parse``.

    The file is opened inside a ``with`` block so the handle is closed as soon
    as its content has been read, and it is decoded explicitly as UTF-8 (the
    encoding CoNLL-U files use) instead of the platform default.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        return parse(handle.read())


train = _read_conllu('id_gsd-ud-train.conllu')
dev = _read_conllu('id_gsd-ud-dev.conllu')
test = _read_conllu('id_gsd-ud-test.conllu')

In [10]:
# Report the number of parsed sentences in each split.
for split_name, split in (('train', train), ('dev', dev), ('test', test)):
    print(split_name + ':', len(split))

train: 4477
dev: 559
test: 557


In [11]:
# Format change: Word_BERT_sentenceID_wordID
# Every token form is rewritten in place to "<form>_BERT_<sent_id>_<id>" and the
# rewritten form is mapped to a running integer in word2index.
word2index = {}
idx = 0
words = defaultdict(int)  # not used in this cell; kept in case a later cell reads it
# Splits are visited as train, test, dev, so indices are assigned in that order.
for split in (train, test, dev):
    for sentence in split:
        suffix = '_BERT_' + sentence.metadata['sent_id']
        for word in sentence:
            cur_suffix = suffix + '_' + str(word['id'])
            new_word = word['form']
            # The forms are mutated in place, so re-running this cell would
            # otherwise stack a second suffix onto an already tagged form.
            if not new_word.endswith(cur_suffix):
                new_word += cur_suffix
                word['form'] = new_word
            word2index[new_word] = idx
            idx += 1
print(idx)

121923


In [12]:
import json
import os

# exist_ok=True lets this cell be re-run after data/ has already been created;
# without it os.makedirs raises FileExistsError on the second run.
os.makedirs('data/', exist_ok=True)
with open('data/word2index.json', 'w') as file:
    json.dump(word2index, file)
# Read the file back and check that the mapping survives the JSON round trip.
with open('data/word2index.json', 'r') as file:
    new_d = json.load(file)
assert new_d == word2index, 'word2index.json does not round-trip'

In [16]:
def extract_bert(sentence, model, tokz, save_to, word2index):
    """Save one embedding per word of ``sentence`` as ``.npy`` files.

    Each word is split into subtokens, the whole sentence is passed through
    ``model`` once (wrapped in ``[CLS]`` ... ``[SEP]``), and the output row of
    each word's first subtoken is written to
    ``<save_to>/<word2index[form]>.npy``.

    Args:
        sentence: sized, indexable sequence of tokens supporting ``tok['form']``.
            The part of the form before the last ``_BERT_`` marker is what gets
            tokenized; the full form is the key looked up in ``word2index``.
        model: callable taking ``input_ids``, ``token_type_ids`` and
            ``return_dict=False`` and returning a 2-tuple whose first item holds
            one row of size ``model.config.hidden_size`` per subtoken.
        tokz: tokenizer exposing ``tokenize``, ``convert_tokens_to_ids`` and
            ``unk_token``.
        save_to: output directory; created if it does not exist.
        word2index: dict mapping each full form to the integer used as the
            output file name.

    Raises:
        KeyError: if a form of ``sentence`` is missing from ``word2index``.

    NOTE(review): sentences are not truncated, so a sentence whose subtoken
    count exceeds the model's maximum input length is presumably rejected by
    the model -- confirm against the checkpoints used.
    """
    os.makedirs(save_to, exist_ok=True)
    if len(sentence) == 0:
        # Nothing to save; an empty index tensor would also break index_select.
        return

    indexes = []
    subtokens = ['[CLS]']
    for word in sentence:
        # Position of this word's first subtoken in the model input.
        indexes.append(len(subtokens))
        # Strip only the trailing suffix so a form that itself contains
        # '_BERT_' is not cut short.
        real_word = word['form'].rsplit('_BERT_', 1)[0]
        # A word that yields no subtokens would make its index point at the
        # next word (or at [SEP]); give it the unknown token instead.
        subtoken = tokz.tokenize(real_word) or [tokz.unk_token]
        subtokens += subtoken
    subtokens += ['[SEP]']

    subtoken_ids = tokz.convert_tokens_to_ids(subtokens)
    segment_ids = [0] * len(subtokens)

    # unsqueeze(0) adds the batch dimension (batch of one sentence).
    subtoken_ids_t = torch.tensor(subtoken_ids).unsqueeze(0)
    segment_ids_t = torch.tensor(segment_ids).unsqueeze(0)

    # Pure feature extraction: no autograd graph is needed.
    with torch.no_grad():
        output, _ = model(input_ids=subtoken_ids_t, token_type_ids=segment_ids_t, return_dict=False)
    output = output.view(len(subtoken_ids), model.config.hidden_size)
    indexes = torch.tensor(indexes)
    selected_output = torch.index_select(output, 0, indexes)

    assert len(indexes) == len(sentence) == selected_output.shape[0]
    for idx in range(len(sentence)):
        word = sentence[idx]['form']
        array = selected_output[idx].detach().cpu().numpy()
        # os.path.join works whether or not save_to ends with a separator;
        # np.save appends the '.npy' extension.
        np.save(os.path.join(save_to, str(word2index[word])), array)

In [17]:
# Extract mBERT embeddings for every sentence, visiting train, test, then dev.
from transformers import BertTokenizer, BertModel

checkpoint = 'bert-base-multilingual-uncased'
bert = BertModel.from_pretrained(checkpoint)
tokenizer = BertTokenizer.from_pretrained(checkpoint, do_lower_case=True)
save_to = 'data/mbert/'

for split in (train, test, dev):
    for sentence in split:
        extract_bert(sentence, bert, tokenizer, save_to, word2index)

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


KeyboardInterrupt: 

In [None]:
# Extract IndoBERT embeddings for every sentence, visiting train, test, then dev.
from transformers import BertTokenizer, BertModel

checkpoint = 'indolem/indobert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint, do_lower_case=True)
bert = BertModel.from_pretrained(checkpoint)
save_to = 'data/indobert/'

for split in (train, test, dev):
    for sentence in split:
        extract_bert(sentence, bert, tokenizer, save_to, word2index)

In [None]:
# Extract MalayBERT embeddings (only the cased Malay BERT checkpoint is
# available), visiting train, test, then dev.
from transformers import AlbertTokenizer, BertModel

checkpoint = 'huseinzol05/bert-base-bahasa-cased'
tokenizer = AlbertTokenizer.from_pretrained(
    checkpoint,
    unk_token='[UNK]',
    pad_token='[PAD]',
    do_lower_case=False,
)
bert = BertModel.from_pretrained(checkpoint)
save_to = 'data/malaybert/'

for split in (train, test, dev):
    for sentence in split:
        extract_bert(sentence, bert, tokenizer, save_to, word2index)

In [None]:
def write(conlls, fname):
    """Write the serialized form of every item in ``conlls`` to ``fname``.

    Args:
        conlls: iterable of objects exposing ``serialize()`` that returns a
            string; items are written in iteration order with nothing added
            between them.
        fname: path of the output file; an existing file is overwritten.

    The file is opened in a ``with`` block so it is closed even if
    ``serialize()`` raises, and it is encoded as UTF-8 rather than the
    platform default so non-ASCII forms are always written the same way.
    """
    with open(fname, 'w', encoding='utf-8') as f:
        for conll in conlls:
            f.write(conll.serialize())
    
# Write each split, with its rewritten forms, to its own file under data/.
for split, out_path in (
    (train, 'data/train.conllu'),
    (test, 'data/test.conllu'),
    (dev, 'data/dev.conllu'),
):
    write(split, out_path)