In [None]:
# setGPU is imported only for its import-time side effect (presumably GPU
# selection — TODO confirm); keep it ahead of the torch import.
import setGPU

import csv
import glob
import os

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer, BertModel, BertTokenizer

In [None]:
# Device string handed to `.to(...)` for the model and its input tensors.
DEVICE = 'cuda'

# Glob pattern matching the input CSV files (one dataset per file).
SOURCE_PATH = '../../Dataset/Text-Data/*-folds.csv'

# Output file template; filled with (dataset file stem, checkpoint name).
TARGET_PATH = '../../Dataset/Embeddings/{}-{}.npz'

# Pretrained checkpoint names; embeddings are extracted once per entry.
BERT_NAMES = [
    'bert-base-german-cased',
    'bert-base-german-dbmdz-cased',
    'bert-base-german-dbmdz-uncased',
    'distilbert-base-german-cased',
]

In [None]:
# Number of texts per forward pass; lower this if the GPU runs out of memory.
BATCH_SIZE = 32


def _load_folds_csv(path):
    """Read one dataset CSV into parallel Python lists.

    Parameters
    ----------
    path : str
        CSV file with a header row containing the columns ``text``,
        ``label``, ``five_folds``, ``ten_folds`` and ``train_test_split``.

    Returns
    -------
    tuple of list
        ``(texts, labels, five_folds, ten_folds, train_test_split)`` where
        ``texts`` holds str, ``labels`` holds float and the three split
        lists hold int. All lists have one entry per CSV row.

    Raises
    ------
    KeyError
        If one of the expected columns is missing.
    ValueError
        If a label or split value cannot be parsed as a number.
    """
    texts, labels, five_folds, ten_folds, train_test_split = [], [], [], [], []
    # newline='' is what the csv module asks for, so that line breaks inside
    # quoted text fields reach the reader untranslated.
    with open(path, encoding="utf-8", newline='') as f_source:
        for row in csv.DictReader(f_source):
            texts.append(row['text'])
            labels.append(float(row['label']))
            five_folds.append(int(row['five_folds']))
            ten_folds.append(int(row['ten_folds']))
            train_test_split.append(int(row['train_test_split']))
    return texts, labels, five_folds, ten_folds, train_test_split


def _embed_texts(model, tokenizer, texts, batch_size=BATCH_SIZE):
    """Compute one embedding vector per text.

    Parameters
    ----------
    model : transformers model, already on ``DEVICE`` and in eval mode
    tokenizer : tokenizer belonging to ``model``
    texts : list of str
        Must not be empty.
    batch_size : int
        Number of texts sent through the model at once.

    Returns
    -------
    numpy.ndarray
        Array with one row per text, in input order.
    """
    # Tokenize the whole dataset at once: every row is padded to the longest
    # text of the dataset (truncated to the tokenizer's limit).
    encoded = tokenizer(texts, truncation=True, padding=True, return_tensors='pt')
    n_texts = encoded['input_ids'].shape[0]

    chunks = []
    with torch.no_grad():  # inference only, no autograd bookkeeping needed
        for start in range(0, n_texts, batch_size):
            batch = {key: val[start:start + batch_size].to(DEVICE)
                     for key, val in encoded.items()}
            outputs = model(**batch)
            pooled = getattr(outputs, 'pooler_output', None)
            if pooled is None:
                # Architectures without a pooling head (e.g. DistilBERT) do
                # not return `pooler_output`; use the hidden state of the
                # first ([CLS]) token as the sentence embedding instead.
                pooled = outputs.last_hidden_state[:, 0]
            chunks.append(pooled.cpu().numpy())
    return np.concatenate(chunks, axis=0)


for bert_name in BERT_NAMES:
    print('### Load Model: {}...'.format(bert_name))
    # The Auto* classes pick the architecture that matches the checkpoint.
    # This matters for the DistilBERT entry: BertModel cannot map that
    # checkpoint's weights onto its own layers. No dropout override is passed
    # because `.eval()` already disables dropout.
    bert_model = AutoModel.from_pretrained(bert_name).to(DEVICE).eval()
    bert_tokenizer = AutoTokenizer.from_pretrained(bert_name)

    # sorted() only makes the processing order reproducible.
    for source in sorted(glob.glob(SOURCE_PATH)):
        ### Define useful variables
        source_name = os.path.basename(source)
        target = TARGET_PATH.format(os.path.splitext(source_name)[0], bert_name)

        ### Load data from csv
        print('Load Dataset: {}...'.format(source_name))
        texts, labels, five_folds, ten_folds, train_test_split = _load_folds_csv(source)
        if not texts:
            # Nothing to embed; tokenizing/stacking an empty list would fail.
            print('Skip empty Dataset: {}'.format(source_name))
            continue

        ### Tokenize and run the model
        print('Execute Bert...')
        embeddings = _embed_texts(bert_model, bert_tokenizer, texts)

        ### Postprocessing and save data
        print('Savez Dataset: {}'.format(target))
        target_dir = os.path.dirname(target)
        if target_dir:
            # np.savez_compressed does not create missing directories.
            os.makedirs(target_dir, exist_ok=True)
        np.savez_compressed(target,
                            embeddings=embeddings,
                            labels=np.array(labels),
                            five_folds=np.array(five_folds),
                            ten_folds=np.array(ten_folds),
                            train_test_split=np.array(train_test_split)
                           )

    # Release the model before loading the next checkpoint.
    del bert_model, bert_tokenizer
    torch.cuda.empty_cache()