In [17]:
%pip install --upgrade transformers tensorflow-gpu==2.9.0 --quiet
import pandas as pd
import numpy as np
import os
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import BertForSequenceClassification, BertTokenizer, BertConfig
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig
from transformers import XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig
from transformers import XLMForSequenceClassification, XLMRobertaTokenizer, XLMConfig
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig
from transformers import TFAlbertForSequenceClassification, AlbertTokenizer, AlbertConfig


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [18]:
# Read the full dataset once, then carve out the three partitions by the
# value of the `split` column (train / test / dev).
df = pd.read_csv('../Data/data.csv')  # path to multi_dataset
train_set, test_set, validation_set = (
    df.query(split_filter)
    for split_filter in (" split=='train' ", " split=='test' ", " split=='dev' ")
)


In [26]:
# Registry of supported architectures:
#   model_type -> (model class, tokenizer class, config class)
# NOTE(review): the 'xlm' entry pairs the XLM model/config with the
# XLM-RoBERTa tokenizer -- confirm which architecture is intended before
# selecting it.
MODEL_CLASSES = {
    'bert': (BertForSequenceClassification, BertTokenizer, BertConfig),
    'xlnet': (XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig),
    'xlm': (XLMForSequenceClassification, XLMRobertaTokenizer, XLMConfig),
    'roberta': (RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig),
    'distilbert': (DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig),
    # The model selected below previously had no registry entry, so a
    # MODEL_CLASSES[model_type] lookup would have raised KeyError.
    'albert': (TFAlbertForSequenceClassification, AlbertTokenizer, AlbertConfig),
}

model_type = 'albert'  # --> CHANGE WHAT MODEL YOU WANT HERE!!! <--###
# Alternative: resolve the classes from the registry instead of the
# hard-coded names below.
# model_class, tokenizer_class, config_class = MODEL_CLASSES[model_type]
# These three hold class *names* (plain strings), not the classes themselves.
config_class = 'AlbertConfig'
model_class = 'TFAlbertForSequenceClassification'
tokenizer_class = 'AlbertTokenizer'
model_name = 'albert-base-v2'
# `from_tf` is an option for loading model weights and has no meaning for a
# tokenizer, so it is no longer forwarded here.
tokenizer = AlbertTokenizer.from_pretrained(model_name)


In [27]:
def input_id_maker(dataf, tokenizer, max_len=512):
    """Tokenise the ``text`` column of ``dataf`` into a padded id matrix.

    Each text is tokenised with ``tokenizer.tokenize``. If it produces more
    than ``max_len - 2`` tokens, only the trailing ``max_len - 2`` tokens are
    kept. The tokens are then wrapped as ``[CLS] ... [SEP]``, converted to
    ids, and right-padded with 0 up to ``max_len``.

    Args:
        dataf: Table with a ``'text'`` column (one document per row).
        tokenizer: Object exposing ``tokenize``, ``convert_tokens_to_ids``,
            ``cls_token`` and ``sep_token``.
        max_len: Total sequence length including the two special tokens.
            Defaults to 512, the value that was previously hard-coded.

    Returns:
        A tuple ``(input_ids, lengths)``: ``input_ids`` is the array returned
        by ``pad_sequences`` with one row of ``max_len`` ids per document;
        ``lengths`` is a list with the unpadded length of every row
        (special tokens included).

    Raises:
        ValueError: If ``max_len`` leaves no room for the two special tokens.
    """
    if max_len < 2:
        raise ValueError("max_len must be at least 2 to fit the CLS and SEP tokens")

    # Loop-invariant lookups, hoisted out of the per-row loop.
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    token_budget = max_len - 2  # room left once CLS and SEP are accounted for

    input_ids = []
    lengths = []
    for text in dataf['text']:
        tokens = tokenizer.tokenize(text)
        if len(tokens) > token_budget:
            # Keep the END of over-long documents (matches truncating="pre").
            tokens = tokens[len(tokens) - token_budget:]

        encoded_sent = tokenizer.convert_tokens_to_ids(
            [cls_token] + tokens + [sep_token])
        input_ids.append(encoded_sent)
        lengths.append(len(encoded_sent))

    # NOTE(review): padding uses a hard-coded id of 0 and att_masking treats
    # every id > 0 as a real token -- confirm the tokenizer's pad id is 0 and
    # that no real token maps to id 0 before switching model families.
    input_ids = pad_sequences(
        input_ids, maxlen=max_len, value=0, dtype="long",
        truncating="pre", padding="post")
    return input_ids, lengths


In [28]:
import time

# Tokenise the three partitions (train, validation, test -- in that order)
# and report the wall-clock time of the whole pass.
start_time = time.time()
(
    (train_input_ids, train_lengths),
    (validation_input_ids, validation_lengths),
    (test_input_ids, test_lengths),
) = (
    input_id_maker(partition, tokenizer)
    for partition in (train_set, validation_set, test_set)
)
print("--- %s seconds ---" % (time.time() - start_time))


Keyword arguments {'add_prefix_space': True} not recognized.
Keyword arguments {'add_prefix_space': True} not recognized.
Keyword arguments {'add_prefix_space': True} not recognized.
Keyword arguments {'add_prefix_space': True} not recognized.
Keyword arguments {'add_prefix_space': True} not recognized.
Keyword arguments {'add_prefix_space': True} not recognized.
Keyword arguments {'add_prefix_space': True} not recognized.
Keyword arguments {'add_prefix_space': True} not recognized.
Keyword arguments {'add_prefix_space': True} not recognized.
Keyword arguments {'add_prefix_space': True} not recognized.
Keyword arguments {'add_prefix_space': True} not recognized.
Keyword arguments {'add_prefix_space': True} not recognized.
Keyword arguments {'add_prefix_space': True} not recognized.
Keyword arguments {'add_prefix_space': True} not recognized.
Keyword arguments {'add_prefix_space': True} not recognized.
Keyword arguments {'add_prefix_space': True} not recognized.
Keyword arguments {'add_

In [22]:
def att_masking(input_ids):
    """Build attention masks for a batch of id sequences.

    For every sequence in ``input_ids`` a same-length list is produced that
    holds 1 where the id is greater than 0 and 0 elsewhere.

    Returns:
        A list of lists of ints (0/1), one inner list per input sequence.
    """
    return [
        [1 if token_id > 0 else 0 for token_id in row]
        for row in input_ids
    ]


In [23]:
# Attention masks for each partition, derived from the padded id matrices.
train_attention_masks, validation_attention_masks, test_attention_masks = (
    att_masking(id_matrix)
    for id_matrix in (train_input_ids, validation_input_ids, test_input_ids)
)

# Integer label vectors for each partition, taken from the `label` column.
train_labels, validation_labels, test_labels = (
    partition['label'].to_numpy().astype('int')
    for partition in (train_set, validation_set, test_set)
)


In [24]:
# Output directory for the preprocessed arrays: <model_type>/<model_name>.
folder = f"{model_type}/{model_name}"

# exist_ok=True makes re-running this cell a no-op and removes the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(folder, exist_ok=True)


In [25]:
# Persist every preprocessed array as `<folder>/<split>_<kind>.npy`,
# in the order train -> validation -> test.
for _stem, _array in (
    ('train_input_ids', train_input_ids),
    ('train_attention_masks', train_attention_masks),
    ('train_labels', train_labels),
    ('validation_input_ids', validation_input_ids),
    ('validation_attention_masks', validation_attention_masks),
    ('validation_labels', validation_labels),
    ('test_input_ids', test_input_ids),
    ('test_attention_masks', test_attention_masks),
    ('test_labels', test_labels),
):
    np.save(f'{folder}/{_stem}.npy', _array)
del _stem, _array  # leave no loop helpers behind in the notebook namespace
