In [1]:
from transformers import DistilBertTokenizer, TFDistilBertModel
import tensorflow as tf
from multiprocessing import cpu_count, Pool

# Per-process cache of loaded tokenizers, keyed by checkpoint name, so the
# pretrained vocabulary is loaded once instead of on every call.
_TOKENIZER_CACHE = {}


def _get_tokenizer(name='distilbert-base-uncased'):
    """Return the DistilBERT tokenizer for *name*, loading it on first use."""
    tokenizer = _TOKENIZER_CACHE.get(name)
    if tokenizer is None:
        tokenizer = DistilBertTokenizer.from_pretrained(name)
        _TOKENIZER_CACHE[name] = tokenizer
    return tokenizer


def tokenize_single(text):
    """Tokenize *text* with the 'distilbert-base-uncased' tokenizer.

    The text is truncated and padded to exactly 512 tokens so that every
    result has the fixed sequence length that ``create_text_model`` declares
    for its ``input_ids`` / ``attention_mask`` inputs.

    Args:
        text: The text to tokenize (passed straight to the tokenizer).

    Returns:
        The tokenizer's output with TensorFlow tensors
        (``return_tensors="tf"``).
    """
    tokenizer = _get_tokenizer()
    # padding="max_length" (rather than padding=True, which only pads to the
    # longest sequence in the call and therefore does nothing for a single
    # text) guarantees a constant length of 512.
    tokens = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        return_tensors="tf",
        max_length=512,
    )
    return tokens

def tokenize_list(text_list):
    """Tokenize every text in *text_list* in parallel worker processes.

    Args:
        text_list: Iterable of texts; each item is passed to
            ``tokenize_single`` on its own.

    Returns:
        A list with one ``tokenize_single`` result per input text, in the
        same order as the input. An empty input yields an empty list.
    """
    # Materialise once so the length is known (and generators are supported).
    texts = list(text_list)
    if not texts:
        # Nothing to do: avoid the cost of starting worker processes.
        return []

    # Never start more workers than there are texts to process.
    worker_count = min(cpu_count(), len(texts))
    with Pool(worker_count) as pool:
        tokenized_list = pool.map(tokenize_single, texts)
    return tokenized_list

def create_text_model(max_length=512, model_name='distilbert-base-uncased'):
    """Build a Keras model that turns tokenized text into one pooled vector.

    The model feeds ``input_ids`` and ``attention_mask`` through a pretrained
    DistilBERT encoder, takes the first element of its output and applies
    global max pooling over the sequence axis.

    Args:
        max_length: Fixed sequence length of both inputs. Defaults to 512,
            the value previously hard-coded here.
        model_name: Pretrained checkpoint to load. Defaults to
            'distilbert-base-uncased'.

    Returns:
        A ``tf.keras.Model`` with inputs ``[input_ids, attention_mask]`` and
        the max-pooled encoder output as its single output.

    Raises:
        ValueError: If ``max_length`` is smaller than 1, or larger than the
            position limit reported by the loaded model's config.
    """
    if max_length < 1:
        raise ValueError(f"max_length must be >= 1, got {max_length}")

    distilBERT = TFDistilBertModel.from_pretrained(model_name)

    # The encoder cannot embed positions beyond its configured maximum.
    position_limit = getattr(distilBERT.config, 'max_position_embeddings', None)
    if position_limit is not None and max_length > position_limit:
        raise ValueError(
            f"max_length {max_length} exceeds the model's position limit "
            f"{position_limit}"
        )

    input_ids = tf.keras.layers.Input(
        shape=(max_length,), name='input_ids', dtype='int32'
    )
    mask = tf.keras.layers.Input(
        shape=(max_length,), name='attention_mask', dtype='int32'
    )

    # Index 0 of the encoder output is the per-token hidden-state sequence.
    embeddings = distilBERT(input_ids, attention_mask=mask)[0]
    # NOTE(review): the pooling layer is not given the attention mask, so
    # padded positions presumably take part in the max -- confirm this is
    # intended.
    text_out = tf.keras.layers.GlobalMaxPool1D()(embeddings)
    text_model = tf.keras.Model(inputs=[input_ids, mask], outputs=text_out)

    return text_model
