### Adapted from https://huggingface.co/transformers/custom_datasets.html#seq-imdb

In [1]:
from pathlib import Path
from sklearn.model_selection import train_test_split
import tensorflow as tf

from pyspark.sql import SparkSession
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertForSequenceClassification
import tensorflow as tf
import pandas as pd
import numpy as np

from cerebro.backend import SparkBackend
from cerebro.keras import SparkEstimator
from cerebro.storage import LocalStore
from cerebro.tune import RandomSearch, GridSearch, hp_choice

def read_imdb_split(split_dir):
    """Read one split of the aclImdb dataset into parallel text/label lists.

    Args:
        split_dir: Path (``str`` or ``Path``) of a directory that contains
            ``pos`` and ``neg`` sub-directories with one review per file.

    Returns:
        A ``(texts, labels)`` tuple of equal-length lists. ``texts`` holds the
        file contents; ``labels`` holds ``1`` for files under ``pos`` and ``0``
        for files under ``neg``. All ``pos`` entries come first, and files
        inside each sub-directory are visited in sorted path order.
    """
    split_dir = Path(split_dir)
    texts = []
    labels = []
    for label_dir in ["pos", "neg"]:
        # Compare by value: `is` on a string literal only works through
        # interpreter interning and triggers a SyntaxWarning on Python 3.8+.
        label = 0 if label_dir == "neg" else 1
        # Sort so the result does not depend on filesystem enumeration order.
        for text_file in sorted((split_dir / label_dir).iterdir()):
            # Decode explicitly as UTF-8 instead of relying on the locale default.
            texts.append(text_file.read_text(encoding="utf-8"))
            labels.append(label)

    return texts, labels

In [2]:
# Fraction of the training data to keep; the data-loading cells subsample only when this is below 1.0.
sample_fraction = 1.0

### 1. Download data

In [3]:
#!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
#!tar -xf aclImdb_v1.tar.gz

### 2. Using HuggingFace DistilBERT model with TF on a single node

In [7]:
# Load the IMDB training reviews and, when sample_fraction < 1.0, keep only that fraction of them.
train_texts, train_labels = read_imdb_split('aclImdb/train')
if sample_fraction < 1.0:
    train_texts, _, train_labels, _ = train_test_split(train_texts, train_labels, test_size=1. - sample_fraction)

# Hold out 25% of the (possibly subsampled) data for validation.
# NOTE(review): no random_state is passed, so the split presumably differs between runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.25)
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenize both splits with truncation and padding enabled.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

# Pair each encoding dict with its label as a tf.data dataset; batching is applied where fit() is called.
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
))
val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_labels
))

In [8]:
# Instantiate the DistilBERT sequence-classification model from the 'distilbert-base-uncased' checkpoint.
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# Compile with the loss function exposed by the model itself and track accuracy.
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['acc'])

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_projector', 'vocab_transform', 'activation_13', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier', 'dropout_19', 'pre_classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

In [9]:
# Train for 3 epochs with batches of 16, using the held-out split as validation data.
model.fit(train_dataset.batch(16), epochs=3, validation_data=val_dataset.batch(16))

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f4e97c3feb8>

### 3. Using HuggingFace DistilBERT model with Cerebro for Distributed Model Selection

In [3]:
# If GPU memory runs out restart the notebook and only run the imports
# and Cerebro section (3.)

In [4]:
# "local[1]" is only a placeholder: replace it with the URL of the Spark
# master you actually want to connect to.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("IMDB Sequence Classification")
    .getOrCreate()
)

In [5]:
# Reload and tokenize the training data for the Cerebro section. This repeats the loading steps of
# section 2 so that this section can be run on its own after a kernel restart.
train_texts, train_labels = read_imdb_split('aclImdb/train')
if sample_fraction < 1.0:
    train_texts, _, train_labels, _ = train_test_split(train_texts, train_labels, test_size=1. - sample_fraction)

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# No manual train/validation split here: the validation fraction is given to GridSearch instead.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)

In [6]:
# Assemble the token ids, attention masks and labels column-wise in pandas,
# then convert the result into a Spark DataFrame for Cerebro.
df = spark.createDataFrame(
    pd.DataFrame(
        {
            'input_ids': train_encodings['input_ids'],
            'attention_mask': train_encodings['attention_mask'],
            'label': train_labels,
        }
    )
)

In [7]:
# Cerebro backend bound to the Spark context of the session created earlier, configured with one worker.
backend = SparkBackend(spark_context=spark.sparkContext, num_workers=1, verbose=0)
# NOTE(review): hard-coded, user-specific absolute path; point this at a directory that exists on your machine.
store = LocalStore("/users/snakanda/cerista")

# Define more parameters if you want to try more model configurations.
# Currently a single learning-rate choice, i.e. one model configuration.
search_space = {'lr': hp_choice([5e-5])}

In [8]:
def estimator_gen_fn(params):
    """Build a Cerebro ``SparkEstimator`` around DistilBERT for one hyperparameter configuration.

    Args:
        params: Mapping of hyperparameter names to values. Only ``params['lr']``
            is read; it is used as the Adam learning rate.

    Returns:
        A ``SparkEstimator`` holding a functional Keras model that takes
        ``input_ids`` and ``attention_mask`` (each of length 512) and outputs
        classification logits, together with the optimizer, loss, metric and
        a batch size of 16.
    """
    # Imported locally, like the other dependencies below, so the function
    # does not rely on notebook-global names.
    import tensorflow as tf
    from tensorflow.keras.layers import Input
    from tensorflow.keras.models import Model
    from transformers import TFDistilBertForSequenceClassification

    # TFDistilBertForSequenceClassification model is not directly serializable. Hence we recreate the model
    # and wrap it using a serializable Keras model. Check `call` method of TFDistilBertForSequenceClassification
    # class for more details
    distilbert_model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

    # Inputs
    input_ids = Input(shape=(512,), dtype=tf.int64)
    attention_mask = Input(shape=(512,), dtype=tf.int64)

    # Re-wire the pretrained sub-layers by hand: encoder -> first-token vector -> pre-classifier -> dropout -> classifier.
    # NOTE(review): training=False is hard-wired for the encoder and the dropout layer, so dropout
    # appears to be disabled during fitting as well; confirm this is intended.
    hidden_state = distilbert_model.distilbert(input_ids, attention_mask=attention_mask, training=False)[0]
    pooled_output = hidden_state[:, 0]
    pooled_output = distilbert_model.pre_classifier(pooled_output)
    pooled_output = distilbert_model.dropout(pooled_output, training=False)
    logits = distilbert_model.classifier(pooled_output)

    model = Model(inputs=[input_ids, attention_mask], outputs=logits)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias that newer Keras versions reject.
    optimizer = tf.keras.optimizers.Adam(learning_rate=params['lr'])
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

    CUSTOM_OBJECTS = {'TFDistilBertForSequenceClassification': TFDistilBertForSequenceClassification}

    keras_estimator = SparkEstimator(
        model=model,
        optimizer=optimizer,
        loss=loss_fn,
        metrics=['acc'],
        batch_size=16,
        custom_objects=CUSTOM_OBJECTS)

    return keras_estimator

In [9]:
# Grid search over search_space, building one estimator per configuration with estimator_gen_fn.
# NOTE(review): the positional 3 is presumably the number of epochs (the training log shows three epochs);
# confirm against the Cerebro GridSearch signature.
grid_search = GridSearch(backend, store, estimator_gen_fn, search_space, 3,
                         validation=0.25, evaluation_metric='loss',
                         feature_columns=['input_ids', 'attention_mask'],
                         label_columns=['label'],
                         verbose=1)

In [10]:
# Run model selection on the Spark DataFrame built above.
model = grid_search.fit(df)

# Or use the following method if the data is already materialized.
# model = grid_search.fit_on_prepared_data()

CEREBRO => Time: 2020-10-08 22:50:04, Preparing Data
CEREBRO => Time: 2020-10-08 22:50:35, Initializing Workers
CEREBRO => Time: 2020-10-08 22:50:36, Initializing Data Loaders
CEREBRO => Time: 2020-10-08 22:50:36, Launching Model Selection Workload


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['activation_13', 'vocab_layer_norm', 'vocab_transform', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['dropout_19', 'classifier', 'pre_classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

CEREBRO => Time: 2020-10-08 22:50:42, Model: model_0_1602215442, lr: 5e-05
CEREBRO => Time: 2020-10-08 23:07:14, Model: model_0_1602215442, Epoch: 1, train_loss: 3.803926835636247, train_acc: 0.9051520824432373, val_loss: 4.237147256769521, val_acc: 0.8874370455741882
CEREBRO => Time: 2020-10-08 23:20:26, Model: model_0_1602215442, Epoch: 2, train_loss: 2.160089514045094, train_acc: 0.9497643709182739, val_loss: 4.144298793687028, val_acc: 0.9039672613143921
CEREBRO => Time: 2020-10-08 23:33:36, Model: model_0_1602215442, Epoch: 3, train_loss: 0.9819089077227935, train_acc: 0.9791131019592285, val_loss: 4.785402235516373, val_acc: 0.9039672613143921
CEREBRO => Time: 2020-10-08 23:33:52, Terminating Workers
