### Adapted from https://huggingface.co/transformers/custom_datasets.html#seq-imdb

In [1]:
# Download data
!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

--2020-10-07 10:45:13--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2020-10-07 10:45:23 (7.90 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [1]:
from pyspark.sql import SparkSession
from pathlib import Path
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertForSequenceClassification, TFAlbertForSequenceClassification
import tensorflow as tf
import pandas as pd

from cerebro.backend import SparkBackend
from cerebro.keras import SparkEstimator
from cerebro.storage import LocalStore
from cerebro.tune import RandomSearch, GridSearch, hp_choice

In [2]:
# Local Spark session running with three worker threads.
spark = (
    SparkSession.builder
    .master("local[3]")
    .appName("IMDB Sequence Classification")
    .getOrCreate()
)

In [3]:
def _labelled_paths(pattern, label):
    """Return an RDD of [file path, label] pairs for every file matching pattern."""
    # Only the path (first item of each record) is kept; the text is dropped here.
    return spark.sparkContext.wholeTextFiles(pattern).map(lambda entry: [entry[0], label])


# Training reviews: label 1 for the pos/ folder, 0 for the neg/ folder.
pos_rdd = _labelled_paths("aclImdb/train/pos/*.txt", 1)
neg_rdd = _labelled_paths("aclImdb/train/neg/*.txt", 0)
merged_rdd = pos_rdd.union(neg_rdd)

In [4]:
def read_content(x):
    """Load the text of one review file.

    Args:
        x: A two-item sequence ``[location, label]`` where ``location`` is a
            local file path, optionally carrying a leading ``file:`` prefix.

    Returns:
        ``[text, label]`` where ``text`` is the entire content of the file.
    """
    location = x[0]
    # Drop the "file:" scheme (when present) so the rest opens as a local path.
    if location.startswith('file:'):
        location = location[5:]
    # Decode explicitly as UTF-8 instead of relying on the platform default encoding.
    with open(location, 'r', encoding='utf-8') as f:
        return [f.read(), x[1]]

# Swap each [path, label] pair for [review text, label].
merged_rdd = merged_rdd.map(read_content)

In [5]:
# Turn the [text, label] pairs into a two-column DataFrame.
df = merged_rdd.toDF(schema=['text', 'label'])

In [6]:
# Keep a 1% sample of the rows: no replacement, fixed seed.
sample_fraction = 0.01
df = df.sample(withReplacement=False, fraction=sample_fraction, seed=0)

In [7]:
def distilbert_tokenize(rows, max_length=512):
    """Tokenize one partition of (text, label) rows with the DistilBERT tokenizer.

    Every sequence is truncated or padded to exactly ``max_length`` tokens, so
    all partitions produce rows of the same width.

    Args:
        rows: Iterable of rows whose first item is the review text and whose
            second item is the label.
        max_length: Fixed token length of each encoded sequence. Defaults to
            512, the input width declared by the model in this notebook.

    Yields:
        ``Row(input_ids, attention_mask, label)`` for each input row, in order.
    """
    texts = []
    labels = []
    for x in rows:
        texts.append(x[0])
        labels.append(x[1])

    # Nothing to encode for an empty partition; skip loading the tokenizer.
    if not texts:
        return

    # Imported lazily, after the empty check, so an empty partition does no work.
    from transformers import DistilBertTokenizerFast
    from pyspark.sql import Row

    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
    # padding=True would pad only to the longest text in *this* partition, so
    # widths could differ between partitions; pad to a fixed length instead.
    encodings = tokenizer(texts, truncation=True, padding='max_length',
                          max_length=max_length)

    for i in range(len(texts)):
        yield Row(input_ids=encodings['input_ids'][i], attention_mask=encodings['attention_mask'][i],
                  label=labels[i])
    
# Tokenize partition by partition, rebuild a DataFrame from the Rows, and cache it.
df = df.rdd.mapPartitions(distilbert_tokenize).toDF().cache()

In [8]:
# Preview the tokenized rows (defaults spelled out: 20 rows, long cells truncated).
df.show(n=20, truncate=True)

+--------------------+--------------------+-----+
|      attention_mask|           input_ids|label|
+--------------------+--------------------+-----+
|[1, 1, 1, 1, 1, 1...|[101, 2023, 3185,...|    1|
|[1, 1, 1, 1, 1, 1...|[101, 4315, 28681...|    1|
|[1, 1, 1, 1, 1, 1...|[101, 3398, 2469,...|    1|
|[1, 1, 1, 1, 1, 1...|[101, 1045, 1005,...|    1|
|[1, 1, 1, 1, 1, 1...|[101, 1045, 3191,...|    1|
|[1, 1, 1, 1, 1, 1...|[101, 1045, 2018,...|    1|
|[1, 1, 1, 1, 1, 1...|[101, 1045, 3866,...|    1|
|[1, 1, 1, 1, 1, 1...|[101, 3866, 2009,...|    1|
|[1, 1, 1, 1, 1, 1...|[101, 2065, 2017,...|    1|
|[1, 1, 1, 1, 1, 1...|[101, 2023, 16596...|    1|
|[1, 1, 1, 1, 1, 1...|[101, 3374, 1010,...|    1|
|[1, 1, 1, 1, 1, 1...|[101, 3666, 2023,...|    1|
|[1, 1, 1, 1, 1, 1...|[101, 1045, 2347,...|    1|
|[1, 1, 1, 1, 1, 1...|[101, 2054, 2062,...|    1|
|[1, 1, 1, 1, 1, 1...|[101, 2600, 5024,...|    1|
|[1, 1, 1, 1, 1, 1...|[101, 1045, 2428,...|    1|
|[1, 1, 1, 1, 1, 1...|[101, 1045, 3866,...|    1|


In [9]:
# Cerebro backend bound to the running Spark context, with a local store rooted at /tmp.
backend = SparkBackend(spark_context=spark.sparkContext)
store = LocalStore("/tmp")

# Search space with a single candidate learning rate.
search_space = dict(lr=hp_choice([1e-4]))

CEREBRO => Time: 2020-10-07 11:19:17, Running 3 Workers (inferred from spark.default.parallelism)


In [10]:
def estimator_gen_fn(params):
    """Build a Cerebro ``SparkEstimator`` wrapping a DistilBERT binary classifier.

    Args:
        params: Hyperparameter dict; only ``params['lr']`` (the Adam learning
            rate) is read.

    Returns:
        A ``SparkEstimator`` configured with the model, optimizer, loss,
        the ``'acc'`` metric and a batch size of 10.
    """
    import tensorflow as tf
    from tensorflow.keras.layers import Input, Flatten, Dense
    from tensorflow.keras.models import Model
    from transformers import TFDistilBertModel

    CUSTOM_OBJECTS = {'TFDistilBertModel': TFDistilBertModel}

    distilbert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')
    input_ids = Input(shape=(512,), dtype=tf.int64, name='input_ids')
    attention_mask = Input(shape=(512,), dtype=tf.float32, name='attention_mask')

    # [0] selects the first output of the encoder layer
    # (presumably the last hidden state -- confirm for the transformers version in use).
    bert = distilbert_model.distilbert(input_ids, attention_mask=attention_mask)[0]

    flat = Flatten()(bert)
    # 'binary_crossentropy' treats predictions as probabilities by default, so
    # squash the single output unit with a sigmoid instead of emitting a raw logit.
    classifier = Dense(units=1, activation='sigmoid')(flat)

    model = Model(inputs=[input_ids, attention_mask], outputs=classifier)

    # `learning_rate` is the current argument name; `lr` is a deprecated alias.
    optimizer = tf.keras.optimizers.Adam(learning_rate=params['lr'])
    loss = 'binary_crossentropy'

    keras_estimator = SparkEstimator(
        model=model,
        optimizer=optimizer,
        loss=loss,
        metrics=['acc'],
        batch_size=10,
        custom_objects=CUSTOM_OBJECTS)

    return keras_estimator

In [11]:
# Grid search over search_space, scored on 'loss' with 25% of the data held out.
# NOTE(review): the positional 5 is presumably the number of epochs -- confirm against the Cerebro API.
grid_search = GridSearch(
    backend, store, estimator_gen_fn, search_space, 5,
    feature_columns=['input_ids', 'attention_mask'],
    label_columns=['label'],
    evaluation_metric='loss',
    validation=0.25,
)

In [None]:
# Run the grid search on the tokenized DataFrame and keep whatever fit() returns.
model = grid_search.fit(df)

CEREBRO => Time: 2020-10-07 11:19:17, Preparing Data
CEREBRO => Time: 2020-10-07 11:19:17, Num Partitions: 4
CEREBRO => Time: 2020-10-07 11:19:17, Writing DataFrames
CEREBRO => Time: 2020-10-07 11:19:17, Train Data Path: file:///tmp/intermediate_train_data
CEREBRO => Time: 2020-10-07 11:19:17, Val Data Path: file:///tmp/intermediate_val_data
CEREBRO => Time: 2020-10-07 11:20:01, Train Partitions: 3
CEREBRO => Time: 2020-10-07 11:20:03, Val Partitions: 3
CEREBRO => Time: 2020-10-07 11:20:04, Train Rows: 203
CEREBRO => Time: 2020-10-07 11:20:04, Val Rows: 60
CEREBRO => Time: 2020-10-07 11:20:04, Initializing Workers
CEREBRO => Time: 2020-10-07 11:20:05, Initializing Data Loaders
CEREBRO => Time: 2020-10-07 11:20:17, Launching Model Selection Workload


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['activation_13', 'vocab_layer_norm', 'vocab_projector', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
