In [1]:
!pip install -q -U tensorflow-text

[K     |████████████████████████████████| 2.6MB 7.2MB/s 
[?25h

In [2]:
!pip install -q -U tf-models-official

[K     |████████████████████████████████| 849kB 5.9MB/s 
[K     |████████████████████████████████| 36.7MB 1.3MB/s 
[K     |████████████████████████████████| 358kB 38.5MB/s 
[K     |████████████████████████████████| 1.1MB 34.2MB/s 
[K     |████████████████████████████████| 102kB 11.3MB/s 
[K     |████████████████████████████████| 174kB 38.5MB/s 
[?25h  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone


In [3]:
import os
import json
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text  # A dependency of the preprocessing model
from official.nlp import optimization
import numpy as np

# Set the TensorFlow logger's level to 'ERROR' to keep cell output short.
tf.get_logger().setLevel('ERROR')

In [4]:
# Pick the tf.distribute strategy matching the accelerator attached to the runtime.
# `os.environ.get` is used rather than indexing: on a runtime without a TPU the
# COLAB_TPU_ADDR variable is absent, and indexing would raise KeyError before the
# GPU branch (or the explanatory ValueError below) could ever be reached.
if os.environ.get('COLAB_TPU_ADDR'):
  cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
  tf.config.experimental_connect_to_cluster(cluster_resolver)
  tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
  strategy = tf.distribute.TPUStrategy(cluster_resolver)
  print('Using TPU')
elif tf.config.list_physical_devices('GPU'):
  # tf.test.is_gpu_available() is deprecated; a non-empty list of physical GPU
  # devices is the supported way to detect a GPU.
  strategy = tf.distribute.MirroredStrategy()
  print('Using GPU')
else:
  raise ValueError('Running on CPU is not recomended.')

Using TPU


In [5]:
# Select BERT model
# TF Hub handle of the encoder that build_classifier_model wraps in a KerasLayer.
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3'
# TF Hub handle of the preprocessing SavedModel; make_bert_preprocess_model
# loads it and uses its `tokenize` and `bert_pack_inputs` functions.
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/1'

In [6]:
# Download dataset
# The training data is a JSONL file; `dataset_path` is the local path returned
# by get_file and is later opened by preprocess_dataset.
url = 'https://raw.githubusercontent.com/CS410Fall2020/ClassificationCompetition/main/data/train.jsonl'
dataset_path = tf.keras.utils.get_file('train.jsonl', url)

Downloading data from https://raw.githubusercontent.com/CS410Fall2020/ClassificationCompetition/main/data/train.jsonl


In [7]:
# Preprocess data into tf.data.Dataset
def preprocess_dataset(dataset_path, split):
  """Processes Twitter sarcasm data into tf.data.Dataset.

  The file is read as UTF-8 JSON Lines (one JSON object per line). Blank or
  whitespace-only lines are skipped instead of crashing the JSON parser.

  Args:
    dataset_path: str path of jsonl dataset.
    split: str designating train or test dataset, either 'train' or 'test'.
      Labels are only read for 'train'; any other value is handled like 'test'.

  Returns:
    A tf.data.Dataset of the Twitter sarcasm data, retaining only the response
    tweet, the last context tweet, and the label if present. The label is 1
    when the record's label is 'SARCASM' and 0 otherwise.

  Raises:
    json.JSONDecodeError: if a non-blank line is not valid JSON.
    KeyError: if a record lacks 'response', 'context', or (for 'train') 'label'.
    IndexError: if a record's 'context' list is empty.
  """
  with_labels = split == 'train'

  # Key order (response, context, label) matches what downstream code sees.
  features = {'response': [], 'context': []}
  if with_labels:
    features['label'] = []

  # Explicit encoding: the tweets contain non-ASCII characters, and the
  # platform default encoding is not guaranteed to be UTF-8.
  with open(dataset_path, 'r', encoding='utf-8') as file:
    for line in file:  # Stream the file; no need to materialise every line.
      if not line.strip():
        continue
      record = json.loads(line)
      features['response'].append(record['response'])
      # Only use last context element for simplicity
      features['context'].append(record['context'][-1])
      if with_labels:
        features['label'].append(1 if record['label'] == 'SARCASM' else 0)

  return tf.data.Dataset.from_tensor_slices(features)

# Build the labelled dataset ('train' split keeps the label) from the downloaded file.
dataset = preprocess_dataset(dataset_path, 'train')

In [8]:
# Shuffle dataset and split into train and validation sets
train_ratio = 0.8
split_seed = 42  # Fixed seed so the train/validation partition is reproducible.
dataset_size = tf.data.experimental.cardinality(dataset).numpy()
num_train_examples = int(train_ratio * dataset_size)

# The shuffle must yield the SAME order every time it is iterated. With the
# default reshuffle_each_iteration=True, take() and skip() would each see a
# fresh permutation on every pass, so the "train" and "validation" sets would
# overlap and change from epoch to epoch, leaking training examples into
# validation.
dataset = dataset.shuffle(
    dataset_size, seed=split_seed, reshuffle_each_iteration=False)
in_memory_train_ds = dataset.take(num_train_examples)
train_size = tf.data.experimental.cardinality(in_memory_train_ds).numpy()
in_memory_val_ds = dataset.skip(num_train_examples)
val_size = tf.data.experimental.cardinality(in_memory_val_ds).numpy()

In [9]:
# BERT preprocessing
def make_bert_preprocess_model(sentence_features, seq_length=128):
  """Builds a Keras Model that turns raw string features into BERT inputs.

  Args:
    sentence_features: list of names of the string-valued features, in the
      order the segments should be packed.
    seq_length: int sequence length handed to the packing function.

  Returns:
    A Keras Model with one string input per entry of sentence_features whose
    output is whatever the preprocessing SavedModel's `bert_pack_inputs`
    returns for the tokenized segments.
  """

  # One scalar string Input per feature, named after the feature itself.
  text_inputs = []
  for feature_name in sentence_features:
    text_inputs.append(
        tf.keras.layers.Input(shape=(), dtype=tf.string, name=feature_name))

  # The preprocessing SavedModel supplies both the tokenizer and the packer.
  preprocessor = hub.load(tfhub_handle_preprocess)

  # Word-piece tokenization, applied to every segment with one shared layer.
  tokenize_layer = hub.KerasLayer(preprocessor.tokenize, name='tokenizer')
  tokenized_segments = [tokenize_layer(text_input) for text_input in text_inputs]

  # No explicit trimming step: the tokenized segments go straight to the
  # packer, which is given seq_length as its only extra argument.
  pack_layer = hub.KerasLayer(
      preprocessor.bert_pack_inputs,
      arguments={'seq_length': seq_length},
      name='packer')
  packed_inputs = pack_layer(tokenized_segments)

  return tf.keras.Model(text_inputs, packed_inputs)

In [10]:
# tf.data's AUTOTUNE constant; load_dataset passes it as the prefetch buffer_size.
AUTOTUNE = tf.data.experimental.AUTOTUNE

def load_dataset(in_memory_ds, split, batch_size, bert_preprocess_model):
  """Turns a dataset of raw examples into batched (BERT inputs, label) pairs.

  Args:
    in_memory_ds: tf.data.Dataset of feature dicts. Every element must carry a
      'label' entry in addition to whatever bert_preprocess_model consumes.
    split: str; 'train' enables shuffling and indefinite repetition, any other
      value keeps the examples in order and finite.
    batch_size: int number of examples per batch.
    bert_preprocess_model: callable applied to each batched feature dict to
      produce the model inputs.

  Returns:
    A tf.data.Dataset yielding (bert_preprocess_model(batch), batch['label'])
    tuples, with cache() and prefetch(AUTOTUNE) applied.

  Raises:
    ValueError: if split is 'train' and the size of in_memory_ds is empty,
      unknown or infinite, so no shuffle buffer size can be derived.
  """
  ds = in_memory_ds
  if split == 'train':
    # Size the shuffle buffer from the dataset actually passed in rather than
    # from a notebook-level global, so the function does not depend on hidden
    # state (or on the order in which cells were executed).
    num_examples = int(tf.data.experimental.cardinality(ds).numpy())
    if num_examples <= 0:
      raise ValueError(
          'Cannot shuffle a training dataset whose size is empty, unknown or '
          f'infinite (cardinality={num_examples}).')
    ds = ds.shuffle(num_examples)
    ds = ds.repeat()
  ds = ds.batch(batch_size)
  # Preprocessing runs on whole batches; the label is passed through unchanged.
  ds = ds.map(lambda x: (bert_preprocess_model(x), x['label']))
  ds = ds.cache().prefetch(buffer_size=AUTOTUNE)
  return ds

In [11]:
# Define model
def build_classifier_model(num_classes):
  """Assembles the BERT encoder plus a dropout and linear classification head.

  Args:
    num_classes: int number of output units of the final Dense layer.

  Returns:
    A Keras Model named 'prediction' that takes a dict with the keys
    'input_word_ids', 'input_mask' and 'input_type_ids' (int32, variable
    length) and returns the un-activated output of the 'classifier' layer.
  """
  # The three int32 inputs are created in this fixed order.
  bert_input_keys = ('input_word_ids', 'input_mask', 'input_type_ids')
  encoder_inputs = {
      key: tf.keras.layers.Input(shape=(None,), dtype=tf.int32)
      for key in bert_input_keys
  }

  # The hub encoder is trainable, i.e. its weights are fine-tuned.
  bert_encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='encoder')
  pooled_output = bert_encoder(encoder_inputs)['pooled_output']

  regularized = tf.keras.layers.Dropout(rate=0.1)(pooled_output)
  # activation=None: the head emits raw logits.
  logits = tf.keras.layers.Dense(
      num_classes, activation=None, name='classifier')(regularized)

  return tf.keras.Model(encoder_inputs, logits, name='prediction')

In [12]:
# Train model
# Hyperparameters: number of passes over the training set, examples per batch,
# and the initial learning rate handed to the optimizer factory.
epochs = 10
batch_size = 32
init_lr = 2e-5

print(f'Fine tuning {tfhub_handle_encoder} model')
# The preprocessing model packs the two features in the order (context, response).
bert_preprocess_model = make_bert_preprocess_model(['context', 'response'])

# Model, optimizer and metric are all created inside the strategy scope.
with strategy.scope():

  # from_logits=True because the classifier head has no activation.
  loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
  metrics = tf.keras.metrics.SparseCategoricalAccuracy('accuracy', dtype=tf.float32)

  # The 'train' dataset repeats indefinitely, so fit() is told how many
  # batches make up one epoch via steps_per_epoch.
  train_dataset = load_dataset(in_memory_train_ds, 'train', batch_size, bert_preprocess_model)
  steps_per_epoch = train_size // batch_size
  num_train_steps = steps_per_epoch * epochs
  # One tenth of the total step count is passed as the warm-up length.
  num_warmup_steps = num_train_steps // 10

  val_dataset = load_dataset(in_memory_val_ds, 'val', batch_size, bert_preprocess_model)
  # Floor division: a trailing partial validation batch is not evaluated.
  val_steps = val_size // batch_size

  # Two output units: index 1 is the 'SARCASM' label, index 0 everything else.
  classifier_model = build_classifier_model(num_classes=2)

  optimizer = optimization.create_optimizer(
      init_lr=init_lr,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      optimizer_type='adamw')

  classifier_model.compile(optimizer=optimizer, loss=loss, metrics=[metrics])

  classifier_model.fit(
      x=train_dataset,
      validation_data=val_dataset,
      steps_per_epoch=steps_per_epoch,
      epochs=epochs,
      validation_steps=val_steps)

Fine tuning https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3 model


  [n for n in tensors.keys() if n not in ref_input_names])


Epoch 1/10


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [13]:
# Export model
# Destination directory of the SavedModel: ./my_models/<saved_model_name>.
main_save_path = './my_models'
saved_model_name = 'cs_410_text_classification_competition'
saved_model_path = os.path.join(main_save_path, saved_model_name)

# Chain preprocessing and the fine-tuned classifier into a single model that
# takes the preprocessing model's own inputs (the raw string features) and
# returns the classifier's output.
preprocess_inputs = bert_preprocess_model.inputs
bert_encoder_inputs = bert_preprocess_model(preprocess_inputs)
bert_outputs = classifier_model(bert_encoder_inputs)
model_for_export = tf.keras.Model(preprocess_inputs, bert_outputs)

print(f'Saving {saved_model_path}')

# Save everything on the Colab host (even the variables from TPU memory)
save_options = tf.saved_model.SaveOptions(experimental_io_device='/job:localhost')
# The optimizer state is left out of the export (include_optimizer=False).
model_for_export.save(saved_model_path, include_optimizer=False, options=save_options)

Saving ./my_models/cs_410_text_classification_competition


In [14]:
# Reload model
# Load the SavedModel written to saved_model_path, using the same
# '/job:localhost' I/O device that was used when saving.
load_options = tf.saved_model.LoadOptions(experimental_io_device='/job:localhost')
reloaded_model = tf.saved_model.load(saved_model_path, options=load_options)

In [15]:
# Download and preprocess test dataset
test_url = 'https://raw.githubusercontent.com/CS410Fall2020/ClassificationCompetition/main/data/test.jsonl'
test_path = tf.keras.utils.get_file('test.jsonl', test_url)

# 'test' split: only the response and last-context features are kept, no label.
test_dataset = preprocess_dataset(test_path, 'test')

Downloading data from https://raw.githubusercontent.com/CS410Fall2020/ClassificationCompetition/main/data/test.jsonl


In [16]:
# Utility methods for testing
def prepare(record):
  """Arranges one processed record in the input layout used for prediction.

  Args:
    record: dict of str Tensors with at least 'context' and 'response' keys.

  Returns:
    A list of lists of str Tensors: the context wrapped in a one-element list,
    followed by the response wrapped the same way.
  """
  # Order matters: context first, response second.
  context_batch = [record['context']]
  response_batch = [record['response']]
  return [context_batch, response_batch]

def get_result(test_row, model):
  """Classifies one Twitter sarcasm test example as sarcasm or not sarcasm.

  Args:
    test_row: list of str Tensors.
    model: TensorFlow SavedModel for the sarcasm classifier.

  Returns:
    A str, 'SARCASM' when the arg-max over axis 1 of the model output is 1 for
    the first example, otherwise 'NOT_SARCASM'.
  """

  # The model is always called with a plain list built from the row.
  scores = model(list(test_row))
  predicted_index = tf.argmax(scores, axis=1)[0]
  return 'SARCASM' if predicted_index == 1 else 'NOT_SARCASM'

def print_result(test_row, model):
  """Prints the context, response and predicted label of one test example.

  The three lines are followed by one empty line.

  Args:
    test_row: list of str Tensors.
    model: TensorFlow SavedModel for the sarcasm classifier.

  Returns:
    None.
  """

  # Predict first, then print, so nothing is printed if prediction fails.
  prediction = get_result(test_row, model)
  print(f'context: {test_row[0]}')
  print(f'response: {test_row[1]}')
  # The doubled newline produces the blank separator line after the record.
  print(f'prediction: {prediction}', end='\n\n')

In [17]:
# Print some sample test set results
test_size = tf.data.experimental.cardinality(test_dataset).numpy()

# Shuffle with a buffer as large as the dataset, reshape each record with
# prepare(), and show the predictions for the first five drawn examples.
for test_row in test_dataset.shuffle(test_size).map(prepare).take(5):
  print_result(test_row, reloaded_model)

context: [b'@USER @USER @USER It \xe2\x80\x99 s obvious I \xe2\x80\x99 m dealing with a double digit IQ . Have a good life .']
response: [b'@USER @USER @USER Hahahahahah What a chump . No testicular fortitude at all . It \xe2\x80\x99 s unsurprising that liberals lose with people like this . <URL>']
prediction: SARCASM

context: [b'@USER @USER asked me to respond to @USER . See attached . Thanks for the opportunity . #KXL <URL>']
response: [b'@USER @USER @USER Imagine that . A politician making baseless accusations . Because has * never * done that before .']
prediction: SARCASM

context: [b'@USER @USER By all means you should initiate another failed impeachment , causing further embarrassment ( if that were even possible ) to your party , then go tear up some official documents like a toddler .']
response: [b'@USER @USER @USER Yet you have no shame in supporting the biggest criminal in White House history .']
prediction: SARCASM

context: [b'@USER @USER Aaaayyyyyeeee I \xe2\x80\x99 m t

In [18]:
# Run model on test dataset and save results to file
# One line per test example, in dataset order: 'twitter_<1-based index>,<label>'.
with open('./answer.txt', 'w') as file:
  for i, test_row in enumerate(test_dataset.map(prepare)):
    label = get_result(test_row, reloaded_model)
    # Write '\n', not os.linesep: the file is opened in text mode, which already
    # translates '\n' to the platform line ending (os.linesep would become
    # '\r\r\n' on Windows).
    file.write(f'twitter_{i + 1},{label}\n')