## Set-up Configuration

In [1]:
from dataclasses import dataclass
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from transformers import TFAutoModelWithLMHead, AutoTokenizer
from transformers import pipeline
from pprint import pprint
from transformers import *


@dataclass
class Config:
    """Hyperparameters for the masked-language-model fine-tuning run.

    Every attribute is an annotated dataclass field, so a value can be
    overridden at construction time, e.g. ``Config(MAX_LEN=128)``, while
    ``Config()`` keeps the defaults below.
    """

    MAX_LEN: int = 320  # sequence length passed to the tokenizer and the model input
    BATCH_SIZE: int = 16  # per TPU core
    TOTAL_STEPS: int = 2000  # original author's estimate: approx. 4 epochs
    EVALUATE_EVERY: int = 200
    LR: float = 1e-5
    PRETRAINED_MODEL: str = "bert-base-uncased"  # huggingface bert model


# Single configuration instance read by the rest of the notebook.
flags = Config()
# tf.data autotune constant; not referenced in the cells visible here.
AUTO = tf.data.experimental.AUTOTUNE


"""
## Set-up TPU Runtime
"""


def connect_to_TPU():
    """Pick a distribution strategy for the hardware that is available.

    Returns:
        A ``(tpu, strategy, global_batch_size)`` tuple: the TPU cluster
        resolver (``None`` when resolving raised ``ValueError``), the
        distribution strategy, and ``flags.BATCH_SIZE`` multiplied by the
        number of replicas in sync.
    """
    try:
        # No arguments needed when the TPU_NAME environment variable is set,
        # which is always the case on Kaggle. A ValueError is treated as
        # "no TPU available".
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
        print("Running on TPU ", resolver.master())
    except ValueError:
        resolver = None

    if not resolver:
        # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
        dist_strategy = tf.distribute.get_strategy()
    else:
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        dist_strategy = tf.distribute.experimental.TPUStrategy(resolver)

    # Per-core batch size scaled up to the whole device mesh.
    batch_across_replicas = flags.BATCH_SIZE * dist_strategy.num_replicas_in_sync

    return resolver, dist_strategy, batch_across_replicas


# Resolve the hardware once; `strategy` and `global_batch_size` are reused by
# the model-building and training cells further down.
tpu, strategy, global_batch_size = connect_to_TPU()
print("REPLICAS: ", strategy.num_replicas_in_sync)



Running on TPU  grpc://10.0.0.2:8470
REPLICAS:  8


## Load Data

In [2]:


# Download the hackathon dataset archive into the current working directory.
!wget https://machinehack-be.s3.amazonaws.com/predict_github_issues_embold_sponsored_hackathon/Embold_Participant%27s_Dataset.zip
# Extract the archive into ./Dataset (the apostrophe in the file name is escaped for the shell).
!unzip ./Embold_Participant\'s_Dataset.zip -d Dataset


--2020-10-09 00:12:33--  https://machinehack-be.s3.amazonaws.com/predict_github_issues_embold_sponsored_hackathon/Embold_Participant%27s_Dataset.zip
Resolving machinehack-be.s3.amazonaws.com (machinehack-be.s3.amazonaws.com)... 52.219.64.40
Connecting to machinehack-be.s3.amazonaws.com (machinehack-be.s3.amazonaws.com)|52.219.64.40|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 102320961 (98M) [application/octet-stream]
Saving to: ‘Embold_Participant's_Dataset.zip’


2020-10-09 00:12:45 (8.83 MB/s) - ‘Embold_Participant's_Dataset.zip’ saved [102320961/102320961]

Archive:  ./Embold_Participant's_Dataset.zip
   creating: Dataset/Embold_Participant's_Dataset/
  inflating: Dataset/Embold_Participant's_Dataset/sample submission.csv  
  inflating: Dataset/__MACOSX/Embold_Participant's_Dataset/._sample submission.csv  
  inflating: Dataset/Embold_Participant's_Dataset/embold_train_extra.json  
  inflating: Dataset/__MACOSX/Embold_Participant's_Datase

In [3]:
cd "Dataset/Embold_Participant's_Dataset/"

/kaggle/working/Dataset/Embold_Participant's_Dataset


In [4]:

# Load the dataset splits shipped in the archive (paths are relative to the
# current working directory).
train_df = pd.read_json("embold_train.json").reset_index(drop=True)
test_df = pd.read_json("embold_test.json").reset_index(drop=True)
train_ex_df = pd.read_json("embold_train_extra.json")

# DataFrame.append is deprecated (and removed in pandas 2.0); pd.concat is the
# supported equivalent and, like append, keeps each frame's own row labels.
train_data = pd.concat([train_df, train_ex_df])

# Title and body joined by a single space form the text used for MLM training.
test_df['text'] = test_df['title'] + ' ' + test_df['body']
train_data['text'] = train_data['title'] + ' ' + train_data['body']

# Only the raw text is needed downstream, so train and test text are pooled.
data = pd.concat([train_data[['text']], test_df[['text']]])


In [5]:
# Sanity check: the pooled text is still a DataFrame.
type(data)

pandas.core.frame.DataFrame

In [6]:
# Cap the corpus at 200,000 rows. The fixed seed makes the subset reproducible
# across kernel restarts, and min() avoids a ValueError when fewer rows exist.
data = data.sample(n=min(200000, len(data)), random_state=42)

In [7]:
# Preview the first rows of the sampled text.
data.head()

Unnamed: 0,text
19299,fix \ rubocop/style/numericpredicate\ issue i...
60511,xl: server crash during cp using aws cli may ...
190768,cattle_forward chain gets reordered on docker ...
41751,--remote-name doesn't honor the filename after...
37986,add paging to all other pages not just book l...


## Prepare Masked Language Dataset

In [8]:


def regular_encode(texts, tokenizer, maxlen=512):
    """Tokenize a batch of texts into a fixed-width array of input ids.

    Args:
        texts: List of strings to encode.
        tokenizer: Hugging Face tokenizer exposing ``batch_encode_plus``.
        maxlen: Length every sequence is padded or truncated to.

    Returns:
        ``np.ndarray`` built from the tokenizer's ``input_ids``; one row per
        text, ``maxlen`` columns.
    """
    enc_di = tokenizer.batch_encode_plus(
        texts,
        return_attention_mask=False,
        return_token_type_ids=False,
        # `padding="max_length"` replaces the deprecated
        # `pad_to_max_length=True`; it belongs to the same API generation as
        # the `truncation` argument used below.
        padding="max_length",
        max_length=maxlen,
        truncation=True,
    )

    return np.array(enc_di['input_ids'])

# AutoTokenizer is imported explicitly at the top of the notebook and picks the
# tokenizer class that matches flags.PRETRAINED_MODEL, mirroring how the model
# itself is loaded through an Auto class. use_fast=True requests the fast
# tokenizer implementation, as BertTokenizerFast did.
tokenizer = AutoTokenizer.from_pretrained(flags.PRETRAINED_MODEL, use_fast=True)
X_data = regular_encode(data.text.values.tolist(), tokenizer, maxlen=flags.MAX_LEN)


def prepare_mlm_input_and_labels(X, mlm_tokenizer=None):
    """Build masked-language-model inputs and labels from encoded token ids.

    About 15% of the non-special positions are selected. Of the selected
    positions roughly 80% are replaced by the mask token, 10% by a random
    non-special token and 10% are left unchanged. Labels hold the original id
    at every selected position and -1 (meaning "ignore") everywhere else.

    Args:
        X: Integer array of token ids. It is not modified.
        mlm_tokenizer: Object exposing ``all_special_ids``, ``mask_token_id``
            and ``vocab_size``. Defaults to the notebook-level ``tokenizer``.

    Returns:
        Tuple ``(X_mlm, labels)`` of arrays with the same shape as ``X``.
    """
    tok = tokenizer if mlm_tokenizer is None else mlm_tokenizer
    special_ids = np.asarray(tok.all_special_ids)

    # 15% BERT masking
    inp_mask = np.random.rand(*X.shape) < 0.15
    # Do not mask special tokens. Their ids are taken from the tokenizer
    # because they are not the smallest ids in every vocabulary: with
    # bert-base-uncased, [CLS] and [SEP] sit above id 100, so a plain
    # `X <= 2` test would leave them eligible for masking.
    inp_mask &= ~np.isin(X, special_ids)

    # Targets default to -1 (ignored by the loss); selected positions keep
    # their original token id.
    labels = np.full(X.shape, -1, dtype=int)
    labels[inp_mask] = X[inp_mask]

    # Prepare the corrupted input on a copy so the caller's array is untouched.
    X_mlm = np.copy(X)
    # 90% of the selected positions become [MASK]; the other 10% stay unchanged.
    inp_mask_2mask = inp_mask & (np.random.rand(*X.shape) < 0.90)
    X_mlm[inp_mask_2mask] = tok.mask_token_id

    # 1/9 of those 90% (i.e. 10% of the selected positions) are overwritten
    # with a random token drawn from the whole vocabulary minus special tokens.
    inp_mask_2random = inp_mask_2mask & (np.random.rand(*X.shape) < 1 / 9)
    candidate_ids = np.setdiff1d(np.arange(tok.vocab_size), special_ids)
    X_mlm[inp_mask_2random] = np.random.choice(
        candidate_ids, size=int(inp_mask_2random.sum())
    )

    return X_mlm, labels


# X_data holds the encoded text sampled from the pooled train, extra-train and
# test sets; np.vstack returns it as a single 2-D array.
X_train_mlm = np.vstack(X_data)
# Corrupted inputs and their labels (-1 marks positions the loss ignores).
X_train_mlm, y_train_mlm = prepare_mlm_input_and_labels(X_train_mlm)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [9]:
# Inputs and labels must have identical shapes.
X_train_mlm.shape, y_train_mlm.shape

((200000, 320), (200000, 320))

## Create MaskedLanguageModel using huggingface transformers

In [10]:

def masked_sparse_categorical_crossentropy(y_true, y_pred):
    """Sparse categorical cross-entropy restricted to labelled positions.

    Positions whose label equals -1 are dropped from both tensors before the
    loss is computed; ``y_pred`` is interpreted as logits.

    Returns:
        The loss values for the kept positions; no reduction is applied here.
    """
    # True wherever a real target id is present.
    keep = tf.not_equal(y_true, -1)
    return tf.keras.losses.sparse_categorical_crossentropy(
        tf.boolean_mask(y_true, keep),
        tf.boolean_mask(y_pred, keep),
        from_logits=True,
    )

class MaskedLanguageModel(tf.keras.Model):
  """Keras model whose training step optimises the masked-token loss.

  Relies on two notebook-level names: ``loss_tracker`` (a Keras ``Mean``
  metric) and ``masked_sparse_categorical_crossentropy``.
  """

  @property
  def metrics(self):
    # Exposing the tracker here lets Keras reset it at the start of every
    # epoch. Without this the reported loss is a running mean over all epochs
    # since the tracker was created.
    return [loss_tracker]

  def train_step(self, inputs):
    """Run one optimisation step on a ``(features, labels)`` batch.

    Returns:
      Dict mapping ``"loss"`` to the current value of the loss tracker.
    """
    features, labels = inputs

    with tf.GradientTape() as tape:
      # The wrapped model returns a tuple; element 0 holds the token logits.
      predictions = self(features, training=True)[0]
      loss = masked_sparse_categorical_crossentropy(labels, predictions)

    # Compute gradients
    trainable_vars = self.trainable_variables
    gradients = tape.gradient(loss, trainable_vars)

    # Update weights
    self.optimizer.apply_gradients(zip(gradients, trainable_vars))

    # Compute our own metrics
    loss_tracker.update_state(loss)

    # Return a dict mapping metric names to current value
    return {"loss": loss_tracker.result()}

# Everything that creates variables (metric, model weights, optimizer) is
# built inside the distribution strategy's scope.
with strategy.scope():
  # Running mean of the training loss, updated by MaskedLanguageModel.train_step.
  loss_tracker = tf.keras.metrics.Mean(name="loss")
  # Fixed-length integer token ids, matching the MAX_LEN used for encoding.
  input_layer = tf.keras.layers.Input((flags.MAX_LEN, ), dtype=tf.int32)
  # Pretrained model with its language-modelling head.
  bert_model = TFAutoModelWithLMHead.from_pretrained(flags.PRETRAINED_MODEL)
  output_layer = bert_model(input_layer)
  # Functional wrapper that adds the custom train_step around the model.
  mlm_model = MaskedLanguageModel(input_layer, output_layer)

  optimizer = tf.keras.optimizers.Adam(learning_rate=flags.LR)
  # No loss is passed to compile(): the loss is computed inside train_step.
  mlm_model.compile(optimizer=optimizer)

mlm_model.summary()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=536063208.0, style=ProgressStyle(descri…


Model: "masked_language_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 320)]             0         
_________________________________________________________________
tf_bert_for_masked_lm (TFBer ((None, 320, 30522),)     110104890 
Total params: 110,104,890
Trainable params: 110,104,890
Non-trainable params: 0
_________________________________________________________________


## Train and Save

In [11]:

# Train on the corrupted inputs; batch_size is the global batch
# (per-core batch size x number of replicas).
mlm_model.fit(X_train_mlm, y_train_mlm, epochs=3, batch_size=global_batch_size)


Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f39b77998d0>

In [12]:
# -p makes the cell safe to re-run: no error if the directory already exists.
!mkdir -p github_bert_uncased

In [13]:
# Save the trained model using transformers' .save_pretrained(). Only
# bert_model (the wrapped Hugging Face model) is written, not the Keras wrapper.
bert_model.save_pretrained("./github_bert_uncased/")

In [14]:
# List the files written by save_pretrained().
!ls github_bert_uncased/

config.json  tf_model.h5
