*   **Use iitp.baban Google Drive**
*   **SEED = 42** 




# Imports

In [0]:
# Global random seed; passed as random_state to the train_test_split calls below.
SEED = 42


from google.colab import drive
from google.colab import files

%tensorflow_version 1.x
import tensorflow as tf

!pip install bert-tensorflow
import bert
from bert import run_classifier

from bert import optimization
from bert import tokenization

import pandas as pd
import os
import re
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [0]:
# Mount Google Drive under /content/gdrive (a later cell loads a hidden-context .npy file from it).
drive.mount('/content/gdrive')

# BERT Pretrained Model Download 

In [0]:
!wget https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip
!unzip multi_cased_L-12_H-768_A-12.zip

# Dataset Loading (Text)

In [0]:
!wget https://dl.fbaipublicfiles.com/XNLI/XNLI-1.0.zip
!unzip XNLI-1.0.zip

In [0]:
# Load the tab-separated XNLI test file; later cells filter it by its `language` column.
xnli_test_path = 'XNLI-1.0/xnli.test.tsv'
df = pd.read_csv(xnli_test_path, sep='\t')

In [0]:
# First 5000 French rows of the XNLI test file.
is_french = df['language'] == 'fr'
df_fr = df.loc[is_french].head(5000)

In [0]:
# First 5000 German rows of the XNLI test file.
is_german = df['language'] == 'de'
df_de = df.loc[is_german].head(5000)

In [0]:
# First 5000 Turkish rows of the XNLI test file.
is_turkish = df['language'] == 'tr'
df_tr = df.loc[is_turkish].head(5000)

In [0]:
# First 5000 Bulgarian rows of the XNLI test file.
is_bulgarian = df['language'] == 'bg'
df_bg = df.loc[is_bulgarian].head(5000)

In [0]:
# 90/10 train/test split per language, all with the same options.
# NOTE(review): the shared seed presumably keeps the splits row-aligned across
# languages (get_data asserts that the gold labels of a pair match) - confirm.
split_options = dict(test_size=0.1, random_state=SEED, shuffle=True)
df_fr_train, df_fr_test = train_test_split(df_fr, **split_options)
df_de_train, df_de_test = train_test_split(df_de, **split_options)
df_tr_train, df_tr_test = train_test_split(df_tr, **split_options)
df_bg_train, df_bg_test = train_test_split(df_bg, **split_options)

In [0]:
def get_data(a,b,lang_pair):
  """Builds a cross-lingual sentence-pair frame from two label-aligned frames.

  Args:
    a: DataFrame providing the `sentence1` and `gold_label` columns.
    b: DataFrame providing the `sentence2` and `gold_label` columns; its gold
      labels must equal those of `a` row for row.
    lang_pair: string of the form '<lang1>-<lang2>', used to name the columns.

  Returns:
    DataFrame with columns `sentence1_<lang1>`, `sentence2_<lang2>` and
    `label`, where label is 0 (contradiction), 1 (neutral) or 2 (entailment).

  Raises:
    AssertionError: if the gold labels of `a` and `b` differ.
    ValueError: if a gold label is not one of the three known classes.
  """
  label_to_id = {'contradiction': 0, 'neutral': 1, 'entailment': 2}

  pairs = lang_pair.split('-')
  gold_labels = list(a['gold_label'])
  assert gold_labels == list(b['gold_label']), 'gold labels of the two frames are not aligned'

  # Fail loudly on an unexpected label: skipping it would leave `label` shorter
  # than the sentence columns and only surface as a pandas length error.
  unknown = sorted({str(label) for label in gold_labels if label not in label_to_id})
  if unknown:
    raise ValueError('unexpected gold_label value(s): {}'.format(unknown))

  sentence1_col = 'sentence1_{}'.format(pairs[0])
  sentence2_col = 'sentence2_{}'.format(pairs[1])
  raw_data = {
      sentence1_col: list(a['sentence1']),
      sentence2_col: list(b['sentence2']),
      'label': [label_to_id[label] for label in gold_labels],
  }
  return pd.DataFrame(raw_data, columns=[sentence1_col, sentence2_col, 'label'])

def get_features(lang_pair):
  """Tokenises the train and test sentence pairs of one language pair for BERT.

  Args:
    lang_pair: '<lang1>-<lang2>' with each language one of 'fr', 'de', 'tr',
      'bg'. sentence1 is taken from lang1 and sentence2 from lang2.

  Returns:
    Dict with two entries, '<lang_pair>_train' and '<lang_pair>_test', each the
    result of `bert.run_classifier.convert_examples_to_features`.

  Raises:
    KeyError: if either language code is not in the table below.
  """
  MAX_SEQ_LENGTH = 128
  vocab_file = "multi_cased_L-12_H-768_A-12/vocab.txt"
  # get_data only emits labels 0-2; the fourth id is kept so the label list
  # still matches num_labels=4 used by train()/train_img().
  label_list = [0,1,2,3]
  lang_dict = {'fr':[df_fr_train,df_fr_test],'de':[df_de_train,df_de_test],'tr':[df_tr_train,df_tr_test],'bg':[df_bg_train,df_bg_test]}

  pairs = lang_pair.split('-')
  sentence1_col = 'sentence1_{}'.format(pairs[0])
  sentence2_col = 'sentence2_{}'.format(pairs[1])

  # The checkpoint is the *cased* multilingual model, so the tokenizer must not
  # lower-case (and thereby strip accents from) the input text.
  tokenizer = bert.tokenization.FullTokenizer(
      vocab_file=vocab_file, do_lower_case=False)

  def to_example(row):
    # One InputExample per row: sentence1 as text_a, sentence2 as text_b.
    return bert.run_classifier.InputExample(guid=None,
                                            text_a = row[sentence1_col],
                                            text_b = row[sentence2_col],
                                            label = row['label'])

  features = {}
  # Index 0 of each lang_dict entry is the train frame, index 1 the test frame.
  for split_index, split_name in enumerate(('train', 'test')):
    pair_frame = get_data(lang_dict[pairs[0]][split_index],
                          lang_dict[pairs[1]][split_index],
                          lang_pair)
    examples = pair_frame.apply(to_example, axis = 1)
    features[lang_pair + '_' + split_name] = bert.run_classifier.convert_examples_to_features(
        examples, label_list, MAX_SEQ_LENGTH, tokenizer)
  return features

In [0]:
# Build tokenised train/test features pairing French sentence1 with Bulgarian sentence2.
features = get_features('fr-bg')

In [0]:
# Inspect the keys returned by get_features: '<lang_pair>_train' and '<lang_pair>_test'.
features.keys()

# CLTE-BERT Custom Model Definition

In [0]:
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings):
  """Builds BERT with a linear softmax classifier on top of the pooled output.

  Args:
    bert_config: `BertConfig` describing the encoder.
    is_training: enables dropout in BERT and on the pooled output.
    input_ids, input_mask, segment_ids: token-level inputs forwarded to
      `BertModel`.
    labels: integer class ids, one-hot encoded to `num_labels` classes.
    num_labels: number of output classes.
    use_one_hot_embeddings: forwarded to `BertModel`.

  Returns:
    Tuple `(loss, per_example_loss, logits, probabilities, predicted_labels,
    output_layer)`. `output_layer` is the pooled BERT output (after dropout
    when `is_training`); model_fn exposes it as "hidden_context".
  """
  model = bert.run_classifier.modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  # Classification uses the pooled, whole-segment representation. For
  # token-level output use model.get_sequence_output() instead.
  output_layer = model.get_pooled_output()
  hidden_size = output_layer.shape[-1].value

  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))

  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):
    if is_training:
      # I.e., 0.1 dropout
      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    probabilities = tf.nn.softmax(logits, axis=-1)
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    # No tf.squeeze here: argmax over the last axis already removes the class
    # axis, and squeezing would also remove the batch axis for a batch that
    # holds a single example, which breaks Estimator.predict.
    predicted_labels = tf.argmax(log_probs, axis=-1, output_type=tf.int32)

    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

    # Cross-entropy against the one-hot gold labels.
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)

    return (loss, per_example_loss, logits, probabilities,predicted_labels,output_layer)




def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate,
                     num_train_steps, num_warmup_steps, use_tpu,
                     use_one_hot_embeddings):
  """Returns a `model_fn` closure for `tf.estimator.Estimator`.

  The closure builds the classifier through `create_model`, optionally warm
  starts it from `init_checkpoint`, and returns an `EstimatorSpec` for the
  requested mode. In predict mode the spec exposes "probabilities", "labels"
  and "hidden_context".
  """

  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """Builds the graph and the `EstimatorSpec` for one estimator mode."""

    tf.logging.info("*** Features ***")
    for feature_name in sorted(features.keys()):
      tf.logging.info("  name = %s, shape = %s" % (feature_name, features[feature_name].shape))

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]

    # Per-example weights for the eval metrics; every example counts fully
    # unless the input supplies an explicit "is_real_example" column.
    if "is_real_example" in features:
      example_weights = tf.cast(features["is_real_example"], dtype=tf.float32)
    else:
      example_weights = tf.ones(tf.shape(label_ids), dtype=tf.float32)

    training = (mode == tf.estimator.ModeKeys.TRAIN)

    (total_loss, per_example_loss, logits, probabilities,
     predicted_labels, hidden_context) = create_model(
         bert_config, training, input_ids, input_mask, segment_ids, label_ids,
         num_labels, use_one_hot_embeddings)

    trainable_vars = tf.trainable_variables()
    if init_checkpoint:
      assignment_map, _ = bert.run_classifier.modeling.get_assignment_map_from_checkpoint(
          trainable_vars, init_checkpoint)
      # NOTE: with use_tpu=True nothing is initialised from the checkpoint
      # here; only the non-TPU path warm-starts the variables.
      if not use_tpu:
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    if mode == tf.estimator.ModeKeys.TRAIN:
      train_op = optimization.create_optimizer(
          total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)
      return tf.estimator.EstimatorSpec(
          mode=mode,
          loss=total_loss,
          train_op=train_op)

    if mode == tf.estimator.ModeKeys.EVAL:
      eval_predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
      eval_accuracy = tf.metrics.accuracy(
          labels=label_ids, predictions=eval_predictions, weights=example_weights)
      eval_loss = tf.metrics.mean(values=per_example_loss, weights=example_weights)
      return tf.estimator.EstimatorSpec(
          mode=mode,
          loss=total_loss,
          eval_metric_ops={
              "eval_accuracy": eval_accuracy,
              "eval_loss": eval_loss
          })

    # Any other mode is treated as prediction.
    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions={"probabilities": probabilities,"labels": predicted_labels, "hidden_context": hidden_context})

  return model_fn

# CLTE-Progressive-BERT Custom Model Definition

In [0]:
def create_model_progressive(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings,hidden_context):
  """Builds the BERT classifier with an extra KL term towards `hidden_context`.

  Besides the usual cross-entropy on the class logits, the pooled output and
  the supplied `hidden_context` are both passed through a softmax and their
  KL divergence, scaled by 0.5, is added to the per-example loss.

  Args:
    bert_config: `BertConfig` describing the encoder.
    is_training: enables dropout in BERT and on the pooled output.
    input_ids, input_mask, segment_ids: token-level inputs forwarded to
      `BertModel`.
    labels: integer class ids, one-hot encoded to `num_labels` classes.
    num_labels: number of output classes.
    use_one_hot_embeddings: forwarded to `BertModel`.
    hidden_context: per-example reference vectors compared with the pooled
      output. NOTE(review): presumably the "hidden_context" predicted by a
      previously trained model - confirm against the caller.

  Returns:
    Tuple `(loss, per_example_loss, logits, probabilities, predicted_labels)`.
    Both loss values include the weighted KL term in every mode.
  """
  model = bert.run_classifier.modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  # Classification uses the pooled, whole-segment representation. For
  # token-level output use model.get_sequence_output() instead.
  output_layer = model.get_pooled_output()

  hidden_size = output_layer.shape[-1].value

  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))

  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):
    if is_training:
      # I.e., 0.1 dropout
      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    # Distillation term: KL(softmax(hidden_context) || softmax(pooled output)),
    # i.e. sum(y_true * log(y_true / y_pred)) per example.
    output_layer_probs = tf.nn.softmax(output_layer,axis = -1)
    hidden_context = tf.nn.softmax(hidden_context,axis = -1)
    per_example_kd_loss = tf.keras.losses.KLD(hidden_context,output_layer_probs)

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    probabilities = tf.nn.softmax(logits, axis=-1)
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    # No tf.squeeze here: argmax over the last axis already removes the class
    # axis, and squeezing would also remove the batch axis for a batch that
    # holds a single example, which breaks Estimator.predict.
    predicted_labels = tf.argmax(log_probs, axis=-1, output_type=tf.int32)

    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

    # Cross-entropy against the one-hot gold labels.
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)

    kd_loss_weight = 0.5 #hyperparameter
    per_example_kd_loss = kd_loss_weight*per_example_kd_loss

    per_example_loss += per_example_kd_loss

    loss = tf.reduce_mean(per_example_loss)

    return (loss, per_example_loss, logits, probabilities,predicted_labels)




def model_fn_builder_progressive(bert_config, num_labels, init_checkpoint, learning_rate,
                     num_train_steps, num_warmup_steps, use_tpu,
                     use_one_hot_embeddings):
  """Returns a `model_fn` closure for the progressive (hidden-context) model.

  The closure reads "hidden_context" from the input features in every mode,
  builds the graph through `create_model_progressive`, optionally warm starts
  it from `init_checkpoint`, and returns an `EstimatorSpec` for the requested
  mode. In predict mode the spec exposes "probabilities" and "labels".
  """

  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """Builds the graph and the `EstimatorSpec` for one estimator mode."""

    tf.logging.info("*** Features ***")
    for feature_name in sorted(features.keys()):
      tf.logging.info("  name = %s, shape = %s" % (feature_name, features[feature_name].shape))

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]
    hidden_context = features["hidden_context"]

    # Per-example weights for the eval metrics; every example counts fully
    # unless the input supplies an explicit "is_real_example" column.
    if "is_real_example" in features:
      example_weights = tf.cast(features["is_real_example"], dtype=tf.float32)
    else:
      example_weights = tf.ones(tf.shape(label_ids), dtype=tf.float32)

    training = (mode == tf.estimator.ModeKeys.TRAIN)

    (total_loss, per_example_loss, logits, probabilities,
     predicted_labels) = create_model_progressive(
         bert_config, training, input_ids, input_mask, segment_ids, label_ids,
         num_labels, use_one_hot_embeddings,hidden_context)

    trainable_vars = tf.trainable_variables()
    if init_checkpoint:
      assignment_map, _ = bert.run_classifier.modeling.get_assignment_map_from_checkpoint(
          trainable_vars, init_checkpoint)
      # NOTE: with use_tpu=True nothing is initialised from the checkpoint
      # here; only the non-TPU path warm-starts the variables.
      if not use_tpu:
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    if mode == tf.estimator.ModeKeys.TRAIN:
      train_op = optimization.create_optimizer(
          total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)
      return tf.estimator.EstimatorSpec(
          mode=mode,
          loss=total_loss,
          train_op=train_op)

    if mode == tf.estimator.ModeKeys.EVAL:
      eval_predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
      eval_accuracy = tf.metrics.accuracy(
          labels=label_ids, predictions=eval_predictions, weights=example_weights)
      eval_loss = tf.metrics.mean(values=per_example_loss, weights=example_weights)
      return tf.estimator.EstimatorSpec(
          mode=mode,
          loss=total_loss,
          eval_metric_ops={
              "eval_accuracy": eval_accuracy,
              "eval_loss": eval_loss,
          })

    # Any other mode is treated as prediction.
    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions={"probabilities": probabilities,"labels": predicted_labels})

  return model_fn

# Input Functions

1.   CLTE-BERT-Progressive (text only, fed with a hidden context)
2.   CLTE-BERT with Image
3.   CLTE-BERT-Progressive with Image



In [0]:
def input_fn_builder(features, hidden_context,seq_length, is_training, drop_remainder):
  """Creates an `input_fn` closure that also feeds a per-example hidden context.

  Args:
    features: sequence of objects exposing `input_ids`, `input_mask`,
      `segment_ids` and `label_id`, one per example.
    hidden_context: array-like with one row per entry of `features`; emitted
      under the "hidden_context" key as float32.
    seq_length: length of each id / mask / segment list.
    is_training: if true the dataset is repeated indefinitely and shuffled.
    drop_remainder: forwarded to `Dataset.batch`.

  Returns:
    `input_fn(params)`, which reads `params["batch_size"]` and returns a
    batched `tf.data.Dataset` of feature dicts.

  Raises:
    ValueError: if `hidden_context` does not hold exactly one row per feature.
  """
  all_input_ids = [feature.input_ids for feature in features]
  all_input_mask = [feature.input_mask for feature in features]
  all_segment_ids = [feature.segment_ids for feature in features]
  all_label_ids = [feature.label_id for feature in features]

  num_examples = len(features)

  # Validate up front: a row-count mismatch would otherwise only show up
  # inside tf.constant when the input_fn is finally called.
  hidden_context = np.asarray(hidden_context)
  if hidden_context.ndim == 0:
    raise ValueError("hidden_context must be an array with one row per feature")
  hidden_shape = hidden_context.shape[-1]
  if hidden_context.size != num_examples * hidden_shape:
    raise ValueError(
        "hidden_context has shape {} but {} features were given".format(
            hidden_context.shape, num_examples))

  def input_fn(params):
    """The actual input function."""
    batch_size = params["batch_size"]

    # This is for demo purposes and does NOT scale to large data sets. We do
    # not use Dataset.from_generator() because that uses tf.py_func which is
    # not TPU compatible. The right way to load data is with TFRecordReader.
    d = tf.data.Dataset.from_tensor_slices({
        "input_ids":
            tf.constant(
                all_input_ids, shape=[num_examples, seq_length],
                dtype=tf.int32),
        "input_mask":
            tf.constant(
                all_input_mask,
                shape=[num_examples, seq_length],
                dtype=tf.int32),
        "segment_ids":
            tf.constant(
                all_segment_ids,
                shape=[num_examples, seq_length],
                dtype=tf.int32),
        "label_ids":
            tf.constant(all_label_ids, shape=[num_examples], dtype=tf.int32),

        "hidden_context":
            tf.constant(hidden_context, shape = [num_examples,hidden_shape], dtype = tf.float32),
    })

    if is_training:
      d = d.repeat()
      d = d.shuffle(buffer_size=100)

    d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder)
    return d

  return input_fn



def input_fn_builder_img(img_features,features,seq_length, is_training, drop_remainder):
  """Creates an `input_fn` closure that also feeds per-example image features.

  Args:
    img_features: array-like with one row per entry of `features`; emitted
      under the "img_features" key as float32.
    features: sequence of objects exposing `input_ids`, `input_mask`,
      `segment_ids` and `label_id`, one per example.
    seq_length: length of each id / mask / segment list.
    is_training: if true the dataset is repeated indefinitely and shuffled.
    drop_remainder: forwarded to `Dataset.batch`.

  Returns:
    `input_fn(params)`, which reads `params["batch_size"]` and returns a
    batched `tf.data.Dataset` of feature dicts.

  Raises:
    ValueError: if `img_features` does not hold exactly one row per feature.
  """
  all_input_ids = [feature.input_ids for feature in features]
  all_input_mask = [feature.input_mask for feature in features]
  all_segment_ids = [feature.segment_ids for feature in features]
  all_label_ids = [feature.label_id for feature in features]

  num_examples = len(features)

  # Validate up front: a row-count mismatch would otherwise only show up
  # inside tf.constant when the input_fn is finally called.
  img_features = np.asarray(img_features)
  if img_features.ndim == 0:
    raise ValueError("img_features must be an array with one row per feature")
  hidden_shape_img = img_features.shape[-1]
  if img_features.size != num_examples * hidden_shape_img:
    raise ValueError(
        "img_features has shape {} but {} features were given".format(
            img_features.shape, num_examples))

  def input_fn(params):
    """The actual input function."""
    batch_size = params["batch_size"]

    # This is for demo purposes and does NOT scale to large data sets. We do
    # not use Dataset.from_generator() because that uses tf.py_func which is
    # not TPU compatible. The right way to load data is with TFRecordReader.
    d = tf.data.Dataset.from_tensor_slices({
        "input_ids":
            tf.constant(
                all_input_ids, shape=[num_examples, seq_length],
                dtype=tf.int32),
        "input_mask":
            tf.constant(
                all_input_mask,
                shape=[num_examples, seq_length],
                dtype=tf.int32),
        "segment_ids":
            tf.constant(
                all_segment_ids,
                shape=[num_examples, seq_length],
                dtype=tf.int32),
        "label_ids":
            tf.constant(all_label_ids, shape=[num_examples], dtype=tf.int32),

        "img_features":
            tf.constant(img_features, shape = [num_examples,hidden_shape_img], dtype = tf.float32),
    })

    if is_training:
      d = d.repeat()
      d = d.shuffle(buffer_size=100)

    d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder)
    return d

  return input_fn


def input_fn_builder_pr_img(img_features,features,hidden_context,seq_length, is_training, drop_remainder):
  """Creates an `input_fn` closure feeding image features and a hidden context.

  Args:
    img_features: array-like with one row per entry of `features`; emitted
      under the "img_features" key as float32.
    features: sequence of objects exposing `input_ids`, `input_mask`,
      `segment_ids` and `label_id`, one per example.
    hidden_context: array-like with one row per entry of `features`; emitted
      under the "hidden_context" key as float32.
    seq_length: length of each id / mask / segment list.
    is_training: if true the dataset is repeated indefinitely and shuffled.
    drop_remainder: forwarded to `Dataset.batch`.

  Returns:
    `input_fn(params)`, which reads `params["batch_size"]` and returns a
    batched `tf.data.Dataset` of feature dicts.

  Raises:
    ValueError: if `img_features` or `hidden_context` does not hold exactly
      one row per feature.
  """
  all_input_ids = [feature.input_ids for feature in features]
  all_input_mask = [feature.input_mask for feature in features]
  all_segment_ids = [feature.segment_ids for feature in features]
  all_label_ids = [feature.label_id for feature in features]

  num_examples = len(features)

  def checked_rows(name, values):
    # Validate up front: a row-count mismatch would otherwise only show up
    # inside tf.constant when the input_fn is finally called.
    values = np.asarray(values)
    if values.ndim == 0:
      raise ValueError("{} must be an array with one row per feature".format(name))
    if values.size != num_examples * values.shape[-1]:
      raise ValueError(
          "{} has shape {} but {} features were given".format(
              name, values.shape, num_examples))
    return values

  img_features = checked_rows("img_features", img_features)
  hidden_context = checked_rows("hidden_context", hidden_context)
  hidden_shape_img = img_features.shape[-1]
  hidden_shape = hidden_context.shape[-1]

  def input_fn(params):
    """The actual input function."""
    batch_size = params["batch_size"]

    # This is for demo purposes and does NOT scale to large data sets. We do
    # not use Dataset.from_generator() because that uses tf.py_func which is
    # not TPU compatible. The right way to load data is with TFRecordReader.
    d = tf.data.Dataset.from_tensor_slices({
        "input_ids":
            tf.constant(
                all_input_ids, shape=[num_examples, seq_length],
                dtype=tf.int32),
        "input_mask":
            tf.constant(
                all_input_mask,
                shape=[num_examples, seq_length],
                dtype=tf.int32),
        "segment_ids":
            tf.constant(
                all_segment_ids,
                shape=[num_examples, seq_length],
                dtype=tf.int32),
        "label_ids":
            tf.constant(all_label_ids, shape=[num_examples], dtype=tf.int32),

        "img_features":
            tf.constant(img_features, shape = [num_examples,hidden_shape_img], dtype = tf.float32),

        "hidden_context":
            tf.constant(hidden_context, shape = [num_examples,hidden_shape], dtype = tf.float32),
    })

    if is_training:
      d = d.repeat()
      d = d.shuffle(buffer_size=100)

    d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder)
    return d

  return input_fn

# Trainer Functions for BERT (With and Without Image)

In [0]:
Epochs = 5  # Number of training epochs; train() and train_img() turn it into a step count.


def train(output_dir,input_fn,input_fn_builder_progressive = False,hidden_context = None):
  """Fine-tunes multilingual BERT on text features and returns the estimator.

  Args:
    output_dir: model directory for checkpoints and summaries.
    input_fn: despite its name, the list of training features; its length
      sets the number of training steps and it is handed to the input-fn
      builder as `features`.
    input_fn_builder_progressive: when false, the plain classifier
      (`model_fn_builder`) is trained; when true, the progressive model
      (`model_fn_builder_progressive`) is trained with `hidden_context`.
    hidden_context: array with one row per training feature; required when
      `input_fn_builder_progressive` is true.

  Returns:
    The trained `tf.estimator.Estimator`.

  Raises:
    ValueError: if progressive training is requested without `hidden_context`.
  """
  CONFIG_FILE = "multi_cased_L-12_H-768_A-12/bert_config.json"
  INIT_CHECKPOINT = "multi_cased_L-12_H-768_A-12/bert_model.ckpt"

  # Same sequence length that get_features() and the evaluation helpers use;
  # defined locally because no module-level MAX_SEQ_LENGTH exists.
  MAX_SEQ_LENGTH = 128
  BATCH_SIZE = 28
  LEARNING_RATE = 2e-5
  NUM_TRAIN_EPOCHS = Epochs
  # Warmup is a period of time where the learning rate
  # is small and gradually increases--usually helps training.
  WARMUP_PROPORTION = 0.1
  # Model configs
  SAVE_CHECKPOINTS_STEPS = 15000
  SAVE_SUMMARY_STEPS = 100
  OUTPUT_DIR = output_dir

  if input_fn_builder_progressive and hidden_context is None:
    raise ValueError("hidden_context is required when input_fn_builder_progressive=True")

  # Compute # train and warmup steps from batch size
  num_train_steps = int(len(input_fn) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
  num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)
  print(num_train_steps)

  # Specify output directory and number of checkpoint steps to save
  run_config = tf.estimator.RunConfig(
      model_dir=OUTPUT_DIR,
      save_summary_steps=SAVE_SUMMARY_STEPS,
      save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

  # Both model variants are configured identically.
  model_kwargs = dict(
      bert_config=bert.run_classifier.modeling.BertConfig.from_json_file(CONFIG_FILE),
      num_labels=4, #number of unique labels
      init_checkpoint=INIT_CHECKPOINT,
      learning_rate=LEARNING_RATE,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      use_tpu=False,
      use_one_hot_embeddings=False)

  if not input_fn_builder_progressive:
    model_fn = model_fn_builder(**model_kwargs)
    train_input_fn = bert.run_classifier.input_fn_builder(
        features=input_fn,
        seq_length=MAX_SEQ_LENGTH,
        is_training=True,
        drop_remainder=False)
  else:
    model_fn = model_fn_builder_progressive(**model_kwargs)
    train_input_fn = input_fn_builder(
        features=input_fn,
        hidden_context=hidden_context,
        seq_length=MAX_SEQ_LENGTH,
        is_training=True,
        drop_remainder=False)

  estimator = tf.estimator.Estimator(
      model_fn=model_fn,
      config=run_config,
      params={"batch_size": BATCH_SIZE})

  print(f'Beginning Training!')

  estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
  return estimator


def train_img(img_features,output_dir,input_fn,input_fn_builder_progressive = False,hidden_context = None):
  """Fine-tunes the image-augmented BERT model and returns the estimator.

  NOTE(review): `model_fn_builder_img` and `model_fn_builder_img_progressive`
  are not defined in the visible part of this notebook; they must exist
  before this function is called.

  Args:
    img_features: array with one row per training feature.
    output_dir: model directory for checkpoints and summaries.
    input_fn: despite its name, the list of training features; its length
      sets the number of training steps and it is handed to the input-fn
      builder as `features`.
    input_fn_builder_progressive: when false, `model_fn_builder_img` is
      trained; when true, `model_fn_builder_img_progressive` is trained with
      `hidden_context`.
    hidden_context: array with one row per training feature; required when
      `input_fn_builder_progressive` is true.

  Returns:
    The trained `tf.estimator.Estimator`.

  Raises:
    ValueError: if progressive training is requested without `hidden_context`.
  """
  CONFIG_FILE = "multi_cased_L-12_H-768_A-12/bert_config.json"
  INIT_CHECKPOINT = "multi_cased_L-12_H-768_A-12/bert_model.ckpt"

  # Same sequence length that get_features() and the evaluation helpers use;
  # defined locally because no module-level MAX_SEQ_LENGTH exists.
  MAX_SEQ_LENGTH = 128
  BATCH_SIZE = 28
  LEARNING_RATE = 2e-5
  NUM_TRAIN_EPOCHS = Epochs
  # Warmup is a period of time where the learning rate
  # is small and gradually increases--usually helps training.
  WARMUP_PROPORTION = 0.1
  # Model configs
  SAVE_CHECKPOINTS_STEPS = 15000
  SAVE_SUMMARY_STEPS = 100
  OUTPUT_DIR = output_dir

  if input_fn_builder_progressive and hidden_context is None:
    raise ValueError("hidden_context is required when input_fn_builder_progressive=True")

  # Compute # train and warmup steps from batch size
  num_train_steps = int(len(input_fn) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
  num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)
  print(num_train_steps)

  # Specify output directory and number of checkpoint steps to save
  run_config = tf.estimator.RunConfig(
      model_dir=OUTPUT_DIR,
      save_summary_steps=SAVE_SUMMARY_STEPS,
      save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

  # Both model variants are configured identically.
  model_kwargs = dict(
      bert_config=bert.run_classifier.modeling.BertConfig.from_json_file(CONFIG_FILE),
      num_labels=4, #number of unique labels
      init_checkpoint=INIT_CHECKPOINT,
      learning_rate=LEARNING_RATE,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      use_tpu=False,
      use_one_hot_embeddings=False)

  if not input_fn_builder_progressive:
    model_fn = model_fn_builder_img(**model_kwargs)
    train_input_fn = input_fn_builder_img(
        img_features = img_features,
        features=input_fn,
        seq_length=MAX_SEQ_LENGTH,
        is_training=True,
        drop_remainder=False)
  else:
    model_fn = model_fn_builder_img_progressive(**model_kwargs)
    train_input_fn = input_fn_builder_pr_img(
        img_features = img_features,
        features=input_fn,
        hidden_context=hidden_context,
        seq_length=MAX_SEQ_LENGTH,
        is_training=True,
        drop_remainder=False)

  estimator = tf.estimator.Estimator(
      model_fn=model_fn,
      config=run_config,
      params={"batch_size": BATCH_SIZE})

  print(f'Beginning Training!')

  estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
  return estimator

# Evaluation Functions for BERT (With and Without Image)

*   CTX = 0 for English Premise and Hindi Hypothesis
*   CTX = 1 for Hindi Premise and English Hypothesis



In [0]:
from sklearn.metrics import accuracy_score
def evaluate_and_get_hidden_context(ctx,estimator,input_fn_for_test,input_fn_for_hidden,is_progressive = False,hidden_context=None):
  """Evaluates a text-only estimator and, if not progressive, extracts hidden contexts.

  Args:
    ctx: kept for backward compatibility and no longer consulted; gold labels
      are read from `input_fn_for_test` instead of notebook globals.
    estimator: trained estimator returned by `train`.
    input_fn_for_test: despite its name, the list of test features.
    input_fn_for_hidden: list of features whose "hidden_context" predictions
      are collected (non-progressive mode only).
    is_progressive: selects the progressive input pipeline, which needs
      `hidden_context`.
    hidden_context: array with one row per test feature, fed to the
      progressive model.

  Returns:
    Non-progressive: `(hidden_context, actual_labels, predicted_labels)` with
    `hidden_context` a numpy array built from the predictions on
    `input_fn_for_hidden`.
    Progressive: `(actual_labels, predicted_labels)`.
  """
  MAX_SEQ_LENGTH = 128

  # The gold label of every example is stored on its feature, so it is read
  # from there; `test_eng_hindi` / `test_hindi_eng` are not defined in this
  # notebook.
  actual_labels = [feature.label_id for feature in input_fn_for_test]

  if is_progressive:
    test_input_fn = input_fn_builder(
      features=input_fn_for_test,
      hidden_context=hidden_context,
      seq_length=MAX_SEQ_LENGTH,
      is_training=False,
      drop_remainder=False)
    estimator.evaluate(input_fn=test_input_fn, steps=None)
    predicted_labels = [prediction['labels'] for prediction in estimator.predict(test_input_fn)]
    return actual_labels,predicted_labels

  test_input_fn = run_classifier.input_fn_builder(
    features=input_fn_for_test,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False)
  predicted_labels = [prediction['labels'] for prediction in estimator.predict(test_input_fn)]
  print(f'acc {accuracy_score(actual_labels,predicted_labels)} ')
  estimator.evaluate(input_fn=test_input_fn, steps=None)

  hidden_input_fn = run_classifier.input_fn_builder(
      features=input_fn_for_hidden,
      seq_length=MAX_SEQ_LENGTH,
      is_training=False,
      drop_remainder=False)
  estimator.evaluate(input_fn=hidden_input_fn, steps=None)

  collected = []
  try:
    for prediction in estimator.predict(hidden_input_fn):
      collected.append(prediction["hidden_context"])
  except Exception as err:
    # Best effort, as before: keep the contexts gathered so far, but report
    # what went wrong instead of swallowing it silently.
    print(f'k is {len(collected)}')
    print(f'hidden-context prediction stopped early: {err!r}')
  return np.array(collected),actual_labels,predicted_labels



#IMG
def evaluate_and_get_hidden_context_img(ctx,img_features_for_test,img_features,estimator,input_fn_for_test,input_fn_for_hidden,is_progressive = False,hidden_context=None):
  """Evaluates an image-augmented estimator and, if not progressive, extracts hidden contexts.

  Args:
    ctx: kept for backward compatibility and no longer consulted; gold labels
      are read from `input_fn_for_test` instead of notebook globals.
    img_features_for_test: array with one row per test feature.
    img_features: array with one row per entry of `input_fn_for_hidden`.
    estimator: trained estimator returned by `train_img`.
    input_fn_for_test: despite its name, the list of test features.
    input_fn_for_hidden: list of features whose "hidden_context" predictions
      are collected (non-progressive mode only).
    is_progressive: selects the progressive input pipeline, which needs
      `hidden_context`.
    hidden_context: array with one row per test feature, fed to the
      progressive model.

  Returns:
    Non-progressive: `(hidden_context, actual_labels, predicted_labels)` with
    `hidden_context` a numpy array built from the predictions on
    `input_fn_for_hidden`.
    Progressive: `(actual_labels, predicted_labels)`.
  """
  MAX_SEQ_LENGTH = 128

  # The gold label of every example is stored on its feature, so it is read
  # from there; `test_eng_hindi` / `test_hindi_eng` are not defined in this
  # notebook.
  actual_labels = [feature.label_id for feature in input_fn_for_test]

  if is_progressive:
    test_input_fn = input_fn_builder_pr_img(
      img_features = img_features_for_test,
      features=input_fn_for_test,
      hidden_context=hidden_context,
      seq_length=MAX_SEQ_LENGTH,
      is_training=False,
      drop_remainder=False)
    estimator.evaluate(input_fn=test_input_fn, steps=None)
    predicted_labels = [prediction['labels'] for prediction in estimator.predict(test_input_fn)]
    return actual_labels,predicted_labels

  test_input_fn = input_fn_builder_img(
    features=input_fn_for_test,
    img_features = img_features_for_test,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False)
  predicted_labels = [prediction['labels'] for prediction in estimator.predict(test_input_fn)]
  estimator.evaluate(input_fn=test_input_fn, steps=None)

  hidden_input_fn = input_fn_builder_img(
      features=input_fn_for_hidden,
      img_features = img_features,
      seq_length=MAX_SEQ_LENGTH,
      is_training=False,
      drop_remainder=False)
  collected = [prediction["hidden_context"] for prediction in estimator.predict(hidden_input_fn)]
  return np.array(collected), actual_labels,predicted_labels

# Progressive Training on Chosen Premise and Hypothesis 

In [0]:
# Language pair for this run. It selects both the stored hidden-context file
# and the features to fine-tune on, so the two cannot drift apart.
lang_pair = 'fr-de'

# Get the hidden context from Drive.
# NOTE: allow_pickle=True unpickles objects on load; only use it with files
# you produced yourself.
hidden_context_data = np.load(
    '/content/gdrive/My Drive/XNLI Hidden Contexts/Hidden_Context_{}_Normal.npy'.format(lang_pair),
    allow_pickle=True)

# get_features returns a dict keyed '<lang_pair>_train' / '<lang_pair>_test',
# so the training features are looked up by key rather than by position.
features = get_features(lang_pair)
estimator = train('out_dir_train_eng_pro',
                  features[lang_pair + '_train'],
                  input_fn_builder_progressive = True,
                  hidden_context = hidden_context_data)

In [0]:
# get_features returns exactly two entries, '<pair>_train' and '<pair>_test';
# pick them by suffix because the dict has no positional keys.
train_features = next(value for key, value in features.items() if key.endswith('_train'))
test_features = next(value for key, value in features.items() if key.endswith('_test'))

# The progressive input pipeline needs one hidden-context row per test
# example. No real context is available at test time, so a placeholder is fed:
# sized from the test set and seeded so the reported eval loss is reproducible.
Test_batch_size = len(test_features)
dummy = np.random.RandomState(SEED).randn(Test_batch_size,768)
act_lab, pred_lab = evaluate_and_get_hidden_context(0,estimator,input_fn_for_test = test_features,input_fn_for_hidden = train_features,is_progressive = True,hidden_context=dummy)

#Classification Report
target_names = ['Contradiction', 'Neutral', 'Entailment','Other']
# Pass the label ids explicitly: without `labels`, classification_report
# raises when fewer classes occur in the data than there are target names.
print(classification_report(act_lab, pred_lab, labels=list(range(len(target_names))), target_names=target_names))