In [None]:
SEED = 42
from google.colab import drive
from google.colab import files

%tensorflow_version 1.x
import tensorflow as tf

!pip install bert-tensorflow==1.0.1
import bert
from bert import run_classifier

from bert import optimization
from bert import tokenization

import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
# Loading Training Data
# NOTE: `file` is a plain module-level name that the later data-loading cells rebind.
file = '/content/bin_train.xlsx'      # Training sheet (path inside the Colab runtime)
train_df = pd.read_excel(file)

In [3]:
# Loading Validation Data (Run when evaluating for Validation Data)
# NOTE: this cell and the test-data cell both bind `test_df`; whichever runs last wins.
file = '/content/Validation.xlsx'       # Cleaned Validation Data (Obtained After Preprocessing the Original Validation Data from Organizers)
test_df = pd.read_excel(file)

In [None]:
# Loading Test Data (Run when evaluating for Test Data)
# NOTE: rebinds `test_df`, replacing the validation frame if that cell was run before.
file = '/content/Hindi_Test.xlsx'       # Cleaned Test Data (Obtained After Preprocessing the Original Test Data from Organizers)
# `names=` assigns the column labels; get_data() later reads 'Unique ID' and 'Post'.
test_df = pd.read_excel(file,names=['Row','Unique ID','Post'])

In [None]:
# Reshape a raw sheet into the (UID, sentence, label) frame used as model input
def get_data(a):
  """Build the model-input DataFrame from a raw sheet.

  Args:
    a: DataFrame providing the 'Unique ID' and 'Post' columns.

  Returns:
    DataFrame with columns 'UID', 'sentence' and 'label' (in that order).
    Every label is the placeholder 0, since real labels are not needed here.
  """
  uids = list(a['Unique ID'])
  posts = list(a['Post'])

  # One dummy label per row.
  placeholder_labels = [0] * len(uids)

  return pd.DataFrame({'UID': uids, 'sentence': posts, 'label': placeholder_labels},
                      columns=['UID', 'sentence', 'label'])

# Model-input frames for the training split and for whichever evaluation split was loaded
train_data = get_data(train_df)
test_data = get_data(test_df)

# Preview the first three rows of each frame
print(train_data.head(3))
print(test_data.head(3))

In [None]:
# Download and unpack the pretrained multilingual BERT (mBERT) checkpoint

!wget https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip
!unzip multilingual_L-12_H-768_A-12.zip

In [39]:
# Label ids handed to the feature converter
label_list = [0,1]


def _row_to_input_example(row):
  """Wrap one (UID, sentence, label) row in a single-sentence BERT InputExample."""
  return bert.run_classifier.InputExample(guid=row['UID'],
                                          text_a=row['sentence'],
                                          label=row['label'])


# One InputExample per row of each frame
train_InputExamples = train_data.apply(_row_to_input_example, axis=1)

test_InputExamples = test_data.apply(_row_to_input_example, axis=1)

In [40]:
# Vocabulary file from the unzipped checkpoint directory
vocab_file = "multilingual_L-12_H-768_A-12/vocab.txt"

def create_tokenizer_from_hub_module():
  """Return a lower-casing FullTokenizer built from the module-level `vocab_file`."""
  tokenizer_cls = bert.tokenization.FullTokenizer
  return tokenizer_cls(vocab_file=vocab_file, do_lower_case=True)

tokenizer = create_tokenizer_from_hub_module()

# Sanity check: tokenize one Hindi word
print(tokenizer.tokenize("धीरे"))

['ध', '##ीर']


In [None]:
# Turn the InputExamples of both splits into the InputFeatures BERT consumes.
MAX_SEQ_LENGTH = 128

train_features = bert.run_classifier.convert_examples_to_features(
    train_InputExamples,
    label_list,
    MAX_SEQ_LENGTH,
    tokenizer,
)
test_features = bert.run_classifier.convert_examples_to_features(
    test_InputExamples,
    label_list,
    MAX_SEQ_LENGTH,
    tokenizer,
)

# Binary Classification Model

In [42]:
# Classification Model (softmax head on BERT's pooled output)
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, labels, num_labels, use_one_hot_embeddings):
  """Creates a classification model.

  Builds a BertModel and places a single linear + softmax layer on top of its
  pooled output.

  Args:
    bert_config: configuration object passed to BertModel.
    is_training: when true, dropout (keep_prob=0.9) is applied to the pooled
      output before the classifier.
    input_ids: forwarded to BertModel as `input_ids`.
    input_mask: forwarded to BertModel as `input_mask`.
    segment_ids: forwarded to BertModel as `token_type_ids`.
    labels: integer class ids; one-hot encoded with depth `num_labels`.
    num_labels: number of output classes.
    use_one_hot_embeddings: forwarded to BertModel.

  Returns:
    Tuple `(loss, per_example_loss, logits, probabilities, predicted_labels,
    output_layer, token_output, pooled_output)`. `output_layer` is the pooled
    output after dropout when training, otherwise the pooled output itself.
  """
  model = bert.run_classifier.modeling.BertModel(config=bert_config,
                                                 is_training=is_training,
                                                 input_ids=input_ids,
                                                 input_mask=input_mask,
                                                 token_type_ids=segment_ids,
                                                 use_one_hot_embeddings=use_one_hot_embeddings)
  # Encoder outputs
  output_layer = model.get_pooled_output()
  pooled_output = model.get_pooled_output()
  token_output = model.get_sequence_output()
  hidden_size = output_layer.shape[-1].value

  # Trainable parameters of the classification layer
  output_weights = tf.get_variable("output_weights", [num_labels, hidden_size], initializer=tf.truncated_normal_initializer(stddev=0.02))
  output_bias = tf.get_variable("output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):
    if is_training:
      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)  # dropout = 0.1

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    probabilities = tf.nn.softmax(logits, axis=-1)
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    # argmax over the class axis already yields one id per example. Do not
    # squeeze it: on a batch holding a single example, squeezing would also
    # remove the batch axis and Estimator.predict could no longer split the
    # predictions per example.
    predicted_labels = tf.argmax(log_probs, axis=-1, output_type=tf.int32)

    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)

    return (loss, per_example_loss, logits, probabilities,predicted_labels,output_layer,token_output,pooled_output)


def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, num_train_steps, num_warmup_steps, use_tpu, use_one_hot_embeddings):
  """Returns a `model_fn` closure for `tf.estimator.Estimator`.

  Args:
    bert_config: configuration forwarded to `create_model`.
    num_labels: number of output classes forwarded to `create_model`.
    init_checkpoint: checkpoint used to initialise matching trainable
      variables; skipped when falsy.
    learning_rate: forwarded to `optimization.create_optimizer`.
    num_train_steps: forwarded to `optimization.create_optimizer`.
    num_warmup_steps: forwarded to `optimization.create_optimizer`.
    use_tpu: forwarded to `optimization.create_optimizer`.
    use_one_hot_embeddings: forwarded to `create_model`.

  Returns:
    A `model_fn(features, labels, mode, params)` that returns a
    `tf.estimator.EstimatorSpec` for the TRAIN, EVAL or PREDICT mode.
  """

  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` handed to the Estimator."""
    tf.logging.info("*** Features ***")
    for name in sorted(features.keys()):
      tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]
    # Weight of each example in the eval metrics; defaults to 1 for every example.
    is_real_example = None
    if "is_real_example" in features:
      is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32)
    else:
      is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32)
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    (total_loss, per_example_loss, logits, probabilities, predicted_labels,
     hidden_context, token_outputs, pooled_output) = create_model(bert_config,
                                                                  is_training,
                                                                  input_ids,
                                                                  input_mask,
                                                                  segment_ids,
                                                                  label_ids,
                                                                  num_labels,
                                                                  use_one_hot_embeddings)
    tvars = tf.trainable_variables()
    if init_checkpoint:
      (assignment_map, _) = bert.run_classifier.modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
      # Every spec returned below is a plain EstimatorSpec, which has no
      # scaffold_fn hook. Deferring the initialisation into a scaffold function
      # when use_tpu is set would therefore never run it, so it is registered
      # directly in all cases.
      tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    output_spec = None

    if mode == tf.estimator.ModeKeys.TRAIN:
      train_op = optimization.create_optimizer(total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)
      output_spec = tf.estimator.EstimatorSpec(mode=mode, loss=total_loss, train_op=train_op)
    elif mode == tf.estimator.ModeKeys.EVAL:
      def metric_fn(per_example_loss, label_ids, logits, is_real_example):
        """Accuracy and mean loss, both weighted by `is_real_example`."""
        predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
        accuracy = tf.metrics.accuracy(labels=label_ids, predictions=predictions, weights=is_real_example)
        loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example)
        return {"eval_accuracy": accuracy, "eval_loss": loss}
      eval_metrics = metric_fn(per_example_loss, label_ids, logits, is_real_example)
      output_spec = tf.estimator.EstimatorSpec(mode=mode, loss=total_loss, eval_metric_ops=eval_metrics)
    else:
      # PREDICT: expose the class probabilities, the argmax labels and the encoder outputs.
      output_spec = tf.estimator.EstimatorSpec(mode=mode,
                                               predictions={"probabilities": probabilities,
                                                            "labels": predicted_labels,
                                                            "hidden_context": hidden_context,
                                                            "token_output": token_outputs,
                                                            "pooled_output": pooled_output})
    return output_spec
  return model_fn

In [43]:
# Input function factory (variant that also feeds a precomputed `hidden_context`)
def input_fn_builder(features, hidden_context, seq_length, is_training, drop_remainder):
  """Creates an `input_fn` closure to be passed to TPUEstimator.

  Args:
    features: sized sequence of objects exposing `input_ids`, `input_mask`,
      `segment_ids` and `label_id`.
    hidden_context: array-like whose last dimension is read via `.shape[-1]`;
      emitted as a float32 tensor with one row per example.
    seq_length: length of each id/mask/segment row.
    is_training: when true the dataset is repeated and then shuffled
      (buffer_size=100).
    drop_remainder: forwarded to `Dataset.batch`.

  Returns:
    `input_fn(params)` that reads `params["batch_size"]` and returns the
    batched `tf.data.Dataset`.
  """

  # Collect the per-example fields column-wise in a single pass over `features`.
  columns = {"input_ids": [], "input_mask": [], "segment_ids": [], "label_ids": []}
  for feature in features:
    columns["input_ids"].append(feature.input_ids)
    columns["input_mask"].append(feature.input_mask)
    columns["segment_ids"].append(feature.segment_ids)
    columns["label_ids"].append(feature.label_id)

  def input_fn(params):
    """The actual input function."""
    batch_size = params["batch_size"]
    num_examples = len(features)
    hidden_shape = hidden_context.shape[-1]

    per_token_shape = [num_examples, seq_length]
    tensors = {
        "input_ids": tf.constant(columns["input_ids"], shape=per_token_shape, dtype=tf.int32),
        "input_mask": tf.constant(columns["input_mask"], shape=per_token_shape, dtype=tf.int32),
        "segment_ids": tf.constant(columns["segment_ids"], shape=per_token_shape, dtype=tf.int32),
        "label_ids": tf.constant(columns["label_ids"], shape=[num_examples], dtype=tf.int32),
        "hidden_context": tf.constant(hidden_context, shape=[num_examples, hidden_shape], dtype=tf.float32),
    }
    dataset = tf.data.Dataset.from_tensor_slices(tensors)

    if is_training:
      dataset = dataset.repeat().shuffle(buffer_size=100)

    return dataset.batch(batch_size=batch_size, drop_remainder=drop_remainder)
  return input_fn

In [44]:
# Trainer Function

Epochs = 10 # Number of Training Epochs 

def train(output_dir,input_fn):
  """Build an Estimator for the classifier and train it.

  Args:
    output_dir: model directory handed to `tf.estimator.RunConfig`.
    input_fn: despite its name, the sequence of InputFeatures to train on. Its
      length sets the number of training steps and it is passed as `features`
      to `bert.run_classifier.input_fn_builder`.

  Returns:
    The `tf.estimator.Estimator` after `train()` has been called on it.
  """
  CONFIG_FILE = "multilingual_L-12_H-768_A-12/bert_config.json"
  INIT_CHECKPOINT = "multilingual_L-12_H-768_A-12/bert_model.ckpt"

  BATCH_SIZE = 28
  LEARNING_RATE = 2e-5
  NUM_TRAIN_EPOCHS = Epochs
  WARMUP_PROPORTION = 0.1 # Warmup is a period of time where the learning rate is small and gradually increases--usually helps training.
  # Model configs
  SAVE_CHECKPOINTS_STEPS = 6000
  SAVE_SUMMARY_STEPS = 100
  OUTPUT_DIR = output_dir
  # Compute train and warmup steps from batch size
  num_train_steps = int(len(input_fn) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
  num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)
  print(num_train_steps)
  # Specify output directory and number of checkpoint steps to save.
  # The notebook-level SEED is applied here so graph-level randomness is seeded.
  run_config = tf.estimator.RunConfig(model_dir=OUTPUT_DIR,
                                      tf_random_seed=SEED,
                                      save_summary_steps=SAVE_SUMMARY_STEPS,
                                      save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

  model_fn = model_fn_builder(bert_config=bert.run_classifier.modeling.BertConfig.from_json_file(CONFIG_FILE),
                              num_labels = 4, #number of unique labels
                              init_checkpoint=INIT_CHECKPOINT,
                              learning_rate=LEARNING_RATE,
                              num_train_steps=num_train_steps,
                              num_warmup_steps=num_warmup_steps,
                              use_tpu=False,
                              use_one_hot_embeddings=False)

  estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config, params={"batch_size": BATCH_SIZE})
  train_input_fn = bert.run_classifier.input_fn_builder(features=input_fn, seq_length=MAX_SEQ_LENGTH, is_training=True, drop_remainder=False)

  print('Beginning Training!')

  # max_steps is an absolute global-step target for the model directory.
  estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
  return estimator

In [45]:
# Functions to get Train and Validation Probabilities for Ensemble


#Evaluation Function for Train Data
def evaluate_for_train(estimator,input_fn_for_train):
  """Predict on the training features and collect labels and probabilities.

  Args:
    estimator: object exposing `predict(input_fn)` and
      `evaluate(input_fn=..., steps=...)`.
    input_fn_for_train: despite its name, the sequence of InputFeatures to
      predict on; each item exposes `label_id`.

  Returns:
    Tuple `(actual_labels, predicted_labels, probabilities)` of lists, one
    entry per example.
  """
  MAX_SEQ_LENGTH = 128
  test_input_fn = run_classifier.input_fn_builder(features=input_fn_for_train,seq_length=MAX_SEQ_LENGTH,is_training=False,drop_remainder=False)

  # Take the reference labels from the features being evaluated, so they always
  # line up with the predictions (the global test frame belongs to another split).
  actual_labels = [feature.label_id for feature in input_fn_for_train]

  predicted_labels = []
  probabilities = []
  for prediction in estimator.predict(test_input_fn):
    predicted_labels.append(prediction['labels'])
    probabilities.append(prediction['probabilities'])
  # Result is not used here; the call is kept for the metrics the estimator reports.
  estimator.evaluate(input_fn=test_input_fn, steps=None)
  return actual_labels,predicted_labels,probabilities

#Evaluation Function for Validation Data
def evaluate_for_val_data(estimator,input_fn_for_test):
  """Predict on the validation features and collect labels and probabilities.

  Args:
    estimator: object exposing `predict(input_fn)` and
      `evaluate(input_fn=..., steps=...)`.
    input_fn_for_test: despite its name, the sequence of InputFeatures to
      predict on.

  Returns:
    Tuple `(actual_labels, predicted_labels, probabilities)` of lists.
    `actual_labels` is read from the module-level `test_data['label']`.
  """
  MAX_SEQ_LENGTH = 128
  test_input_fn = run_classifier.input_fn_builder(
      features=input_fn_for_test,
      seq_length=MAX_SEQ_LENGTH,
      is_training=False,
      drop_remainder=False,
  )

  actual_labels = list(test_data['label'])

  # Stream the predictions and keep only the two fields needed downstream.
  predicted_labels, probabilities = [], []
  for prediction in estimator.predict(test_input_fn):
    predicted_labels.append(prediction['labels'])
    probabilities.append(prediction['probabilities'])

  estimator.evaluate(input_fn=test_input_fn, steps=None)
  return actual_labels, predicted_labels, probabilities

#Evaluation Function for Test Data
def evaluate_for_test_data(estimator,input_fn_for_test,max_predictions=1651,num_padding=2):
  """Predict on the test features, capping and padding the collected predictions.

  At most `max_predictions` predictions are read from the estimator; the last
  collected prediction is then repeated `num_padding` times. With the defaults
  this yields 1651 + 2 entries whenever at least 1651 predictions are available.

  NOTE(review): the cap and the padding look like a workaround for predictions
  that cannot be read past this point (e.g. a trailing batch holding a single
  example) - confirm before changing the defaults.

  Args:
    estimator: object exposing `predict(input_fn)` and
      `evaluate(input_fn=..., steps=...)`.
    input_fn_for_test: despite its name, the sequence of InputFeatures to
      predict on.
    max_predictions: maximum number of predictions to collect; `None` collects
      everything the estimator yields.
    num_padding: number of copies of the last collected prediction to append.
      Nothing is appended when no prediction was collected.

  Returns:
    Tuple `(actual_labels, predicted_labels, probabilities)` of lists.
    `actual_labels` is read from the module-level `test_data['label']`.
  """
  MAX_SEQ_LENGTH = 128
  test_input_fn = run_classifier.input_fn_builder(features=input_fn_for_test,seq_length=MAX_SEQ_LENGTH,is_training=False,drop_remainder=False)

  actual_labels = list(test_data['label'])
  print('Len of Act Lab = ',len(actual_labels))

  predicted_labels = []
  probabilities = []
  for count, prediction in enumerate(estimator.predict(test_input_fn)):
    if max_predictions is not None and count >= max_predictions:
      break
    predicted_labels.append(prediction['labels'])
    probabilities.append(prediction['probabilities'])

  # Pad from the last prediction actually collected, so shorter runs do not
  # fail on a fixed index.
  if predicted_labels:
    predicted_labels.extend([predicted_labels[-1]] * num_padding)
    probabilities.extend([probabilities[-1]] * num_padding)

  estimator.evaluate(input_fn=test_input_fn, steps=None)
  return actual_labels,predicted_labels,probabilities

In [13]:
# Mount Google Drive at /content/drive; the weight-copy command in the next cell reads from there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [21]:
# Download the fine-tuned weights using the link provided in ReadMe(Result).txt, then load them (change the paths accordingly).

# Pretrained Model Loading

# Copy the trained folder from Google Drive to the local Colab machine.
# Mount Drive first.
!cp -a '/content/drive/MyDrive/CONSTRAINT 2021 Projects (AAAI)/Hindi_Task/Weights/BERT_Coarse_Train' '/content/BERT_Training_Coarse' 

In [None]:
# Train (the model directory is 'BERT_Training_Coarse'; `input_fn` receives the training features)
estimator = train(output_dir='BERT_Training_Coarse', input_fn=train_features)

In [None]:
# Evaluate for Validation (run only when creating results for the Validation dataset)
act_lab, pred_lab, test_prob = evaluate_for_val_data(estimator=estimator,
                                                     input_fn_for_test=test_features)

In [None]:
# Evaluate for Test (run only when creating results for the Test dataset)
act_lab, pred_lab, test_prob = evaluate_for_test_data(estimator=estimator,
                                                      input_fn_for_test=test_features)

In [None]:
# Saving Probabilities
# Stack the per-example probability vectors into a (num_examples, 4) array.
test_prob = np.array(test_prob).reshape(len(test_prob),4)
print(test_prob.shape)

# ndarray.dump writes a pickle of the array (despite the .npy suffix). Passing
# the path lets numpy open and close the file itself, so no handle is left open.
test_prob.dump('Test_Probs_Coarse_mBERT.npy')