In [88]:
from pymongo import MongoClient
# from auth import username, password
import urllib
from pprint import pprint
from tqdm import tqdm
import tensorflow as tf
import multiprocessing
from transformers import AutoTokenizer, TFAutoModel
import pickle
import numpy as np
import os
import preprocessor as p
import collections
import random

seed = 1111

In [82]:
from transformers import (
   AutoConfig,
   AutoTokenizer,
   TFAutoModelForSequenceClassification,
   AdamW,
   glue_convert_examples_to_features
)
# Choose model
# @markdown >The default model is <i><b>COVID-Twitter-BERT</b></i>. You can however choose <i><b>BERT Base</i></b> or <i><b>BERT Large</i></b> to compare these models to the <i><b>COVID-Twitter-BERT</i></b>. All these three models will be initiated with a random classification layer. If you go directly to the Predict-cell after having compiled the model, you will see that it still runs the predition. However the output will be random. The training steps below will finetune this for the specific task. <br /><br /> 
model_name = 'digitalepidemiologylab/covid-twitter-bert' #@param ["digitalepidemiologylab/covid-twitter-bert", "bert-large-uncased", "bert-base-uncased"]

# Initialise tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [76]:
data_dir = '../src/data'
standardized_suffix = 'standardized'

In [77]:
def create_int_feature(values):
  feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
  return feature

def create_float_feature(values):
  feature = tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
  return feature

In [78]:
def generate_onehot_labels():
    conspiracies = set()
    
    for filename in os.listdir('../src/data'):
        if 'standardized' in filename:
            conspiracies.add(filename.split('-')[0])
    return {consp:i for i, consp in enumerate(conspiracies)}
             
# generate_onehot_labels()

In [80]:
def create_masked_lm_predictions(tokens, masked_lm_prob,
                                 max_predictions_per_seq, vocab_words, rng):
  """Creates the predictions for the masked LM objective."""

  cand_indexes = []
  for (i, token) in enumerate(tokens):
    if token == vocab["[CLS]"] or token == vocab["[SEP]"]:
      continue
    # Whole Word Masking means that if we mask all of the wordpieces
    # corresponding to an original word. When a word has been split into
    # WordPieces, the first token does not have any marker and any subsequence
    # tokens are prefixed with ##. So whenever we see the ## token, we
    # append it to the previous set of word indexes.
    #
    # Note that Whole Word Masking does *not* change the training code
    # at all -- we still predict each WordPiece independently, softmaxed
    # over the entire vocabulary.
    if (FLAGS.do_whole_word_mask and len(cand_indexes) >= 1 and
        token.startswith("##")):
      cand_indexes[-1].append(i)
    else:
      cand_indexes.append([i])

  rng.shuffle(cand_indexes)

  output_tokens = list(tokens)

  num_to_predict = min(max_predictions_per_seq,
                       max(1, int(round(len(tokens) * masked_lm_prob))))

  masked_lms = []
  covered_indexes = set()
  for index_set in cand_indexes:
    if len(masked_lms) >= num_to_predict:
      break
    # If adding a whole-word mask would exceed the maximum number of
    # predictions, then just skip this candidate.
    if len(masked_lms) + len(index_set) > num_to_predict:
      continue
    is_any_index_covered = False
    for index in index_set:
      if index in covered_indexes:
        is_any_index_covered = True
        break
    if is_any_index_covered:
      continue
    for index in index_set:
      covered_indexes.add(index)

      masked_token = None
      # 80% of the time, replace with [MASK]
      if rng.random() < 0.8:
        masked_token = "[MASK]"
      else:
        # 10% of the time, keep original
        if rng.random() < 0.5:
          masked_token = tokens[index]
        # 10% of the time, replace with random word
        else:
          masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)]

      output_tokens[index] = masked_token

      masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
  assert len(masked_lms) <= num_to_predict
  masked_lms = sorted(masked_lms, key=lambda x: x.index)

  masked_lm_positions = []
  masked_lm_labels = []
  for p in masked_lms:
    masked_lm_positions.append(p.index)
    masked_lm_labels.append(p.label)

  return (output_tokens, masked_lm_positions, masked_lm_labels)

In [96]:
def convert_instance_to_examples(X, label, tokenizer):
    input_ids = X['input_ids']
    token_type_ids = X['token_type_ids']
    attention_mask = X['attention_mask']

    rng = random.Random(seed)
    masked_lm_prob = 0.1
    vocab = tokenizer.get_vocab()
    reverse_vocab = {v:k for k,v in vocab.items()}
    
    for i in tqdm(range(input_ids.shape[0])):  
        
        output_tokens, masked_lm_positions, masked_lm_labels = create_masked_lm_predictions(input_ids[i], masked_lm_prob, max_predictions_per_seq, vocab_words, rng)
        
        features = collections.OrderedDict()
        features["input_ids"] = create_int_feature(np.array(input_ids[i]))
        features["input_mask"] = create_int_feature(np.array(token_type_ids[i]))
        features["segment_ids"] = create_int_feature(np.array(attention_mask[i]))
        features["masked_lm_positions"] = create_int_feature(np.array(attention_mask[i]))
        features["masked_lm_ids"] = create_float_feature(np.array(attention_mask[i]))
        features["masked_lm_weights"] = create_int_feature(np.array(attention_mask[i]))
        features["next_sentence_labels"] = create_int_feature(np.array(label))
        
        
        break

In [97]:
labels = generate_onehot_labels()

for filename in os.listdir('../src/data'):
    if 'standardized' in filename:
        with open('../src/data/'+filename, 'rb') as f:
            X = pickle.load(f)
            y = np.zeros(len(labels.keys()))
            y[labels[filename.split('-')[0]]] = 1
            y = tf.convert_to_tensor(y, dtype=tf.int64)           
            
            convert_instance_to_examples(X, y, tokenizer)
            
            
            
#             print(y)
            
            
            
        break

  0%|                                                                                                                                                                                                                                                                                                                                     | 0/10002 [00:00<?, ?it/s]
