In [1]:
# !pip install tensorflow-gpu==1.15
# !pip install tf_slim==1.1.0
# !pip install "nltk>=3.5"
# !pip install "tqdm>=4.50.1"
# !pip install "numpy>=1.13.3"

In [2]:
# Mount Google Drive inside the Colab VM so the project folder stored there is reachable.
from google.colab import drive
drive.mount("/content/gdrive")
# Make the project folder the working directory: every 'smith/...' path and the
# `smith` package imports used later in this notebook are relative to it.
%cd /content/gdrive/MyDrive/Colab Notebooks/13 - NLP/Project

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/MyDrive/Colab Notebooks/13 - NLP/Project


In [3]:
import nltk
import pickle, random
from smith import smith_processing
from smith.config.config import dictConveter, encoder_config_dict
from sklearn.metrics import accuracy_score
from smith.bert import tokenization
import pandas as pd
import json
import time
# Fetch the NLTK 'punkt' data package (the call reports "already up-to-date" when cached).
# NOTE(review): presumably required by smith_processing for sentence splitting — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# SMITH Modeling

## Config

In [4]:
# Display the default encoder settings (config files, checkpoints, sentence/document length limits).
encoder_config_dict['encoder_config']

{'add_masks_lm': False,
 'bert_config_file': 'smith/config/sent_bert_4l_config.json',
 'doc_bert_config_file': 'smith/config/doc_bert_3l_256h_config.json',
 'doc_rep_combine_attention_size': 256,
 'doc_rep_combine_mode': 'normal',
 'init_checkpoint': 'smith/pretrained_model/SMITHWPSP/model.ckpt-400000',
 'loop_sent_number_per_doc': 64,
 'max_doc_length_by_sentence': 64,
 'max_masked_sent_per_doc': 0,
 'max_predictions_per_seq': 0,
 'max_sent_length_by_word': 32,
 'model_name': 'smith_dual_encoder',
 'predict_checkpoint': 'smith/pretrained_model/SMITHWPSP/model.ckpt-400000',
 'sent_bert_trainable': True,
 'use_masked_sentence_lm_loss': False}

In [5]:
# Display the default train/eval settings (batch size, step counts, input/output file locations).
encoder_config_dict['train_eval_config']

{'batch_size': 32,
 'gcp_project': None,
 'learning_rate': 5e-05,
 'master': None,
 'model_output_dir': 'smith/pretrained_model/export/',
 'neg_to_pos_example_ratio': 1.0,
 'num_tpu_cores': 8,
 'num_train_steps': 2,
 'num_warmup_steps': 1,
 'pred_output_file': 'smith/data/output/prediction_results.json',
 'processed_file': 'smith/data/processed_data_test.tfrecord',
 'save_checkpoints_steps': 10000000000.0,
 'tpu_name': None,
 'tpu_zone': None,
 'use_tpu': False}

In [6]:
# Overrides for the data-processing stage, applied on top of the defaults.
encoder_config_dict['encoder_config'].update({
    'max_sent_length_by_word': 32,
    'max_doc_length_by_sentence': 64,
    # 'max_predictions_per_seq': 5,
})
encoder_config_dict['train_eval_config'].update({
    'processed_file': 'smith/data/processed_data_masked_train.tfrecord',
})

# Wrap the dict with dictConveter (attribute-style access) and pickle it; the
# pickle is what smith.run_smith receives via --dual_encoder_config_file.
encoder_config = dictConveter(encoder_config_dict)
with open('smith/config/config.pickle', 'wb') as config_file:
  pickle.dump(encoder_config, config_file)

## Data Processing

In [None]:
# Tokenizer and vocabulary shared by the pre-processing cells below.
vocab_file = 'smith/config/vocab.txt'
tokenizer = tokenization.FullTokenizer(do_lower_case=True, vocab_file=vocab_file)
vocab_words = list(tokenizer.vocab.keys())

# Seeded generator so the random choices made during processing are repeatable.
rng = random.Random(12345)

# Masking probability: 0.15 when the config enables masked LM, otherwise 0.
add_masks_lm = encoder_config_dict['encoder_config']['add_masks_lm']
masked_lm_prob = 0.15 if add_masks_lm else 0

# max_predictions_per_seq = int(encoder_config_dict['encoder_config']['max_sent_length_by_word'] * masked_lm_prob)
# NOTE(review): the displayed config has add_masks_lm = False, yet the sample
# instance printed further down contains [MASK] tokens. Confirm that a fixed
# budget of 5 together with masked_lm_prob = 0 is the intended combination.
max_predictions_per_seq = 5
train_mode = 'finetune'

In [None]:
# Load the labelled document pairs, keep only rows that have both documents
# and a label, and store pair ids as strings.
df_record = (
    pd.read_csv('smith/data/wiki_doc_pair_train.csv')
    .dropna(subset=['doc_one', 'doc_two', 'doc_label'])
    .assign(pair_id=lambda frame: frame['pair_id'].astype(str))
)
df_record.head()

Unnamed: 0,doc_label,url_one,url_two,doc_one,doc_two,pair_id
0,1,http://en.wikipedia.org/wiki/Caitlin_Farrell,http://en.wikipedia.org/wiki/National_Women's_...,"Caitlin Farrell (born September 29, 1997) is a...",The National Women's Soccer League (NWSL) is a...,1623811144
1,0,http://en.wikipedia.org/wiki/ClamWin_Free_Anti...,http://en.wikipedia.org/wiki/McAfee_VirusScan,ClamWin Free Antivirus is a free and open-sour...,McAfee VirusScan is an antivirus program creat...,1623811145
2,1,http://en.wikipedia.org/wiki/Suicide_legislation,http://en.wikipedia.org/wiki/Bullying_and_suicide,Suicide is a crime in some parts of the world....,"Bullying is an undesirable, attacker behavior ...",1623811146
3,0,http://en.wikipedia.org/wiki/Arrow_(season_2),http://en.wikipedia.org/wiki/Thea_Queen,The second season of the American television s...,"Thea Dearden Queen, also known as Speedy, is a...",1623811147
4,1,http://en.wikipedia.org/wiki/USS_Barry_(DD-248),http://en.wikipedia.org/wiki/USS_Chandler_(DD-...,Barry (DD-248/APD-29) was a Clemson-class dest...,USS Chandler (DD-206/DMS-9/AG-108) was a Clem...,1623811148


In [None]:
# Build one SMITH instance per document pair and write them to the TFRecord
# file named in the config. Both documents are required: a row where either
# document yields no tokens is skipped (no dummy document is substituted).

# Loop-invariant inputs, resolved once instead of on every row.
encoder_settings = encoder_config_dict['encoder_config']
max_sent_length_by_word = encoder_settings['max_sent_length_by_word']
max_doc_length_by_sentence = encoder_settings['max_doc_length_by_sentence']
vocab_words = list(tokenizer.vocab.keys())

instances = []
for _, row in df_record.iterrows():
  # The [0, 0] argument is passed through exactly as before; its meaning is
  # defined by smith_processing.get_smith_model_tokens (TODO confirm).
  doc_one_tokens, _ = smith_processing.get_smith_model_tokens(row['doc_one'], tokenizer, [0, 0])
  doc_two_tokens, _ = smith_processing.get_smith_model_tokens(row['doc_two'], tokenizer, [0, 0])

  if not (doc_one_tokens and doc_two_tokens):
    continue

  instances.append(
      smith_processing.create_instance_from_wiki_doc_pair(
          instance_id=row['pair_id'],
          doc_match_label=row['doc_label'],
          doc_one_tokens=doc_one_tokens,
          doc_two_tokens=doc_two_tokens,
          max_sent_length_by_word=max_sent_length_by_word,
          max_doc_length_by_sentence=max_doc_length_by_sentence,
          masked_lm_prob=masked_lm_prob,
          max_predictions_per_seq=max_predictions_per_seq,
          vocab_words=vocab_words,
          rng=rng,
          tokenizer=tokenizer,
          ))

# rng.shuffle(instances)
# NOTE(review): `features` is not read by any cell visible in this notebook —
# confirm whether this call is still needed.
features = smith_processing.transform_features(instances=instances, tokenizer=tokenizer)
smith_processing.write_instance_to_example_files(
    instances=instances,
    tokenizer=tokenizer,
    output_files=encoder_config_dict['train_eval_config']['processed_file'],
)

In [None]:
# Sanity check: print the first processed instance (id, label and token sequence).
print(instances[0])

instance_id: 1623811144
documents_match_labels: 1
tokens_1: [CLS] caitlin [MASK] ( born september 29 , 1997 ) is a former american soccer player who played professionally as a forward for orlando pride in the nw ##sl . [SEP] [SEP] [CLS] early life farrell [MASK] a three - sport athlete at cho ##ate rosemary hall , playing soccer , basketball and lacrosse . [SEP] [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [CLS] an all - america selection by the national soccer [MASK] association of america ( ns ##ca ##a ) in 2014 , she was selected to play in the high [SEP] [SEP] [CLS] she also played club soccer with connecticut fc ec ##nl and the [MASK] i olympic development program . [SEP] [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [CLS] georgetown ho ##yas farrell played collegiate soccer at georgetown university where she won three consecutive big east conference titles and made two appearances at the ncaa [MASK] ' s [SEP] [SEP] [CLS] in [MASK] senior year , farrel

## Fine Tuning

In [7]:
# Fine-tuning run: where checkpoints are written, how long to train, and
# which TFRecord file to read.
encoder_config_dict['train_eval_config'].update({
    'model_output_dir': 'smith/pretrained_model/export/',
    'num_train_steps': 30000,
    'num_warmup_steps': 1000,
    'processed_file': 'smith/data/processed_data_masked_train.tfrecord',
    # 'learning_rate': 5e-05,
})
encoder_config_dict['encoder_config']['sent_bert_trainable'] = True

# Wrap the dict with dictConveter (attribute-style access) and pickle it; the
# pickle is what smith.run_smith receives via --dual_encoder_config_file.
encoder_config = dictConveter(encoder_config_dict)
with open('smith/config/config.pickle', 'wb') as config_file:
  pickle.dump(encoder_config, config_file)

In [None]:
# Run fine-tuning in a subprocess, driven by the pickled config written by the
# previous cell, and print the elapsed wall-clock time in seconds.
start = time.time()
!python -m smith.run_smith --dual_encoder_config_file=smith/config/config.pickle --schedule=train --train_mode=finetune
print(time.time() - start)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
INFO:tensorflow:global_step/sec: 1.20082
I0624 07:24:28.736545 140623216289664 tpu_estimator.py:2307] global_step/sec: 1.20082
INFO:tensorflow:examples/sec: 38.4262
I0624 07:24:28.736937 140623216289664 tpu_estimator.py:2308] examples/sec: 38.4262
INFO:tensorflow:global_step/sec: 1.19431
I0624 07:24:29.573807 140623216289664 tpu_estimator.py:2307] global_step/sec: 1.19431
INFO:tensorflow:examples/sec: 38.218
I0624 07:24:29.574214 140623216289664 tpu_estimator.py:2308] examples/sec: 38.218
INFO:tensorflow:global_step/sec: 1.19476
I0624 07:24:30.410803 140623216289664 tpu_estimator.py:2307] global_step/sec: 1.19476
INFO:tensorflow:examples/sec: 38.2322
I0624 07:24:30.411191 140623216289664 tpu_estimator.py:2308] examples/sec: 38.2322
INFO:tensorflow:global_step/sec: 1.20847
I0624 07:24:31.238313 140623216289664 tpu_estimator.py:2307] global_step/sec: 1.20847
INFO:tensorflow:examples/sec: 38.671
I0624 07:24:31.238717 1406232

## Document Embedding

In [10]:
# Prediction run: point the predict checkpoint at the fine-tuned model, read
# the test TFRecord, and choose where the prediction results are saved.
encoder_config_dict['encoder_config'].update({
    'predict_checkpoint': 'smith/pretrained_model/export/model.ckpt-30000',
    # 'predict_checkpoint': 'smith/pretrained_model/SMITHWPSP/model.ckpt-400000',
    'sent_bert_trainable': False,
})
encoder_config_dict['train_eval_config'].update({
    'pred_output_file': 'smith/data/output/prediction_results.json',
    'processed_file': 'smith/data/processed_data_test.tfrecord',
})

# Wrap the dict with dictConveter (attribute-style access) and pickle it; the
# pickle is what smith.run_smith receives via --dual_encoder_config_file.
encoder_config = dictConveter(encoder_config_dict)
with open('smith/config/config.pickle', 'wb') as config_file:
  pickle.dump(encoder_config, config_file)

In [11]:
# Run smith.run_smith with --schedule=predict, using the pickled config written
# by the previous cell (predict checkpoint, input TFRecord and output file).
!python -m smith.run_smith --dual_encoder_config_file=smith/config/config.pickle --train_mode=finetune --schedule=predict


W0624 03:37:17.861091 140418059741056 module_wrapper.py:139] From /content/gdrive/My Drive/Colab Notebooks/13 - NLP/Project/smith/run_smith.py:90: The name tf.estimator.tpu.InputPipelineConfig is deprecated. Please use tf.compat.v1.estimator.tpu.InputPipelineConfig instead.


W0624 03:37:17.861358 140418059741056 module_wrapper.py:139] From /content/gdrive/My Drive/Colab Notebooks/13 - NLP/Project/smith/run_smith.py:91: The name tf.estimator.tpu.RunConfig is deprecated. Please use tf.compat.v1.estimator.tpu.RunConfig instead.


W0624 03:37:17.861498 140418059741056 module_wrapper.py:139] From /content/gdrive/My Drive/Colab Notebooks/13 - NLP/Project/smith/run_smith.py:96: The name tf.estimator.tpu.TPUConfig is deprecated. Please use tf.compat.v1.estimator.tpu.TPUConfig instead.


W0624 03:37:17.861814 140418059741056 module_wrapper.py:139] From /content/gdrive/My Drive/Colab Notebooks/13 - NLP/Project/smith/run_smith.py:111: The name tf.estimator.tpu.TPUEstimator is deprecated. Please

In [12]:
# The model run stores its results in a JSON file rather than holding them in
# RAM; load that file back here for evaluation.
pred_output_path = encoder_config_dict['train_eval_config']['pred_output_file']
with open(pred_output_path) as pred_file:
  data = json.load(pred_file)

In [13]:
# Inspect one prediction record to see which fields the output file carries.
data[20]

{'documents_match_labels': '0.0',
 'input_sent_embed_1': '0.09292083,0.02544577,0.0,0.0,0.0,0.0034819078,0.05004858,0.121089816,0.0,0.0008553202,0.0,0.0,0.0,0.036140736,0.0,0.0,0.0,0.0,0.0,0.057585407,0.0,0.0,0.0,0.16339095,0.0,0.0,0.01313854,0.0240252,0.0,0.041658744,0.004972238,0.0,0.055396806,0.0,0.0,0.035663635,0.0,0.25518054,0.0,0.0,0.11363055,0.04049609,0.0,0.014964099,0.046285868,0.11860275,0.0,0.0,0.0,0.041375194,0.26038766,0.0,0.0,0.0,0.0,0.1309543,0.10385044,0.011932806,0.0,0.0,0.0,0.062287685,0.0,0.0,0.12430599,0.1865007,0.06166808,0.0933845,0.0,0.025369314,0.0,0.117355295,0.0,0.08676425,0.077839874,0.0,0.1814702,0.039769016,0.0,0.0,0.14988305,0.08369023,0.0,0.0,0.11596614,0.0,0.0,0.0,0.020605627,0.0,0.033429593,0.022828603,0.03990387,0.0,0.060282975,0.0,0.0,0.060790207,0.0,0.0,0.0181457,0.0,0.0,0.0010326685,0.0,0.03710154,0.028445382,0.059406992,0.0,0.0,0.035610355,0.0,0.0,0.0,0.038088165,0.0,0.0,0.023391288,0.005541835,0.0,0.0,0.0,0.0,0.059982825,0.0315301,0.0,0.08846215,0

In [14]:
# Split the prediction records into parallel lists: gold label, predicted
# class and predicted score, in file order.
label, pred, score = [], [], []
for record in data:
  label.append(record['documents_match_labels'])
  pred.append(record['predicted_class'])
  score.append(record['predicted_score'])

In [15]:
# Fraction of records whose predicted class equals the gold label.
# NOTE(review): labels are stored as strings (e.g. '0.0'); this relies on
# predicted_class using the same string format — confirm.
accuracy_score(label, pred)

0.5508532423208191

# Downstream Task after SMITH

In [None]:
# 0.485
# 0.4
# 0.362
# 0.348

# 0.48
# 0.54