# Load Library

In [None]:
# Colab-only setup: install google-drive-ocamlfuse and authorise it against the
# signed-in Google account so Drive can be mounted as a filesystem later on.
%tensorflow_version 1.x
# Tooling required to register a PPA, then the PPA that ships google-drive-ocamlfuse.
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
# NOTE(review): the shell applies redirections left to right, so `2>&1 > /dev/null`
# discards stdout only; stderr still reaches the cell output. Use
# `> /dev/null 2>&1` if both streams are meant to be silenced.
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse
# Interactive Colab login for the current user.
from google.colab import auth
auth.authenticate_user()
# Read the application-default credentials; their client id/secret are passed
# to google-drive-ocamlfuse below.
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
# First headless run: stdin is /dev/null and only the output lines containing
# "URL" are shown, i.e. the address the user has to visit.
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
# Prompt for the verification code without echoing it in the notebook output.
vcode = getpass.getpass()
# Second headless run: feed the verification code on stdin.
# NOTE(review): {vcode} and the client secret are interpolated unquoted into a
# shell command line; confirm this is acceptable for the environment in use.
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

In [None]:
# Mount Google Drive at ./drive with google-drive-ocamlfuse and switch the
# working directory to the project folder inside it.
%tensorflow_version 1.x
!mkdir -p drive
!google-drive-ocamlfuse drive
import os
# Relative paths used by the following cells (config file, pickles, data
# files) are resolved against this directory.
# NOTE(review): the path is relative, so re-running this cell after the chdir
# has already happened looks for drive/electra-paper/ inside the project
# folder; restart the runtime before re-running.
os.chdir("drive/electra-paper/")

In [None]:
import datetime
import json
import os
import pprint
import random
import string
import sys
import tensorflow as tf

# The TPU endpoint is read from COLAB_TPU_ADDR; stop right away when the
# variable is absent instead of failing later inside TensorFlow.
assert 'COLAB_TPU_ADDR' in os.environ, 'ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!'
TPU_ADDRESS = 'grpc://{}'.format(os.environ['COLAB_TPU_ADDR'])
print('TPU address is', TPU_ADDRESS)

# Authenticate the Colab user before the credential file below is read.
from google.colab import auth
auth.authenticate_user()

with tf.Session(TPU_ADDRESS) as session:
  print('TPU devices:')
  tpu_devices = session.list_devices()
  pprint.pprint(tpu_devices)

  # Hand the credentials stored in /content/adc.json to the TPU so it can
  # access Google Cloud Storage.
  with open('/content/adc.json', 'r') as adc_file:
    auth_info = json.load(adc_file)
  tf.contrib.cloud.configure_gcs(session, credentials=auth_info)
  # From here on the credentials are set for all future sessions on this TPU.

# Main

## CS

In [None]:
FEATURES = ["title", "keywords", "abstract"]
MODEL = "model.ckpt-690" if "keywords" in FEATURES else "model.ckpt-740"
if "abstract" in FEATURES:
    max_seq_length = 512
elif len(FEATURES) == 2:
    max_seq_length = 256
else:
    max_seq_length = 128

hparams = {
    "task_names": ["cs-paper"],
    "features": FEATURES,
    "model_dir": "gs://paper/electra-cspaper-{}/".format("+".join(FEATURES)), 
    "preprocessed_data_dir": "gs://paper/electra-cspaper-{}/".format("+".join(FEATURES)),

    "model_size": "base",
    "max_seq_length": max_seq_length,
    "vocab_file": "gs://bert-eng/electra_base/vocab.txt",
    "init_checkpoint": "gs://bert-eng/electra_base/electra_base",
    "do_lower_case": True, 
    "keep_all_models": True,

    "do_train": True,
    "train_batch_size": 128,
    "num_train_epochs": 10.0,
    "save_checkpoints_steps": 1000, 
    "iterations_per_loop": 1000,
    "use_tfrecords_if_existing": False,

    "do_eval": True,
    "do_test": True,
    "eval_batch_size": 128,
    "predict_batch_size": 128,
    "results_txt": "gs://paper/electra-cspaper-{}/results.txt".format("+".join(FEATURES)),
    "results_pkl": "gs://paper/electra-cspaper-{}/results.pkl".format("+".join(FEATURES)),
    
    "use_tpu": True,
    "num_tpu_cores": 8,
    "tpu_name": TPU_ADDRESS,
}

hparams.update({
    "do_train": False,
    "do_eval": False,
    "do_test": False,
    "do_predict": True,
    "predict_checkpoint_path": "gs://paper/electra-cspaper-{}/_1/{}".format("+".join(FEATURES), MODEL),
    "predict_split": "train,dev",
})


import json
with open("paper_config.json", "w") as outfile:
    json.dump(hparams, outfile)

!python3 run_finetuning.py \
    --data-dir=cs-paper/ \
    --model-name=test \
    --hparams=paper_config.json

In [None]:
# Top-k accuracy of the CS-paper predictions on the dev and train splits.
# Helpers are defined in this cell so it can be re-run on its own.
import json
import pickle

import numpy as np

from finetune.classification.classification_tasks import cs_labels

# Cut-offs reported for every split.
TOP_KS = (1, 3, 5, 10)


def _has_text(record, feature):
    """Return True when `record[feature]` exists and is a non-empty string."""
    value = record.get(feature)
    return isinstance(value, str) and len(value) >= 1


def _load_logits(pickle_path, logits_key):
    """Load a prediction pickle and stack the logits of task 0.

    Returns a `(predictions, logits)` pair: the unpickled object and a NumPy
    array built from `item[logits_key]` of every item whose `task_id` is 0.
    """
    # NOTE: pickle can execute arbitrary code while loading; only open
    # prediction files produced by your own run_finetuning.py run.
    with open(pickle_path, "rb") as infile:
        predictions = pickle.load(infile)
    logits = np.asarray(
        [item[logits_key] for item in predictions if item['task_id'] == 0])
    return predictions, logits


def _load_labels(jsonl_path, label_index, required_features):
    """Read gold label ids from a JSON-lines file.

    A record is kept only when every field in `required_features` is a
    non-empty string; its lower-cased `journal` is mapped through
    `label_index`. The file is closed as soon as it has been read.
    """
    labels = []
    with open(jsonl_path) as infile:
        for raw_line in infile:
            record = json.loads(raw_line)
            if all(_has_text(record, feature) for feature in required_features):
                labels.append(label_index[record['journal'].lower()])
    return np.asarray(labels)


def _print_top_k_accuracies(labels, logits, ks=TOP_KS):
    """Print the top-k accuracy for each k in `ks`, then an empty line.

    Raises:
        ValueError: if the number of labels differs from the number of
            prediction rows. Without this check older NumPy versions compare
            the mismatched arrays to a single False and report 0.0.
    """
    if len(labels) != len(logits):
        raise ValueError(
            "label/prediction count mismatch: {} labels vs {} predictions".format(
                len(labels), len(logits)))
    # Sort once (ascending); the last k columns are the k best-scored classes.
    ranked = np.argsort(logits, axis=-1)
    gold = labels.reshape(-1, 1)
    for k in ks:
        top_k = ranked[:, :-(k + 1):-1]
        # A row contains at most one match, so mean-over-all-cells * k is the
        # fraction of rows whose gold label is among the top k.
        print(np.mean(gold == top_k) * k)
    print()


journal_2_index = {journal: i for i, journal in enumerate(cs_labels)}

# --- dev split ---------------------------------------------------------------
# `probabilities` holds the values stored under the "cs-paper_logits" key; the
# ranking below only depends on their order.
predict, probabilities = _load_logits("predict_dev.pickle", 'cs-paper_logits')
# Dev records are kept only when every configured feature is present.
y_valid = _load_labels("../data/cs-paper/valid.jsonl", journal_2_index,
                       hparams['features'])
_print_top_k_accuracies(y_valid, probabilities)

# --- train split -------------------------------------------------------------
predict, probabilities = _load_logits("predict_train.pickle", 'cs-paper_logits')
np.save("cs-electra-{}-train.npy".format("+".join(hparams['features'])), probabilities)

# NOTE(review): train records are filtered on "keywords" only (and only when
# keywords are a configured feature), whereas dev records are filtered on every
# feature -- confirm this matches how the prediction inputs were built.
train_required = ["keywords"] if "keywords" in FEATURES else []
# The name `y_valid` is kept for the train labels so the variables left behind
# by this cell are unchanged.
y_valid = _load_labels("../data/cs-paper/train.jsonl", journal_2_index,
                       train_required)
_print_top_k_accuracies(y_valid, probabilities)



## Springer

In [None]:
hparams = {
    "task_names": ["springer-paper"],
    "features": ["title", "keywords", "abstract"],
    "model_dir": "gs://paper/electra-springerpaper-title+keywords+abstract/", 
    "preprocessed_data_dir": "gs://paper/electra-springerpaper-title+keywords+abstract/",

    "model_size": "base",
    "max_seq_length": 512,
    "vocab_file": "gs://bert-eng/electra_base/vocab.txt",
    "init_checkpoint": "gs://bert-eng/electra_base/electra_base",
    "do_lower_case": True, 
    "keep_all_models": True,

    "do_train": True,
    "train_batch_size": 128,
    "num_train_epochs": 10.0,
    "save_checkpoints_steps": 1000, 
    "iterations_per_loop": 1000,
    "use_tfrecords_if_existing": False,

    "do_eval": True,
    "do_test": True,
    "eval_key": "top1_accuracy",
    "eval_batch_size": 128,
    "predict_batch_size": 128,
    "results_txt": "gs://paper/electra-springerpaper-title+keywords+abstract/results.txt",
    "results_pkl": "gs://paper/electra-springerpaper-title+keywords+abstract/results.pkl",
    
    "use_tpu": True,
    "num_tpu_cores": 8,
    "tpu_name": TPU_ADDRESS,
}

hparams.update({
    "do_train": False,
    "do_eval": False,
    "do_test": False,
    "do_predict": True,
    "predict_checkpoint_path": "gs://paper/electra-springerpaper-title+keywords+abstract/_1/model.ckpt-15000",
    "predict_split": "train,dev",
})


import json
with open("paper_config.json", "w") as outfile:
    json.dump(hparams, outfile)

!python3 run_finetuning.py \
    --data-dir=springer-paper/ \
    --model-name=test \
    --hparams=paper_config.json

In [None]:
# Top-k accuracy of the Springer-paper predictions on the dev and train splits.
# Helpers are defined in this cell so it can be re-run on its own.
import json
import pickle

import numpy as np

from finetune.classification.classification_tasks import springer_labels

# Cut-offs reported for every split.
TOP_KS = (1, 3, 5, 10)


def _has_text(record, feature):
    """Return True when `record[feature]` exists and is a non-empty string."""
    value = record.get(feature)
    return isinstance(value, str) and len(value) >= 1


def _load_logits(pickle_path, logits_key):
    """Load a prediction pickle and stack the logits of task 0.

    Returns a `(predictions, logits)` pair: the unpickled object and a NumPy
    array built from `item[logits_key]` of every item whose `task_id` is 0.
    """
    # NOTE: pickle can execute arbitrary code while loading; only open
    # prediction files produced by your own run_finetuning.py run.
    with open(pickle_path, "rb") as infile:
        predictions = pickle.load(infile)
    logits = np.asarray(
        [item[logits_key] for item in predictions if item['task_id'] == 0])
    return predictions, logits


def _load_labels(jsonl_path, label_index, required_features):
    """Read gold label ids from a JSON-lines file.

    A record is kept only when every field in `required_features` is a
    non-empty string; its lower-cased `journal` is mapped through
    `label_index`. The file is closed as soon as it has been read.
    """
    labels = []
    with open(jsonl_path) as infile:
        for raw_line in infile:
            record = json.loads(raw_line)
            if all(_has_text(record, feature) for feature in required_features):
                labels.append(label_index[record['journal'].lower()])
    return np.asarray(labels)


def _print_top_k_accuracies(labels, logits, ks=TOP_KS):
    """Print the top-k accuracy for each k in `ks`, then an empty line.

    Raises:
        ValueError: if the number of labels differs from the number of
            prediction rows. Without this check older NumPy versions compare
            the mismatched arrays to a single False and report 0.0.
    """
    if len(labels) != len(logits):
        raise ValueError(
            "label/prediction count mismatch: {} labels vs {} predictions".format(
                len(labels), len(logits)))
    # Sort once (ascending); the last k columns are the k best-scored classes.
    ranked = np.argsort(logits, axis=-1)
    gold = labels.reshape(-1, 1)
    for k in ks:
        top_k = ranked[:, :-(k + 1):-1]
        # A row contains at most one match, so mean-over-all-cells * k is the
        # fraction of rows whose gold label is among the top k.
        print(np.mean(gold == top_k) * k)
    print()


journal_2_index = {journal: i for i, journal in enumerate(springer_labels)}

# --- dev split ---------------------------------------------------------------
# `probabilities` holds the values stored under the "springer-paper_logits"
# key; the ranking below only depends on their order.
predict, probabilities = _load_logits("predict_dev.pickle", 'springer-paper_logits')
# Dev records are kept only when every configured feature is present.
y_valid = _load_labels("../data/springer-paper/valid.jsonl", journal_2_index,
                       hparams['features'])
_print_top_k_accuracies(y_valid, probabilities)

# --- train split -------------------------------------------------------------
predict, probabilities = _load_logits("predict_train.pickle", 'springer-paper_logits')
np.save("springer-electra-{}-train.npy".format("+".join(hparams['features'])), probabilities)

# NOTE(review): no feature filter is applied to the train records, whereas dev
# records are filtered on every feature -- confirm this matches how the
# prediction inputs were built.
# The name `y_valid` is kept for the train labels so the variables left behind
# by this cell are unchanged.
y_valid = _load_labels("../data/springer-paper/train.jsonl", journal_2_index, [])
_print_top_k_accuracies(y_valid, probabilities)

