In [None]:
# One-time Colab setup: select TensorFlow 1.x, install google-drive-ocamlfuse
# from the alessandro-strada PPA, and authorize it for this user.
%tensorflow_version 1.x
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
# NOTE(review): with `2>&1 > /dev/null` only stdout is discarded; stderr is
# still shown in the cell output. Use `> /dev/null 2>&1` if both streams are
# meant to be silenced.
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse
# Authenticate the Colab user, then fetch the application-default credentials
# whose client id/secret are passed to google-drive-ocamlfuse below.
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
# First pass: run headless with no stdin and show only the output lines that
# contain "URL" (presumably the authorization link to open in a browser).
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
# Second pass: prompt (without echo) for the verification code and pipe it to
# google-drive-ocamlfuse on stdin.
# NOTE(review): the code, client id and client secret are interpolated into
# shell command lines here; treat them as sensitive.
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

In [None]:
# Create ./drive, run google-drive-ocamlfuse on it (presumably mounting Google
# Drive there), and make drive/bert-paper/ the working directory so that
# run_cs.py and the relative DATA_DIR used later resolve from there.
%tensorflow_version 1.x
!mkdir -p drive
!google-drive-ocamlfuse drive
import os
# The path is relative to the current working directory, so re-running this
# cell after the chdir has already happened will look for drive/ inside
# drive/bert-paper/ instead of the original location.
os.chdir("drive/bert-paper/")

In [None]:
import datetime
import json
import os
import pprint
import random
import string
import sys
import tensorflow as tf

# The TPU worker address is read from the COLAB_TPU_ADDR environment variable;
# stop early with a readable message when it is not set.
assert 'COLAB_TPU_ADDR' in os.environ, 'ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!'
TPU_ADDRESS = 'grpc://{}'.format(os.environ['COLAB_TPU_ADDR'])
print('TPU address is', TPU_ADDRESS)

from google.colab import auth
auth.authenticate_user()

with tf.Session(TPU_ADDRESS) as tpu_session:
  print('TPU devices:')
  tpu_devices = tpu_session.list_devices()
  pprint.pprint(tpu_devices)

  # Hand the credentials stored in /content/adc.json (presumably written by
  # auth.authenticate_user() above) to the TPU so it can access GCS.
  with open('/content/adc.json', 'r') as adc_file:
    auth_info = json.load(adc_file)
  tf.contrib.cloud.configure_gcs(tpu_session, credentials=auth_info)
  # From here on the credentials are set for all future sessions on this TPU.

In [None]:
# Experiment configuration. FEATURES starts as a list of input fields and is
# converted to a comma-separated string at the end of this cell, which is the
# form the later cells use ($FEATURES on the command line, FEATURES.split(",")).
FEATURES = ["title", "keywords", "abstract"]

PRETRAINED_MODEL_DIR = "gs://bert-eng/scibert_scivocab_uncased"
DATA_DIR = "../data/"  # Local data directory
# The output bucket path encodes the selected features joined with "+".
OUTDIR = "gs://paper/scibert-cs-%s/" % "+".join(FEATURES)

# Sequence length: 512 whenever the abstract is included; otherwise 256 for
# exactly two features and 128 for anything else.
if "abstract" not in FEATURES:
    max_seq_length = 256 if len(FEATURES) == 2 else 128
else:
    max_seq_length = 512

FEATURES = ",".join(FEATURES)

In [None]:
# Training run: invokes run_cs.py with do_train=True / do_eval=False on the TPU,
# starting from the pretrained checkpoint under PRETRAINED_MODEL_DIR and writing
# to OUTDIR. The $NAME placeholders are filled in by IPython from the Python
# variables DATA_DIR, FEATURES, PRETRAINED_MODEL_DIR, OUTDIR, max_seq_length and
# TPU_ADDRESS, so the TPU and configuration cells must have been run first.
# A checkpoint is requested every 1000 steps and up to 1000 of them are kept.
!python3 run_cs.py \
    --data_dir=$DATA_DIR \
    --features=$FEATURES \
    --bert_config_file=$PRETRAINED_MODEL_DIR/bert_config.json \
    --vocab_file=$PRETRAINED_MODEL_DIR/vocab.txt \
    --output_dir=$OUTDIR \
    --init_checkpoint=$PRETRAINED_MODEL_DIR/bert_model.ckpt \
    --do_lower_case=True \
    --max_seq_length=$max_seq_length \
    --do_train=True \
    --do_eval=False \
    --num_train_epochs=10 \
    --train_batch_size=128 \
    --save_checkpoints_steps=1000 \
    --keep_checkpoint_max=1000 \
    --use_tpu=True \
    --tpu_name=$TPU_ADDRESS

In [None]:
# Evaluation run: invokes run_cs.py with do_train=False / do_eval=True against
# the same OUTDIR as the training run. No --init_checkpoint is passed, so
# run_cs.py presumably evaluates the checkpoint(s) already in OUTDIR -- confirm
# against run_cs.py.
# NOTE(review): --save_checkpoints_steps and --keep_checkpoint_max=20 are passed
# even though training is disabled, and the value differs from the training
# cell (1000). Verify that run_cs.py does not prune checkpoints in OUTDIR based
# on this flag during evaluation.
!python3 run_cs.py \
    --data_dir=$DATA_DIR \
    --features=$FEATURES \
    --bert_config_file=$PRETRAINED_MODEL_DIR/bert_config.json \
    --vocab_file=$PRETRAINED_MODEL_DIR/vocab.txt \
    --output_dir=$OUTDIR \
    --do_lower_case=True \
    --max_seq_length=$max_seq_length \
    --do_train=False \
    --do_eval=True \
    --eval_batch_size=128 \
    --save_checkpoints_steps=1000 \
    --keep_checkpoint_max=20 \
    --use_tpu=True \
    --tpu_name=$TPU_ADDRESS

In [None]:
!python3 run_cs.py \
    --data_dir=$DATA_DIR \
    --features=$FEATURES \
    --bert_config_file=$PRETRAINED_MODEL_DIR/bert_config.json \
    --vocab_file=$PRETRAINED_MODEL_DIR/vocab.txt \
    --output_dir=$OUTDIR \
    --do_lower_case=True \
    --max_seq_length=$max_seq_length \
    --do_predict=True \
    --predict_checkpoint_path=gs://paper/scibert-cs-keywords+abstract/model.ckpt \
    --predict_batch_size=128 \
    --use_tpu=True \
    --tpu_name=$TPU_ADDRESS

In [None]:
import os
import numpy as np
from run_cs import PaperProcessor
import json


def top_k_accuracy(labels, scores, k):
    """Return the fraction of rows whose true label is among the k best scores.

    Args:
        labels: 1-D integer array with the true class index of each example.
        scores: 2-D array of per-class scores, shape (n_examples, n_classes).
        k: how many of the highest-scoring classes to accept per example (>= 1).

    Returns:
        The top-k accuracy as a float between 0 and 1.
    """
    # argsort is ascending, so the last k columns hold the k best classes.
    top_k = np.argsort(scores, axis=-1)[:, -k:]
    return np.mean(np.any(top_k == labels[:, None], axis=-1))


# Load the per-class probabilities from OUTDIR/test_results.tsv (presumably
# written by the do_predict run): one example per line, whitespace-separated.
results_path = os.path.join(OUTDIR, "test_results.tsv")
with tf.gfile.Open(results_path) as results_file:  # closes the GCS handle
    lines = [line.strip() for line in results_file]
lines = [line for line in lines if line]  # ignore blank lines
if not lines:
    raise ValueError("No predictions found in {}".format(results_path))
probabilities = np.array(
    [[float(value) for value in line.split()] for line in lines], dtype=float)
np.save("cs-scibert-{}-valid.npy".format(FEATURES.replace(",", "+")), probabilities)

# Map each journal label to its class index, in the order the processor
# reports its labels.
processor = PaperProcessor(FEATURES.split(","))
prosessor = processor  # original (misspelled) name, kept for any later cells
journal_2_index = {journal: i for i, journal in enumerate(processor.get_labels())}

# True labels of the validation set.
# Assumes the prediction run scored cs-valid.jsonl in file order -- TODO confirm.
valid_path = os.path.join(DATA_DIR, "cs-valid.jsonl")
y_valid = np.asarray([
    journal_2_index[record['journal'].lower()]
    for record in processor._read_jsonl(valid_path)
])

# Without this check a length mismatch can silently yield a meaningless score.
if len(y_valid) != len(probabilities):
    raise ValueError(
        "Got {} predictions but {} validation labels".format(
            len(probabilities), len(y_valid)))

# Top-1, top-3, top-5 and top-10 accuracy, one value per line.
for k in (1, 3, 5, 10):
    print(top_k_accuracy(y_valid, probabilities, k))