# Library

In [None]:
# Colab magic: select TensorFlow 1.x for this runtime.
%tensorflow_version 1.x
# Install google-drive-ocamlfuse and fuse, with the alessandro-strada PPA added as a package source.
# NOTE(review): with the `2>&1 > /dev/null` ordering only stdout is discarded;
# stderr still reaches the cell output. Confirm that this is intended.
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse
# Authenticate the current Colab user.
from google.colab import auth
auth.authenticate_user()
# Fetch the application-default credentials; their client id and secret are
# passed to google-drive-ocamlfuse in the shell commands below.
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
# First headless run with stdin closed; only output lines containing "URL" are
# shown (presumably the authorization link to open in a browser; confirm).
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
# Prompt for the verification code without echoing it in the notebook.
vcode = getpass.getpass()
# Second run: the verification code is piped to the tool on stdin.
# NOTE(review): the code is interpolated into a shell command line; treat it as sensitive.
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

In [None]:
# Colab magic: select TensorFlow 1.x for this runtime.
%tensorflow_version 1.x
# Create the mount point and mount Google Drive on it with google-drive-ocamlfuse.
!mkdir -p drive
!google-drive-ocamlfuse drive
import os
# Work from the project folder on the mounted Drive; later cells use paths
# relative to it (run_classifier_cs.py, spiece.model, ../data/cs-paper/).
# NOTE(review): every path in this cell is relative, so re-running the cell
# resolves `drive` against the already-changed working directory. Run it once
# per runtime, or confirm the behavior before re-running.
os.chdir("drive/xlnet-paper/")

In [None]:
import datetime
import json
import os
import pprint
import random
import string
import sys
import tensorflow as tf

# Fail early when the runtime has no TPU attached: the TPU address is read from
# the COLAB_TPU_ADDR environment variable.
assert 'COLAB_TPU_ADDR' in os.environ, 'ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!'
# gRPC endpoint of the TPU; also passed to run_classifier_cs.py via --tpu.
TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']
print('TPU address is', TPU_ADDRESS)

# Authenticate the Colab user before the credentials file is read below.
# NOTE(review): /content/adc.json is presumably written by authenticate_user(); confirm.
from google.colab import auth
auth.authenticate_user()

# Open a TF1 session against the TPU, list its devices, and hand it the
# credentials loaded from the JSON file.
with tf.Session(TPU_ADDRESS) as session:
  print('TPU devices:')
  pprint.pprint(session.list_devices())

  # Upload credentials to TPU.
  with open('/content/adc.json', 'r') as f:
    auth_info = json.load(f)
  tf.contrib.cloud.configure_gcs(session, credentials=auth_info)
  # Now credentials are set for all future sessions on this TPU.

In [None]:
# Install SentencePiece for the python3 interpreter (presumably needed by
# run_classifier_cs.py to load spiece.model; confirm).
# NOTE(review): the version is not pinned, so results may vary between runs.
!pip3 install sentencepiece

In [None]:
# Paper fields fed to the classifier. Alternative setting: ["title"]
FEATURES = ["title", "keywords", "abstract"]

# Local data directory (relative to the working directory) and GCS locations.
DATA_DIR = "../data/cs-paper/"
PRETRAIN_MODEL_DIR = "gs://bert-eng/xlnet-base-cased"
# The output bucket path encodes the chosen feature set, joined with "+".
OUTPUT_DIR = "gs://paper/xlnet-base-cs-" + "+".join(FEATURES) + "/"

# Sequence length: 512 whenever the abstract is included, otherwise 256 for
# exactly two features and 128 for anything else.
max_seq_length = (
    512 if "abstract" in FEATURES
    else 256 if len(FEATURES) == 2
    else 128
)

# Comma-separated feature list in the form passed to --features.
fff = ",".join(FEATURES)

# Main code

In [None]:
!python3 run_classifier_cs.py \
  --features=$fff \
  --overwrite_data=True \
  --eval_split=dev \
  --use_tpu=True \
  --tpu=$TPU_ADDRESS \
  --do_train=False \
  --do_eval=False \
  --do_predict=True \
  --data_dir=$DATA_DIR \
  --predict_ckpt=$OUTPUT_DIR/model.ckpt \
  --output_dir=$OUTPUT_DIR \
  --model_dir=$OUTPUT_DIR \
  --predict_dir=$OUTPUT_DIR \
  --uncased=True \
  --spiece_model_file=spiece.model \
  --model_config_path=$PRETRAIN_MODEL_DIR/xlnet_config.json \
  --max_seq_length=$MAX_SEQ_LENGTH \
  --train_batch_size=80 \
  --eval_batch_size=80 \
  --num_hosts=1 \
  --num_core_per_host=8 \
  --learning_rate=5e-5 \
  --num_train_epochs=14.0 \
  --warmup_proportion=0.1 \
  --save_steps=1000 \
  --iterations=1000

In [None]:
import json

import numpy as np

from run_classifier_cs import PaperProcessor


def top_k_accuracy(scores, labels, k):
    """Return the fraction of rows whose true label is among the k highest scores.

    Args:
        scores: 2-D array-like of per-class scores, one row per example.
        labels: 1-D array-like of integer class indices, one per row of ``scores``.
        k: number of top-scoring classes to consider per example (k >= 1).

    Returns:
        The top-k accuracy as a float between 0 and 1.
    """
    scores = np.asarray(scores)
    labels = np.asarray(labels)
    # argsort is ascending, so the last k columns hold the k best classes.
    top_k = np.argsort(scores, axis=-1)[:, -k:]
    return np.mean(np.any(top_k == labels[:, None], axis=-1))


# Load the per-class scores from paper.logits.json under OUTPUT_DIR.
with tf.io.gfile.GFile(os.path.join(OUTPUT_DIR, "paper.logits.json"), "r") as infile:
    probabilities = np.asarray(json.load(infile))

# Optional: keep a local copy of the scores.
# np.save("cs-xlnet-{}-valid.npy".format("+".join(FEATURES)), probabilities)

# Map each journal label to its class index.
journals = PaperProcessor(FEATURES).get_labels()
journal_to_idx = {journal: ii for ii, journal in enumerate(journals)}

# Collect gold labels from valid.jsonl, skipping records in which any selected
# feature is missing, not a string, or empty.
y_valid = []
with open(os.path.join(DATA_DIR, "valid.jsonl")) as valid_file:
    for line in valid_file:
        item = json.loads(line)
        has_all_features = all(
            isinstance(item.get(feature), str) and len(item[feature]) >= 1
            for feature in FEATURES
        )
        if has_all_features:
            y_valid.append(journal_to_idx[item['journal'].lower()])
y_valid = np.asarray(y_valid)

# A row-count mismatch would make every accuracy below meaningless, so fail loudly.
assert len(y_valid) == len(probabilities), (
    "label/prediction count mismatch: {} labels vs {} score rows".format(
        len(y_valid), len(probabilities)))

# Top-1, top-3, top-5 and top-10 accuracy on the validation split.
for k in (1, 3, 5, 10):
    print(top_k_accuracy(probabilities, y_valid, k))