### Check Environment

In [None]:
# Show the GPU attached to this runtime (model, driver/CUDA version, memory usage).
!/opt/bin/nvidia-smi

Sat Apr 15 23:05:18 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Print the version of the `python` executable that the shell commands below will use.
!python -V

Python 3.9.16


### Import GitHub Project
https://github.com/David-Tong/transformers
This fork includes my changes to output logits for the training data, which are used to train the skim model.

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the later cells read and write under
# /content/drive/My Drive. force_remount=True asks for the mount to be redone.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import os
from shutil import rmtree

os.chdir('/content/drive/My Drive/git')

GIT_PATH='/content/drive/My Drive/git/transformers'

FORCE_UPDATE=False
if FORCE_UPDATE:
  if os.path.exists(GIT_PATH):
    rmtree(GIT_PATH)

# w266 uses v4.27.0 base
if not os.path.exists(GIT_PATH):
  !git clone https://github.com/David-Tong/transformers.git -b w266
else:
  !git fetch --all

fatal: not a git repository (or any parent up to mount point /content)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).


### Import Modules

In [None]:
# Move into the checkout: the install cell below uses paths relative to the
# repository root. GIT_PATH is reused so the location is defined in one place.
os.chdir(GIT_PATH)

In [None]:
%%capture

# install transforms locally
!pip install -e . 
!pip install -r examples/pytorch/question-answering/requirements.txt

### Improved Type 2 Skim Read Model
Write the eval_null_odds.json file for the training data to SPANBERT_SKIM_DIR.

In [None]:
# Make sure the working directory is the repository root before run_qa.py is
# launched through its relative path. GIT_PATH is reused instead of repeating the literal.
os.chdir(GIT_PATH)

In [None]:
# Run the fork's run_qa.py in evaluation mode (--do_eval, no --do_train) on squad_v2
# with the model stored in SAVED_MODEL, writing its output to SPANBERT_SKIM_DIR.
# --train_skim is presumably the option added by this fork to produce logits for the
# training data -- confirm against the fork's run_qa.py.
# NOTE(review): --save_strategy / --save_steps are passed although no training is requested.
!python examples/pytorch/question-answering/run_qa.py \
              --model_name_or_path SAVED_MODEL \
              --overwrite_cache True \
              --dataset_name squad_v2 \
              --do_eval \
              --train_skim True \
              --version_2_with_negative \
              --max_seq_length 384 \
              --doc_stride 128 \
              --per_device_eval_batch_size 32  \
              --output_dir SPANBERT_SKIM_DIR \
              --save_strategy "steps" \
              --save_steps 3000

04/15/2023 19:02:30 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_pri

### Train Skim Model
Use training data to train Skim Model

Open SPANBERT_SKIM_DIR/eval_null_odds.json and read each question's logit (the skim model input X).

Open DATA/train-v2.0.json and read each question's is_impossible flag, which says whether the question is answerable (the skim model label y).

In [None]:
# The relative paths used below (SPANBERT_SKIM_DIR, DATA, SAVED_MODEL) are resolved
# against the repository root. GIT_PATH is reused instead of repeating the literal.
os.chdir(GIT_PATH)

import json
from collections import defaultdict

import numpy as np

# Question id -> list that collects the skim model's training data for that
# question: the logit first, then the is_impossible label.
# Re-running this cell resets the mapping to empty.
skims = defaultdict(list)

In [None]:
### skim model X
# Start from an empty mapping: re-running this cell must not leave two logits in
# an entry, which would push the label out of position 1. After re-running this
# cell, re-run the label cell as well.
skims.clear()

# The context manager closes the file as soon as it has been parsed.
with open('SPANBERT_SKIM_DIR/eval_null_odds.json') as eval_null_odds_file:
  eval_null_odds = json.load(eval_null_odds_file)

# Position 0 of every entry is the value stored for that question id.
for key, null_odds in eval_null_odds.items():
  skims[key].append(null_odds)

In [None]:
### skim model Y
# The context manager closes the file as soon as it has been parsed.
with open('DATA/train-v2.0.json') as train_v20_file:
  train_v20 = json.load(train_v20_file)

for group in train_v20['data']:
  for paragraph in group['paragraphs']:
    for qas in paragraph['qas']:
      key = qas['id']
      # Only questions that already have a logit get a label (`in` does not
      # create defaultdict entries). The length check keeps exactly one label
      # per question when this cell is run more than once.
      if key in skims and len(skims[key]) < 2:
        skims[key].append(qas['is_impossible'])

In [None]:
### check skim model X and Y data
# List every question whose entry holds fewer than two values, i.e. one that
# does not have both an X value and a Y value.
incomplete = {qid: collected for qid, collected in skims.items() if len(collected) < 2}
for qid, collected in incomplete.items():
  print(qid, collected)

# Total number of questions collected.
print(len(skims))

130319


In [None]:
# Split the collected [logit, is_impossible] entries into features and labels.
pairs = list(skims.values())

# scikit-learn expects a 2-D feature matrix: one row per question and a single
# column holding the logit.
X = np.array([pair[0] for pair in pairs]).reshape(-1, 1)

# Labels stay 1-D with shape (n_samples,). Reshaping them into a column vector
# makes LogisticRegression.fit emit a DataConversionWarning (column_or_1d).
y = np.array([pair[1] for pair in pairs])

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression, constructed with no arguments (library defaults),
# on X and y as built in the previous cell.
skim_model = LogisticRegression()
skim_model.fit(X, y)

  y = column_or_1d(y, warn=True)


In [None]:
# Score on the same X and y the model was fitted on, so this is a training-set
# figure, not a held-out estimate.
skim_model.score(X, y)

0.9831950828351967

### Skim Predict for Validation Dataset

In [None]:
# The context manager closes the file as soon as it has been parsed.
with open('SAVED_MODEL/eval_null_odds.json') as validation_eval_null_odds_file:
  validation_eval_null_odds = json.load(validation_eval_null_odds_file)

# Predict for all questions in a single call instead of one predict() call per
# question; the keys are kept in a list so results can be matched back in order.
validation_keys = list(validation_eval_null_odds)
if validation_keys:
  validation_X = np.array([validation_eval_null_odds[key] for key in validation_keys]).reshape(-1, 1)
  validation_predictions = skim_model.predict(validation_X)
else:
  # predict() rejects an empty matrix; an empty input simply yields no entries.
  validation_predictions = []

# Encode the prediction as 1 (predicted unanswerable) or -1 (predicted answerable).
skim_null_odds = {
  key: 1 if unanswerable else -1
  for key, unanswerable in zip(validation_keys, validation_predictions)
}

In [None]:
# Spot-check the raw value stored for one validation question id.
print(validation_eval_null_odds['56ddde6b9a695914005b9628'])

-14.972944259643555


In [None]:
# Show the size and a small sample rather than the whole mapping, which floods
# the notebook output with one entry per question.
print(len(skim_null_odds), 'predictions; first 5:', list(skim_null_odds.items())[:5])

# Write the question id -> 1 / -1 mapping as indented JSON.
with open('SPANBERT_SKIM_DIR/skim_null_odds.json', 'w') as output:
  json.dump(skim_null_odds, output, indent=4)

{'56ddde6b9a695914005b9628': -1, '56ddde6b9a695914005b9629': -1, '56ddde6b9a695914005b962a': -1, '56ddde6b9a695914005b962b': -1, '56ddde6b9a695914005b962c': -1, '5ad39d53604f3c001a3fe8d1': 1, '5ad39d53604f3c001a3fe8d2': 1, '5ad39d53604f3c001a3fe8d3': 1, '5ad39d53604f3c001a3fe8d4': 1, '56dddf4066d3e219004dad5f': -1, '56dddf4066d3e219004dad60': -1, '56dddf4066d3e219004dad61': -1, '5ad3a266604f3c001a3fea27': 1, '5ad3a266604f3c001a3fea28': 1, '5ad3a266604f3c001a3fea29': 1, '5ad3a266604f3c001a3fea2a': -1, '5ad3a266604f3c001a3fea2b': 1, '56dde0379a695914005b9636': -1, '56dde0379a695914005b9637': -1, '5ad3ab70604f3c001a3feb89': 1, '5ad3ab70604f3c001a3feb8a': 1, '56dde0ba66d3e219004dad75': -1, '56dde0ba66d3e219004dad76': -1, '56dde0ba66d3e219004dad77': -1, '5ad3ad61604f3c001a3fec0d': 1, '5ad3ad61604f3c001a3fec0e': 1, '5ad3ad61604f3c001a3fec0f': 1, '5ad3ad61604f3c001a3fec10': -1, '56dde1d966d3e219004dad8d': 1, '5ad3ae14604f3c001a3fec39': 1, '5ad3ae14604f3c001a3fec3a': 1, '56dde27d9a695914005b96