In [1]:
"""
End-to-end RDR run over entity tokens in WikiNeural (English).

Usage
-----
python -m scripts.run_rdr_llama_ent --layer 27 --k 8 --t 10 --split test_en
"""

import sys
# Put the project checkout at the front of the module search path; the
# `models`, `utils` and `data` imports below presumably resolve from there
# (TODO confirm).  NOTE(review): this is an absolute, machine-specific path.
sys.path.insert(0, '/data8/baek/dehallu/RDR')
import os
# Expose only GPU index 2 to this process.  The variable is set before the
# project imports below, presumably because the CUDA runtime reads it once at
# initialisation (TODO confirm) -- keep this ordering.
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

import argparse
import numpy as np

from models.language.llama_rdr          import LlamaRDR
from models.language.config_utils       import getconfigs_entities, config_dist
from models.rdr                         import RDR
from utils.visualize_text           import visualize_entity
from data.nlp import wikineural_ent, hallu

In [2]:
class Args:
    """Notebook stand-in for the command-line options of the RDR run.

    Every option is a keyword argument whose default is the value this
    notebook uses, so ``Args()`` yields the usual configuration while a single
    option can be changed at construction time, e.g. ``Args(layer=15)``.

    Args:
        layer: Target layer, passed to ``LlamaRDR(target_layer=...)``.
        k: Passed to ``RDR.selection(k=...)``; also the number of nearest
            neighbours printed.
        t: Passed to ``RDR.selection(t=...)``.
        split: Split name handed to the dataset module's ``get_split``.
        batch_size: Batch size handed to ``getconfigs_entities``.
        dataset: Dataset selector; the loading cell accepts
            ``"wikineural_ent"`` or ``"hallucination"``.
    """

    def __init__(
        self,
        layer: int = 27,
        k: int = 8,
        t: int = 10,
        split: str = "test_en",
        batch_size: int = 8,
        dataset: str = "hallucination",
    ) -> None:
        self.layer = layer
        self.k = k
        self.t = t
        self.split = split
        self.batch_size = batch_size
        self.dataset = dataset

# Default configuration shared by the cells below.
args = Args()

In [3]:

# 0  Load dataset & model
# Resolve the data module for the configured dataset name first, then load
# the requested split from it.
if args.dataset == "hallucination":
    dataset_module = hallu
elif args.dataset == "wikineural_ent":
    dataset_module = wikineural_ent
else:
    raise ValueError(f"Unknown dataset: {args.dataset}")

dataset = dataset_module.get_split(args.split)


In [4]:
# Override the target layer (Args sets it to 27) before the model is built.
args.layer = 15

In [5]:
# Instantiate the LlamaRDR wrapper for the configured target layer.
model = LlamaRDR(
    target_layer=args.layer,
    capture_seq_pos=None,  # keep full seq
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [6]:
# 1  Pick the target instance
# A fixed index keeps the run reproducible while debugging; set DEBUG_TARGET
# to None to draw a random sample from the dataset instead.
DEBUG_TARGET = 360  # for debugging
if DEBUG_TARGET is None:
    rand_target = int(np.random.randint(len(dataset)))
else:
    rand_target = DEBUG_TARGET

# Look the sample up once and reuse it for both report lines.
target_sample = dataset[rand_target]
print(f"Target sample index: {rand_target}  | entity = {target_sample['entity']}")
print(f"Target sample: {target_sample['sentence']}")

Target sample index: 360  | entity = Seoul
Target sample: The bustling capital of South Korea is Seoul.


In [7]:

# 2  Gather matrices
# Run the dataset through the model in batches; the call returns a pair that
# is unpacked as (feats, configs).
feats, configs = getconfigs_entities(
    dataset,
    model,
    batch_size=args.batch_size,
)

Extracting features and configs: 100%|██████████| 51/51 [00:03<00:00, 15.96it/s]


In [12]:

# 3  Distances
# Sort sample indices by the value config_dist returns for the target index
# (ascending), then report the first k of them.
dists = config_dist(configs, rand_target)
neighbours = np.argsort(dists)
nearest_k = neighbours[:args.k]
print(f"Nearest neighbours: {nearest_k}")

Nearest neighbours: [360 342 353 292 331 351 270 268 339 348 332 355 284 316 264]


In [None]:
# Override k (Args sets 8) and set t to 10 (the same value Args sets); both
# are passed to rdr.selection further down.
# NOTE(review): the recorded "Nearest neighbours" output lists 15 indices,
# which matches neither k = 8 nor k = 5 -- the saved outputs presumably come
# from a different setting; restart the kernel and run all cells to confirm.
args.k = 5
args.t = 10

In [18]:
# Sanity check: show the length of the first config entry next to the model's
# hidden size so the two can be compared by eye.
config_width = len(configs[0])
print(config_width)
print(model.hidden_size)

4096
4096


In [13]:
# 4  RDR
# Build the RDR object from the sorted neighbour indices and the configs,
# then run selection with the configured k and t.
rdr = RDR(neighbours, configs)
selection = rdr.selection(k=args.k, t=args.t)
samp, neurons, states = selection

15 decision boundaries are used for constructing relaxed decision region.
The number of samples included in the Relaxed Decision Region: 15


In [11]:
# 5  Visualise
# Print the selected samples via the project's entity visualiser.
visualize_entity(
    samp,
    dataset,
    print_max=100,
)

The capital city of Canada is [1;34mOttawa[0m.
The country known as the Land of the Rising Sun is [1;34mJapan[0m.
The first book of the Bible is [1;34mGenesis[0m.
The fictional detective who lives at 221B Baker Street is [1;34mSherlock Holmes[0m.
The largest planet in our Solar System is [1;34mJupiter[0m.
The capital of the U.S. state of Georgia is [1;34mAtlanta[0m.
The Roman goddess of love and beauty is [1;34mVenus[0m.
The city where the Golden Gate Bridge is found is [1;34mSan Francisco[0m.
The bustling capital of South Korea is [1;34mSeoul[0m.
The largest moon of Jupiter is [1;34mGanymede[0m.
The capital city of Australia is [1;34mCanberra[0m.
The city where the Colosseum is located is [1;34mRome[0m.
The Italian city famous for its canals is [1;34mVenice[0m.
The basketball player known as His Airness is [1;34mMichael Jordan[0m.
The Greek god of the sea is [1;34mPoseidon[0m.
