# Starting out with GENRE
By Carlos Gemmell Edited by Lewis Grimmer

GENRE is a seq2seq model capable of Entity Linking. It does this by autoregressivley generating the input text with extity annotations for both the spans and constrained links to existing wikipedia articles. The model can be run on CPU or GPU.

In [1]:
!git clone https://github.com/facebookresearch/GENRE.git
!pip install torch transformers
!wget http://dl.fbaipublicfiles.com/GENRE/hf_e2e_entity_linking_wiki_abs.tar.gz -P ./GENRE/
!unzip ./GENRE/hf_e2e_entity_linking_wiki_abs.tar.gz
!wget http://dl.fbaipublicfiles.com/GENRE/kilt_titles_trie_dict.pkl -P ./GENRE/

Cloning into 'GENRE'...
remote: Enumerating objects: 360, done.[K
remote: Counting objects: 100% (348/348), done.[K
remote: Compressing objects: 100% (216/216), done.[K
remote: Total 360 (delta 209), reused 233 (delta 128), pack-reused 12[K
Receiving objects: 100% (360/360), 10.96 MiB | 9.86 MiB/s, done.
Resolving deltas: 100% (209/209), done.
Collecting transformers
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 4.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 53.0 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 85.5 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0

In [2]:
import sys
sys.path.append("./GENRE/")
import pickle
from genre.trie import Trie

In [3]:
with open("./GENRE/kilt_titles_trie_dict.pkl", "rb") as f:
    trie = Trie.load_from_dict(pickle.load(f))

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
from genre.entity_linking import get_end_to_end_prefix_allowed_tokens_fn_hf as get_prefix_allowed_tokens_fn
from genre.utils import get_entity_spans_hf as get_entity_spans
from genre.hf_model import GENRE
from pathlib import Path

pretrained_path = Path('/content/drive/MyDrive/4th year project/hf_e2e_entity_linking_wiki_abs')

model = GENRE.from_pretrained(pretrained_path).eval()

In [6]:
import torch
torch.cuda.empty_cache()
model = model.to('cuda')

In [7]:
model.device

device(type='cuda', index=0)

In [8]:
!nvidia-smi

Fri Nov 19 13:26:51 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    32W / 250W |   2681MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [9]:
import os, json
samples_path = '/content/drive/MyDrive/4th year project/samples'

files = os.listdir(samples_path)

samples = []

for f in files:
  with open(samples_path+'/'+f) as f:
    samples.append(json.load(f))

In [17]:
import re

#Class to hold the pre-trained model and helper functions
class GENRE_model():

  def __init__(self, model):
    self.model = model

  def convert_sentences(self, sentences):
      prefix_allowed_tokens_fn = get_prefix_allowed_tokens_fn(model, sentences)
      converted_sentences = model.sample(sentences, prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, num_return_sequences=1, num_beams=5)
      return [s[0]['text'] for s in converted_sentences]

  def steps_to_sentences(self, steps):
    return [s for step in steps for s in step.split('.') if s]

  def get_mentions_from_steps(self, steps):
    sentences = self.steps_to_sentences(steps)
    
    converted_sentences = self.convert_sentences(sentences)

    pattern = r'\[ .+? \]'

    return [mention[2:-2] for sentence in converted_sentences for mention in re.findall(pattern, sentence)]

In [18]:
genre_model = GENRE_model(model)

In [19]:
steps = "Obama Einstein"
genre_model.get_mentions_from_steps(steps)

[]