# Starting out with GENRE
By Carlos Gemmell Edited by Lewis Grimmer

GENRE is a seq2seq model capable of Entity Linking. It does this by autoregressivley generating the input text with extity annotations for both the spans and constrained links to existing wikipedia articles. The model can be run on CPU or GPU.

In [1]:
!git clone https://github.com/facebookresearch/GENRE.git
!pip install torch transformers
!wget http://dl.fbaipublicfiles.com/GENRE/hf_e2e_entity_linking_wiki_abs.tar.gz -P ./GENRE/
!unzip ./GENRE/hf_e2e_entity_linking_wiki_abs.tar.gz
!wget http://dl.fbaipublicfiles.com/GENRE/kilt_titles_trie_dict.pkl -P ./GENRE/

fatal: destination path 'GENRE' already exists and is not an empty directory.
--2022-03-16 17:05:50--  http://dl.fbaipublicfiles.com/GENRE/hf_e2e_entity_linking_wiki_abs.tar.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.74.142, 104.22.75.142, 172.67.9.4, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.74.142|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 958456496 (914M) [application/gzip]
Saving to: ‘./GENRE/hf_e2e_entity_linking_wiki_abs.tar.gz.1’


2022-03-16 17:06:12 (44.0 MB/s) - ‘./GENRE/hf_e2e_entity_linking_wiki_abs.tar.gz.1’ saved [958456496/958456496]

Archive:  ./GENRE/hf_e2e_entity_linking_wiki_abs.tar.gz
  End-of-central-directory signature not found.  Either this file is not
  a zipfile, or it constitutes one disk of a multi-part archive.  In the
  latter case the central directory and zipfile comment will be found on
  the last disk(s) of this archive.
unzip:  cannot find zipfile directory in one

In [2]:
import sys
sys.path.append("./GENRE/")
import pickle
from genre.trie import Trie

In [3]:
with open("./GENRE/kilt_titles_trie_dict.pkl", "rb") as f:
    trie = Trie.load_from_dict(pickle.load(f))

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
from genre.entity_linking import get_end_to_end_prefix_allowed_tokens_fn_hf as get_prefix_allowed_tokens_fn
from genre.utils import get_entity_spans_hf as get_entity_spans
from genre.hf_model import GENRE
from pathlib import Path

pretrained_path = Path('/content/drive/MyDrive/4th year project/hf_e2e_entity_linking_wiki_abs')

model = GENRE.from_pretrained(pretrained_path).eval()

In [6]:
import torch
torch.cuda.empty_cache()
model = model.to('cuda')

In [7]:
model.device

device(type='cuda', index=0)

In [8]:
!nvidia-smi

Wed Mar 16 17:07:02 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P0    33W / 250W |   2561MiB / 16280MiB |      6%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [9]:
import re

#Class to hold the pre-trained model and helper functions
class GENREModel():

  def __init__(self, model):
    self.model = model

  def convert_sentences(self, sentences):
      prefix_allowed_tokens_fn = get_prefix_allowed_tokens_fn(model, sentences)
      converted_sentences = model.sample(sentences, prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, num_return_sequences=1, num_beams=5)
      return [s[0]['text'] for s in converted_sentences]

  def steps_to_sentences(self, steps):
    return [s for step in steps for s in step.split('.') if s]

  def get_mentions_from_steps(self, steps):
    sentences = self.steps_to_sentences(steps)
    
    converted_sentences = self.convert_sentences(sentences)

    pattern = r'\[ .+? \]'

    return [mention[2:-2] for sentence in converted_sentences for mention in re.findall(pattern, sentence)]

In [15]:
genre_model = GENREModel(model)

sentences = '''Apply the lash glue to the outer seam of the 
              eyelash strip with an applicator or small brush.'''

genre_model.get_mentions_from_steps(sentences)

[]