### System Installation
Installing the coref tool from mandarjoshi90 along with tensorflow.




In [1]:
# Report the GPU model, driver version and total memory of this runtime as CSV.
!nvidia-smi --query-gpu=gpu_name,driver_version,memory.total --format=csv

name, driver_version, memory.total [MiB]
Tesla T4, 460.32.03, 15109 MiB


In [2]:
# Fetch the mandarjoshi90/coref repository into ./coref.
! git clone https://github.com/mandarjoshi90/coref.git

Cloning into 'coref'...
remote: Enumerating objects: 734, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 734 (delta 2), reused 0 (delta 0), pack-reused 728[K
Receiving objects: 100% (734/734), 4.17 MiB | 20.55 MiB/s, done.
Resolving deltas: 100% (441/441), done.


In [3]:
# Work inside the cloned repository from here on.
%cd coref
# Rewrite three version pins in requirements.txt:
# MarkupSafe 1.0 -> 1.1.1, scikit-learn 0.19.1 -> 0.21, scipy 1.0.0 -> 1.6.2.
! sed 's/MarkupSafe==1.0/MarkupSafe==1.1.1/; s/scikit-learn==0.19.1/scikit-learn==0.21/; s/scipy==1.0.0/scipy==1.6.2/' < requirements.txt > tmp
! mv tmp requirements.txt

# Delete the first match of `.D.GLIBCXX.USE.CXX11.ABI.0` on each line of
# setup_all.sh (every `.` matches any character), then make the script executable.
# NOTE(review): presumably this removes the -D_GLIBCXX_USE_CXX11_ABI=0 compiler
# flag -- confirm against setup_all.sh.
! sed 's/.D.GLIBCXX.USE.CXX11.ABI.0//' < setup_all.sh  > tmp
! mv tmp setup_all.sh 
! chmod u+x setup_all.sh 

/content/coref


In [4]:
# NOTE(review): the first line selects TensorFlow 2.x and the second uninstalls
# TensorFlow again, so the selection looks redundant -- confirm it is needed.
%tensorflow_version 2.x
! pip uninstall -y tensorflow
# Install the repository's (patched) requirements quietly; details go to install-log.txt.
! pip install -r requirements.txt --log install-log.txt -q
# Run the repository's setup script, made executable in the previous cell.
! ./setup_all.sh

Uninstalling tensorflow-2.4.1:
  Successfully uninstalled tensorflow-2.4.1
[K     |████████████████████████████████| 102kB 11.1MB/s 
[K     |████████████████████████████████| 1.2MB 40.9MB/s 
[K     |████████████████████████████████| 163kB 53.5MB/s 
[K     |████████████████████████████████| 6.6MB 27.9MB/s 
[K     |████████████████████████████████| 552kB 53.4MB/s 
[K     |████████████████████████████████| 61kB 9.2MB/s 
[K     |████████████████████████████████| 2.2MB 35.9MB/s 
[K     |████████████████████████████████| 266kB 55.5MB/s 
[K     |████████████████████████████████| 890kB 38.1MB/s 
[K     |████████████████████████████████| 133kB 57.1MB/s 
[K     |████████████████████████████████| 153kB 59.8MB/s 
[K     |████████████████████████████████| 51kB 7.2MB/s 
[K     |████████████████████████████████| 51kB 7.3MB/s 
[K     |████████████████████████████████| 92kB 13.6MB/s 
[K     |████████████████████████████████| 20.3MB 1.4MB/s 
[K     |████████████████████████████████| 2.1M

In [5]:
import nltk
# Fetch the Punkt data; a later cell loads 'tokenizers/punkt/english.pickle' from it.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### Specifying Input

Input and Model

In [6]:
from google.colab import drive
# Mount Google Drive; the input documents are read from under /content/gdrive.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [7]:
genre = "nw"
# Genre tag for the input text; tokenize() stores it as the document's 'doc_key'.
# The Ontonotes data for training the model contains text from several sources
# of very different styles. You need to specify the most suitable one out of:
# "bc": broadcast conversation
# "bn": broadcast news
# "mz": magazine
# "nw": newswire
# "pt": Bible text
# "tc": telephone conversation
# "wb": web data

model_name = "spanbert_base"
# The fine-tuned model to use. tokenize() also uses this name to find the
# matching section of experiments.conf when it looks up max_segment_len.
# Options are:
# bert_base
# spanbert_base
# bert_large
# spanbert_large

In [8]:
import os
# Export settings for the shell cells: $CHOSEN_MODEL is passed to the
# download, predict and evaluate commands further down.
# NOTE(review): data_dir is presumably read by download_pretrained.sh (its log
# shows the archive being saved under ./data) -- confirm.
os.environ['data_dir'] = "./data"
os.environ['CHOSEN_MODEL'] = model_name

Downloading the selected model.

In [None]:
# Download the pretrained weights for the model named in $CHOSEN_MODEL.
! ./download_pretrained.sh $CHOSEN_MODEL

Downloading spanbert_base
--2021-05-16 17:24:36--  http://nlp.cs.washington.edu/pair2vec/spanbert_base.tar.gz
Resolving nlp.cs.washington.edu (nlp.cs.washington.edu)... 128.208.3.120, 2607:4000:200:12::78
Connecting to nlp.cs.washington.edu (nlp.cs.washington.edu)|128.208.3.120|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1633726311 (1.5G) [application/x-gzip]
Saving to: ‘./data/spanbert_base.tar.gz’


Process the data to be in the required input format:

In [None]:
from bert import tokenization
import json


def _read_max_segment_len(conf_path, model):
  """Return the first ``max_segment_len`` value found after ``model``'s section header in ``conf_path``."""
  in_model_section = False
  with open(conf_path) as conf:
      for line in conf:
          if line.startswith(model):
              in_model_section = True
          elif in_model_section and line.strip().startswith("max_segment_len"):
              return int(line.strip().split()[-1])
  raise ValueError(
      "no max_segment_len found for model %r in %s" % (model, conf_path))


def tokenize(data):
  """Turn a list of sentences into the document dict that is dumped to JSON for predict.py.

  Args:
    data: iterable of sentence strings (one chunk of a document). The
      sentences are read from this argument, not from any notebook global.

  Returns:
    dict with the keys 'doc_key' (the notebook-level ``genre``), 'sentences'
    (lists of subtokens, each wrapped in [CLS] ... [SEP]), 'speakers',
    'clusters' (always empty), 'sentence_map' and 'subtoken_map' (one entry
    per subtoken, giving its sentence index and whitespace-word index).

  Raises:
    ValueError: if ``data`` holds no sentences, or experiments.conf has no
      ``max_segment_len`` entry after the section for ``model_name``.
  """
  sentences = list(data)
  if not sentences:
      raise ValueError("tokenize() needs at least one sentence")

  doc = {
      'doc_key': genre,
      'sentences': [["[CLS]"]],
      'speakers': [["[SPL]"]],
      'clusters': [],
      'sentence_map': [0],
      'subtoken_map': [0],
  }

  # Determine Max Segment
  max_segment = _read_max_segment_len('experiments.conf', model_name)

  tokenizer = tokenization.FullTokenizer(vocab_file="cased_config_vocab/vocab.txt", do_lower_case=False)
  subtoken_num = 0
  for sent_num, line in enumerate(sentences):
      raw_tokens = line.split()
      if not raw_tokens:
          # Whitespace-only sentence: nothing to align, and raw_tokens[0] would fail.
          continue
      tokens = tokenizer.tokenize(line)
      if len(tokens) + len(doc['sentences'][-1]) >= max_segment:
          # The sentence does not fit: close the current segment and open a new one.
          doc['sentences'][-1].append("[SEP]")
          doc['sentences'].append(["[CLS]"])
          doc['speakers'][-1].append("[SPL]")
          doc['speakers'].append(["[SPL]"])
          doc['sentence_map'].append(sent_num - 1)
          doc['subtoken_map'].append(subtoken_num - 1)
          doc['sentence_map'].append(sent_num)
          doc['subtoken_map'].append(subtoken_num)

      # ctoken holds the not-yet-consumed remainder of the current whitespace word.
      ctoken = raw_tokens[0]
      cpos = 0
      for token in tokens:
          doc['sentences'][-1].append(token)
          doc['speakers'][-1].append("-")
          doc['sentence_map'].append(sent_num)
          doc['subtoken_map'].append(subtoken_num)

          if token.startswith("##"):
              token = token[2:]
          if len(ctoken) == len(token):
              # This piece completes the word: advance to the next one.
              subtoken_num += 1
              cpos += 1
              if cpos < len(raw_tokens):
                  ctoken = raw_tokens[cpos]
          else:
              ctoken = ctoken[len(token):]

  # Close the final segment.
  # NOTE(review): the closing [SEP] is mapped to sent_num - 1, i.e. the sentence
  # before the last one, while subtoken_num - 1 is the last word. Kept unchanged --
  # confirm which value predict.py expects.
  doc['sentences'][-1].append("[SEP]")
  doc['speakers'][-1].append("[SPL]")
  doc['sentence_map'].append(sent_num - 1)
  doc['subtoken_map'].append(subtoken_num - 1)

  return doc

def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items; the last slice may be shorter."""
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

## Prediction & Evaluation


In [None]:
import os

# Folders for the tokenized inputs and the prediction outputs.
# exist_ok=True keeps this cell re-runnable once the folders exist.
os.makedirs("./data/in", exist_ok=True)
os.makedirs("./data/out", exist_ok=True)

In [None]:
import subprocess

# Punkt sentence splitter (its data is downloaded in the installation section).
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
filename = "/content/gdrive/MyDrive/CRAFT-txt/dev/17194222.txt"

text = []
data = []

# Split the document into sentences, one non-empty line (paragraph) at a time.
with open(filename) as f_in:
  for paragraph in f_in.read().split('\n'):
    if paragraph:
      text.extend(tokenizer.tokenize(paragraph))

# Predict on chunks of at most 100 sentences each.
chs = list(chunks(text, 100))
N = len(chs)

# `text` is rebound to the current chunk on every iteration.
for i, text in enumerate(chs):
  data.append(tokenize(text))

  file1 = "./data/in/" + str(i) + ".json"
  file2 = "./data/out/_" + str(i) + ".json"

  with open(file1, 'w') as out:
    json.dump(data[i], out, sort_keys=True)

  # Use the model chosen in the configuration cell rather than a hard-coded name.
  status = subprocess.call(['python', 'predict.py', model_name, file1, file2])
  if status != 0:
    print("predict.py exited with status", status, "for", file1)


In [None]:
import os
import subprocess

# Punkt sentence splitter (its data is downloaded in the installation section).
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

dev_dir = '/content/gdrive/MyDrive/CRAFT-txt/dev'

for file in [f for f in os.listdir(dev_dir) if f.endswith('.txt')]:

  text = []
  data = []

  # Read the file of THIS iteration; opening a leftover `filename` variable
  # would process the same document on every pass.
  with open(os.path.join(dev_dir, file)) as f_in:
    for paragraph in f_in.read().split('\n'):
      if paragraph:
        text.extend(tokenizer.tokenize(paragraph))

  # Predict on chunks of at most 100 sentences each.
  chs = list(chunks(text, 100))
  N = len(chs)

  stem = os.path.splitext(file)[0]

  # `text` is rebound to the current chunk on every iteration.
  for i, text in enumerate(chs):
    data.append(tokenize(text))

    file1 = "./data/in/"  + stem + "_" + str(i) + ".json"
    file2 = "./data/out/" + stem + "_" + str(i) + ".jsonlines"

    with open(file1, 'w') as out:
      json.dump(data[i], out, sort_keys=True)

    # Use the model chosen in the configuration cell rather than a hard-coded name.
    status = subprocess.call(['python', 'predict.py', model_name, file1, file2])
    if status != 0:
      print("predict.py exited with status", status, "for", file1)


In [None]:
# Run predict.py by hand on a single input file with the chosen model.
# NOTE(review): the cells visible in this notebook write inputs under ./data/in/
# and outputs under ./data/out/; nothing visible creates ./data/in_0.json --
# confirm these paths.
! gpu=0 python predict.py $CHOSEN_MODEL ./data/in_0.json ./data/out_0.json

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  from ._conv import register_converters as _register_converters
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
W0514 14:30:05.058650 139705853998976 deprecation_wrapper.py:119] From /content/coref/coref_ops.py:11: The name tf.NotDifferentiable is deprecated. Please use tf.no_gradient instead.

W0514 14:30:05.100141 139705853998976 deprecation_wrapper.py:119] From /content/coref/bert/optimization.py:87: The name tf.train.Opti

### Using Evaluate
Currently not working.

In [None]:
# Run the repository's evaluate.py for the chosen model (the note above marks this step as not working).
! gpu=0 python evaluate.py $CHOSEN_MODEL

## Output

In [None]:
# Load one prediction result; the context manager closes the file handle.
# NOTE(review): no cell visible in this notebook writes ./data/dev_2.jsonlines --
# confirm the path.
with open("./data/dev_2.jsonlines") as predictions_file:
    output = json.load(predictions_file)

# Flatten the nested output['sentences'] lists into one token list so that
# mention indices can slice it directly.
comb_text = [word for sentence in output['sentences'] for word in sentence]

def convert_mention(mention):
    """Translate a mention given as inclusive subtoken indices into ``((start, end), text)``.

    ``start`` and ``end`` come from ``output['subtoken_map']`` (``end`` is
    exclusive), and ``text`` is the covered slice of ``comb_text`` joined with
    spaces, with the " ##" word-piece continuations glued back together.
    """
    subtoken_to_word = output['subtoken_map']
    word_span = (subtoken_to_word[mention[0]], subtoken_to_word[mention[1]] + 1)
    pieces = comb_text[mention[0]:mention[1] + 1]
    surface = ' '.join(pieces).replace(" ##", "")
    return (word_span, surface)

# Mentions that belong to some predicted cluster, kept as hashable tuples.
seen = set()

print('Clusters:')
for cluster in output['predicted_clusters']:
    seen.update(tuple(mention) for mention in cluster)
    mapped = [convert_mention(mention) for mention in cluster]
    print(mapped, end=",\n")

# Then list the remaining top spans that did not end up in any cluster.
print('\nMentions:')
for mention in output['top_spans']:
    if tuple(mention) not in seen:
        print(convert_mention(mention), end=",\n")

Clusters:
[((0, 1), 'We'), ((24, 25), 'we'), ((90, 91), 'we'), ((251, 252), 'we'), ((397, 398), 'we'), ((622, 623), 'we'), ((698, 699), 'we'), ((723, 724), 'We'), ((930, 931), 'we'), ((993, 994), 'we'), ((1130, 1131), 'us'), ((1158, 1159), 'we'), ((1205, 1206), 'our'), ((1270, 1271), 'We'), ((1685, 1686), 'we'), ((1824, 1825), 'our'), ((1833, 1834), 'We'), ((1847, 1848), 'we'), ((1859, 1860), 'Our'), ((1920, 1921), 'we'), ((1947, 1948), 'we')],
[((54, 57), 'The mean CV'), ((67, 71), 'this relatively low CV')],
[((97, 114), 'the DBA / 1 and FVB / N strains at all four phases of CIA , including NC , PI , OA and CA'), ((126, 129), 'the two strains'), ((135, 137), 'both strains'), ((164, 167), 'these two strains'), ((207, 210), 'the two strains'), ((600, 604), 'these two gene clusters'), ((644, 646), 'both lists'), ((818, 821), 'the two strains'), ((1282, 1288), 'both genetically susceptible and resistant strains'), ((1295, 1297), 'both strains'), ((1446, 1449), 'the two strains'), ((1510,

In [None]:
# Bundle the contents of /content/coref/data/out into /content/out.zip.
!zip -r /content/out.zip /content/coref/data/out 

  adding: content/coref/data/out/ (stored 0%)
  adding: content/coref/data/out/17696610_3.json (deflated 84%)
  adding: content/coref/data/out/17244351_1.json (deflated 83%)
  adding: content/coref/data/out/17590087_0.json (deflated 82%)
  adding: content/coref/data/out/17590087_2.json (deflated 83%)
  adding: content/coref/data/out/17608565_1.json (deflated 83%)
  adding: content/coref/data/out/17425782_2.json (deflated 83%)
  adding: content/coref/data/out/17244351_0.json (deflated 82%)
  adding: content/coref/data/out/17194222_1.json (deflated 83%)
  adding: content/coref/data/out/17447844_2.json (deflated 83%)
  adding: content/coref/data/out/17425782_0.json (deflated 82%)
  adding: content/coref/data/out/17194222_2.json (deflated 83%)
  adding: content/coref/data/out/17425782_4.json (deflated 85%)
  adding: content/coref/data/out/17696610_1.json (deflated 83%)
  adding: content/coref/data/out/17608565_0.json (deflated 82%)
  adding: content/coref/data/out/17590087_4.json (deflated

In [None]:
# `files` is not imported anywhere else in the visible notebook; import it here
# so this cell also works on a fresh kernel.
from google.colab import files

files.download("/content/out.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>