In [1]:
# !pip install -q ipymarkup

# Import

In [1]:
import os
import re
import spacy
from spacy.tokens import Doc, Span
from spacy import displacy

nlp = spacy.load("en_core_web_lg")

In [2]:
path = '../../'
filename = "C07K.txt"
# Read the whole corpus in one go; the context manager closes the file handle
# as soon as the text is in memory instead of leaving it to the garbage collector.
with open(os.path.join(path, filename)) as peptents_file:
    peptents_text = peptents_file.read().strip()
# Patents are separated by two blank lines (three consecutive newlines).
peptents_list = peptents_text.split("\n\n\n")  # a list of peptide patents!
# If rereading peptents, delete already_done so that the list can be cleaned again.
try:
    del already_done
except NameError:
    # Only the "flag was never set" case is expected here; anything else should surface.
    print("Already done doesn't exist anyway")

def check_flawed_peptents(peptents_list):
    """
    Checks that each peptent contains exactly one description and one claim section.

    A section is recognised by its literal marker: "_____d:" for a description and
    "_____c:" for a claim. Missing or repeated markers suggest that the corpus was
    split with a wrong delimiter.

    Parameters:
        peptents_list: A list of peptents (strings)
    Returns:
        descriptionless_peptents, claimless_peptents, doubles: lists of peptent indices which
            - do not contain a description
            - do not contain a claim
            - contain more than one description or claim (each index listed once,
              even when both sections are repeated)
    """
    claims_string = "_____c:"
    descriptions_string = "_____d:"
    descriptionless_peptents = []
    claimless_peptents = []
    doubles = []

    for i, peptent in enumerate(peptents_list):
        # The markers are plain text, so str.count gives the number of
        # non-overlapping occurrences without going through the regex engine.
        description_count = peptent.count(descriptions_string)
        claim_count = peptent.count(claims_string)
        if description_count == 0:
            descriptionless_peptents.append(i)
        if claim_count == 0:
            claimless_peptents.append(i)
        if description_count > 1:
            print("Found stupid double description peptent at %d" %i)
        if claim_count > 1:
            print("Found stupid double claim peptent at %d" %i)
        # Record the index a single time so len(doubles) counts peptents, not markers.
        if description_count > 1 or claim_count > 1:
            doubles.append(i)
    return descriptionless_peptents, claimless_peptents, doubles

# Run the sanity check on the freshly loaded corpus and report what it found.
flaw_report = check_flawed_peptents(peptents_list)
descriptionless_peptents, claimless_peptents, doubles = flaw_report
print("There are %d peptents with more than one description or claim" %len(doubles))
# Each heading is followed by the list of offending indices.
for heading, indices in (
    ("Peptent indices without a description:", descriptionless_peptents),
    ("Peptent indices without a claim:", claimless_peptents),
):
    print(heading)
    print(indices)

Already done doesn't exist anyway
There are 0 peptents with more than one description or claim
Peptent indices without a description:
[180, 1132, 1481, 1482, 1483, 1987]
Peptent indices without a claim:
[180, 1132, 1480, 1481, 1482, 1987]


In [3]:
try:
    print("Have the peptents been cleaned already?: %s" %already_done)
except NameError:
    # already_done is undefined, i.e. this cell has not run since the peptents
    # were (re)loaded. Only NameError is caught so that an unrelated failure
    # can never trigger the destructive cleanup below.
    print("Cleaning the peptents.")
    #after some verification, we found the following treatment:
    #drop 180, 1132: empty
    #merge 1483 into 1480: the former is the continuation of the latter
    #drop 1483: merged into 1480
    #drop 1481, 1482, 1987: empty or meaningless code
    peptents_list[1480] += peptents_list[1483]
    #drop the elements from the highest index down so earlier pops do not shift later ones
    for flawed_index in (1987, 1483, 1482, 1481, 1132, 180):
        peptents_list.pop(flawed_index)

#initialize the variable that marks the list as cleaned, so that re-running
#this cell does not drop further entries
already_done = True

Cleaning the peptents.


In [4]:
# Re-run the sanity check after cleaning and total the flagged indices of all three kinds.
descriptionless_peptents, claimless_peptents, doubles = check_flawed_peptents(peptents_list)
count_flawed = sum(map(len, (descriptionless_peptents, claimless_peptents, doubles)))
print("Flawed peptents: %d" %count_flawed)

Flawed peptents: 0


# NER 🤗

### ʕ•́ᴥ•̀ʔっ♡ ktgiahieu/RoBERTa-large-PM-M3-Voc-hf-finetuned-ner-combine-filtered

In [48]:
from transformers import (AutoModelForTokenClassification, 
                          AutoTokenizer, 
                          pipeline,
                          )

# Checkpoint identifier passed to from_pretrained for both the model and its tokenizer.
model_checkpoint = "ktgiahieu/RoBERTa-large-PM-M3-Voc-hf-finetuned-ner-combine-filtered"
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# NER pipeline built from the model and tokenizer loaded above.
model_pipeline = pipeline(task="ner", model=model, tokenizer=tokenizer)

# Read the corpus as individual lines; the context manager closes the file
# handle right after reading instead of leaving it open.
with open('../../C07K.txt') as c07k_file:
    c07k = c07k_file.read().strip()
patent_lines = c07k.split('\n')

In [52]:
# Take a four-line excerpt of the corpus and run the NER pipeline on it.
example_text = '\n'.join(patent_lines[10004:10008])
orig_entities = model_pipeline(example_text)

# Human-readable label names, indexed by the integer that follows the
# six-character prefix of each predicted label (presumably "LABEL_" - TODO confirm).
label_list = ['B-Term','B-Lab_value','B-Chemical','I-Term','I-Lab_value','I-Chemical','O']

# Rename every prediction in place and keep only those that are not 'O'.
entities = []
for prediction in orig_entities:
    label_index = int(prediction['entity'][6:])
    prediction['entity'] = label_list[label_index]
    if prediction['entity'] != 'O':
        entities.append(prediction)

# Show the results with highlighting

from ipymarkup import show_span_ascii_markup, show_dep_ascii_markup, show_span_box_markup

# Collapse consecutive predictions into [start, end, type] spans for display.
# A prediction extends the current span when it has the same type (label without
# its two-character "B-"/"I-" prefix) as the previous prediction and starts
# either exactly at, or one character after, the previous prediction's end.
entities_markup = []
prev_entity = None
for entity in entities:
    span_type = entity['entity'][2:]
    continues_span = (
        prev_entity is not None
        and (
            entity['start'] == prev_entity['end']
            or entity['start'] == prev_entity['end'] + 1
        )
        and span_type == prev_entity['entity'][2:]
    )
    if continues_span:
        # Stretch the most recent span to cover this prediction as well.
        entities_markup[-1][1] = entity['end']
    else:
        entities_markup.append([entity['start'], entity['end'], span_type])
    prev_entity = entity

show_span_box_markup(example_text, entities_markup)