In [None]:
import sys

import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import sparknlp
from pyspark.ml import Pipeline, PipelineModel
from pyspark.sql import SparkSession
from sparknlp.annotator import *
from sparknlp.base import *
from sparknlp.common import *
import pyspark.sql.functions as F
from sparknlp.training import CoNLL

In [None]:
# Define a function to process the data
def processText(text):
    """Turn a Series of abstracts into a table of tokens and POS tags.

    Parameters
    ----------
    text : pandas.Series
        String-valued Series; every element is joined into one document.

    Returns
    -------
    pandas.DataFrame
        One row per remaining token, with columns ``word`` and ``POS``.
    """
    # Join the abstracts with a space. Without a separator the last token of
    # one abstract is glued to the first token of the next (e.g.
    # "results.Background"), which can leave the "." unsplit by the tokenizer.
    joined = text.str.cat(sep=" ")
    # Tokenize the string object by word
    tokens = word_tokenize(joined)
    # Remove stop words
    # NOTE(review): the membership test is case-sensitive, so capitalized
    # words such as "The" are kept. Confirm how the model's training data was
    # prepared before changing this.
    stop_words = set(stopwords.words('english'))
    kept = [w for w in tokens if w not in stop_words]
    # Tag each remaining word with its part-of-speech (POS) tag
    tagged = pos_tag(kept)
    # Reshape the (word, tag) pairs into a dataframe
    return pd.DataFrame(tagged, columns=['word', 'POS'])

# Define a function that identifies where sentences begin/end
def identSentence(textDf):
    """Number the sentence each token row belongs to.

    A sentence ends at a row whose ``word`` is exactly ".". That row and every
    row since the previous "." share one id; ids start at 1.

    Tokens after the last "." (or all tokens when there is no "." at all) get
    the next id. Previously they were left as NaN, and because NaN != NaN each
    of them was then treated as the start of a new sentence.

    Parameters
    ----------
    textDf : pandas.DataFrame
        Must contain a ``word`` column. It is modified in place: a ``Sent_id``
        column is added or overwritten.

    Returns
    -------
    pandas.DataFrame
        The same ``textDf`` object, with the float ``Sent_id`` column filled
        for every row.
    """
    ends_sentence = textDf['word'].eq(".")
    # Number of sentence-ending periods strictly before each row; the "."
    # itself therefore still belongs to the sentence it terminates.
    periods_before = ends_sentence.cumsum().shift(fill_value=0)
    # Kept as float to match the dtype the previous NaN/bfill version produced.
    # Assigned positionally so a non-default index cannot cause misalignment.
    textDf['Sent_id'] = (periods_before + 1).astype(float).to_numpy()
    return textDf

# Define a function that reformats the data into CoNLL format
def conllFormatter(textDf, default_label="O"):
    """Serialize a token table to CoNLL-style text.

    Each token becomes one line ``"<word> <POS> <POS> <label>"`` (the POS tag
    is written twice). A blank line is emitted whenever ``Sent_id`` differs
    from the previous row's value.

    Parameters
    ----------
    textDf : pandas.DataFrame
        Needs ``Sent_id``, ``word`` and ``POS`` columns. A ``label`` column is
        used when present.
    default_label : str, optional
        Label written for every token when ``textDf`` has no ``label`` column
        (unlabeled text). Previously a missing column raised ``KeyError``.

    Returns
    -------
    str
        The document text, starting with the ``-DOCSTART-`` header line.
    """
    if 'label' in textDf.columns:
        labels = textDf['label']
    else:
        labels = [default_label] * len(textDf)

    # Collect pieces and join once instead of growing a string in the loop.
    pieces = ["-DOCSTART- -X- -X- -O-\n\n"]
    previous_sent = 0

    for sent, token, pos, label in zip(textDf['Sent_id'], textDf['word'], textDf['POS'], labels):
        # A change of sentence id starts a new sentence: add an empty line.
        if previous_sent != sent:
            pieces.append("\n")
        pieces.append("{} {} {} {}\n".format(token, pos, pos, label))
        previous_sent = sent
    return "".join(pieces)

In [None]:
# A readable "noNewAbstracts.csv" means there is nothing new to process, in
# which case the run stops here. If the marker is missing or unreadable the
# notebook continues with the abstracts file.
try:
    noNewAbstracts = pd.read_csv("noNewAbstracts.csv")
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
    # Bind the name explicitly so the check below does not depend on whatever
    # an earlier run left in the kernel's globals.
    noNewAbstracts = None
    print("New abstracts are available.")

if noNewAbstracts is not None:
    sys.exit(0)
PubMedAbstracts = pd.read_csv("PubMedAbstracts.csv", usecols=['pmid', 'title_abstract'])
text = PubMedAbstracts.title_abstract

In [None]:
# Each stage gets its own name so `text` (the raw abstracts) is not
# overwritten and this cell can be re-run on its own.
tokens_df = processText(text)
tokens_df = identSentence(tokens_df)
conll_text = conllFormatter(tokens_df)

# Output the processed data to a txt file. The encoding is explicit so the
# result does not depend on the platform default. The `with` block closes the
# file, so no separate close() is needed.
with open("conll.txt", "w", encoding="utf-8") as txtfile:
    txtfile.write(conll_text)

In [None]:
# Start the Spark session through Spark NLP (gpu=False is passed explicitly).
spark = sparknlp.start(gpu=False)
# Load the previously saved pipeline model from the local "NER_model/" directory.
ner_model = PipelineModel.load("NER_model/")

In [None]:
# Normalize the text
normalizer = (
    Normalizer()
    .setInputCols(['token'])
    .setOutputCol('normalized')
    .setLowercase(False)
    # Raw string: "\w", "\d" and "\s" are invalid escape sequences in a plain
    # string literal and trigger a warning on current Python versions. The
    # pattern passed to the annotator is unchanged.
    .setCleanupPatterns([r"[^\w\d\s]"])
)

# Get ELMo word embeddings
elmo = (
    ElmoEmbeddings.pretrained()
    .setInputCols("sentence", "normalized")
    .setOutputCol("elmo")
)

In [None]:
# Read the CoNLL file written earlier, then normalize and embed it.
# The original passed an undefined name `test` to the last two steps; each
# step now consumes the output of the previous one.
conll_data = CoNLL().readDataset(spark, 'conll.txt')
normalized_data = normalizer.fit(conll_data).transform(conll_data)
X = elmo.transform(normalized_data)

In [None]:
# Apply the loaded NER pipeline to X, the dataset produced by the
# normalizer and ELMo steps above.
predictions = ner_model.transform(X)

In [None]:
# Flatten the per-sentence annotation arrays into one row per token, then
# bring the result into pandas.
predictions = (
    predictions
    .select(F.explode(F.arrays_zip('token.result','label.result', 'ner.result', 'ner.metadata')).alias('cols'))
    .select(F.col('cols.0').alias('word'),
            F.col('cols.1').alias('Truth'),
            F.col('cols.2').alias('Prediction'),
            F.col('cols.3.confidence').alias('Confidence'))
    .dropna()
    # Keep tokens the model tagged as something other than "O". The original
    # filtered on `Truth`, but the text processed in this notebook is
    # unlabeled, so `Truth` carries no ground truth to filter on.
    # NOTE(review): if labeled data is ever fed through this notebook, confirm
    # which column should drive this filter.
    .filter('Prediction != "O"')
    .dropDuplicates(['word', 'Prediction'])
)

predictions = predictions.toPandas()

In [None]:
# Check if tools are already in database or if they are already deemed as false positives
previousTools = None
try:
    previousTools = pd.read_csv("previousTools.csv")
except (FileNotFoundError, pd.errors.EmptyDataError):
    # Only a missing or empty history counts as "no file". Any other read
    # error now propagates, so an existing history that merely failed to load
    # is not later overwritten as if it had never existed.
    print("No file found")

In [None]:
databaseTools = pd.read_csv("databaseTools.csv")

In [None]:
# Tokens predicted as tools ('T') that are not already in the database.
# Two fixes over the original:
#   * the second filter is applied to `potentialTools`, not `predictions`,
#     so the 'T' filter is no longer discarded;
#   * membership is tested against the values in databaseTools. Passing the
#     DataFrame itself to isin() iterates its column labels, not its cells.
# NOTE(review): every cell of databaseTools.csv is treated as a known tool
# name; a single column of names is assumed — TODO confirm.
database_words = databaseTools.to_numpy().ravel()
potentialTools = predictions[predictions.Prediction == 'T']
potentialTools = potentialTools[~potentialTools.word.isin(database_words)]

In [None]:
# Compare the predictions with what is already known (previous runs plus the
# database), report the new ones as `tools_output`, and keep previousTools.csv
# in the same column layout as `predictions` on every run.
# NOTE(review): `potentialTools` from the previous cell is not used here; this
# cell still works from `predictions` as the original did. Confirm which one
# is intended.
if previousTools is not None:
    if 'word' not in previousTools.columns:
        # Presumably a legacy single-column history file: treat its first
        # column as the word list — TODO confirm.
        previousTools = previousTools.rename(columns={previousTools.columns[0]: 'word'})
    # Always a 1-D Series of words. The original could hand isin() a
    # DataFrame, which compares against column labels rather than values.
    known_words = pd.concat(
        [previousTools['word'], pd.Series(databaseTools.to_numpy().ravel())],
        ignore_index=True,
    ).drop_duplicates()
    newtools = predictions[~predictions.word.isin(known_words)].dropna()
    # Append the full prediction rows so the file keeps a consistent schema.
    previousTools = pd.concat([previousTools, newtools], ignore_index=True)
    previousTools.to_csv("previousTools.csv", encoding='utf-8', index=False)
    tools_output = newtools
else:
    # First run: there is no history yet, so every prediction is recorded.
    predictions.to_csv("previousTools.csv", encoding="utf-8", index=False)
    # Defined on this path too so later code does not hit a NameError.
    tools_output = predictions