In [1]:
import sparknlp
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from sparknlp.annotator import *
from sparknlp.base import *
from sparknlp.common import *
import pyspark.sql.functions as F
from sparknlp.training import CoNLL

In [2]:
# Start the Spark NLP session with the GPU flag turned off; the resulting
# `spark` handle is what the CoNLL reader is given further down.
spark = sparknlp.start(gpu=False)

In [3]:
# Load the previously saved pipeline model from the relative directory
# "NER_model/" (resolved against the notebook's working directory).
# NOTE(review): PipelineModel is not imported by name in the import cell;
# presumably it arrives through one of the star imports -- TODO confirm.
ner_model = PipelineModel.load("NER_model/")

In [4]:
# Normalize the text: reads the 'token' column and writes 'normalized'.
# Lower-casing is disabled, and the cleanup pattern targets every character
# that is not a word character, a digit, or whitespace.
normalizer = (
    Normalizer()
    .setInputCols(['token'])
    .setOutputCol('normalized')
    .setLowercase(False)
    # Raw string on purpose: "\w", "\d" and "\s" are not valid Python escape
    # sequences, so the non-raw literal emits an invalid-escape warning on
    # current Python versions. The regex text itself is unchanged.
    .setCleanupPatterns([r"[^\w\d\s]"])
)

# ELMo word embeddings: fetch the default pretrained model first, then wire
# it to the 'sentence' and 'normalized' columns and name its output 'elmo'.
elmo = ElmoEmbeddings.pretrained()
elmo = elmo.setInputCols("sentence", "normalized").setOutputCol("elmo")

elmo download started this may take some time.
Approximate size to download 334.1 MB
[OK!]


In [5]:
# Read the CoNLL-formatted test file, run it through the normalizer, then
# attach ELMo embeddings. Each stage gets its own name; only the final
# frame is bound to `test`, which the following cells consume.
conll_rows = CoNLL().readDataset(spark, 'Data/testingTextProcessed.txt')
normalized_rows = normalizer.fit(conll_rows).transform(conll_rows)
test = elmo.transform(normalized_rows)
test.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|          normalized|                elmo|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|SECIMTools ( Sout...|[{document, 0, 12...|[{document, 0, 12...|[{token, 0, 9, SE...|[{pos, 0, 9, NNP,...|[{named_entity, 0...|[{token, 0, 9, SE...|[{word_embeddings...|
|SECIMTools levera...|[{document, 0, 14...|[{document, 0, 14...|[{token, 0, 9, SE...|[{pos, 0, 9, NNP,...|[{named_entity, 0...|[{token, 0, 9, SE...|[{word_embeddings...|
|Our method , name...|[{document, 0, 13...|[{document, 0, 13...|[{token, 0, 2, Ou...|[{pos, 0, 2, PRP$...|[{named_entity, 0...|[{token, 0, 2, Ou...|[{

In [9]:
# Apply the loaded NER pipeline model to the prepared test DataFrame.
# The next cell reads the 'ner' column from the result, alongside the
# 'token' and 'label' columns already present in `test`.
predictions = ner_model.transform(test)

In [10]:
# Flatten the per-sentence annotation arrays into one row per token.

# Zip the token text, gold label, predicted tag and prediction metadata
# arrays element-wise so that each position becomes one struct.
zipped_annotations = F.arrays_zip(
    'token.result', 'label.result', 'ner.result', 'ner.metadata'
)

# One row per zipped element, held in a single struct column named 'cols'.
exploded = predictions.select(F.explode(zipped_annotations).alias('cols'))

# Pull the struct fields out by position and give them readable names;
# 'Confidence' is the 'confidence' entry of the metadata field.
per_token = exploded.select(
    F.col('cols.0').alias('word'),
    F.col('cols.1').alias('Truth'),
    F.col('cols.2').alias('Prediction'),
    F.col('cols.3.confidence').alias('Confidence'),
)

# Drop rows containing nulls, keep only tokens whose gold label is not "O",
# and keep a single row per (word, Prediction) pair.
# NOTE(review): nothing here controls which row's Truth/Confidence survives
# when a (word, Prediction) pair occurs more than once.
non_null = per_token.dropna()
labelled = non_null.filter('Truth != "O"')
predictions = labelled.dropDuplicates(['word', 'Prediction'])

In [21]:
# Collect the flattened Spark result into a pandas DataFrame for inspection.
# NOTE(review): this rebinds `predictions` from a Spark DataFrame to a pandas
# one, so re-running this cell (or the one before it) without first re-running
# the transform cell will fail.
predictions = predictions.toPandas()

In [22]:
# Show only the rows whose predicted tag is 'T'.
predictions.loc[predictions['Prediction'] == 'T']

Unnamed: 0,word,Truth,Prediction,Confidence
0,INMEX,T,T,0.9888
2,DrugViz,T,T,0.9784
4,jmzTab,T,T,0.8094
5,GridMass,T,T,0.8908
6,MassyTools,T,T,0.9008
7,OpenMS,T,T,0.9998
9,Escher,T,T,0.9971
10,ProfileDB,T,T,0.9924
13,Pathos,T,T,0.9715
16,MetaboQuant,T,T,0.9949
