# Imports

In [1]:
import sparknlp
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from sparknlp.annotator import *
from sparknlp.base import *
from sparknlp.common import *
import pyspark.sql.functions as F
from sparknlp.training import CoNLL

# Start Spark

In [2]:
# Start a Spark session through Spark NLP with GPU support disabled.
spark = sparknlp.start(gpu=False)

# Load the trained model

In [3]:
# Load the previously saved pipeline model from the relative path "NER_model/".
ner_model = PipelineModel.load("NER_model/")

# Define Processing Pipeline

In [31]:
# Normalize the text: strip commas, periods and whitespace characters from
# each token while keeping the original casing.
# The cleanup pattern is a raw string so that `\.` and `\s` reach the regex
# engine verbatim instead of being parsed as (invalid) Python escape sequences.
# To remove all punctuation instead, use a pattern such as r"[^\w\d\s]".
normalizer = (
    Normalizer()
    .setInputCols(['token'])
    .setOutputCol('normalized')
    .setLowercase(False)
    .setCleanupPatterns([r"[,\.\s]"])
)

# ELMo word embeddings from the default pretrained model, computed from the
# "sentence" and "normalized" columns and written to the "elmo" column.
elmo = (
    ElmoEmbeddings
    .pretrained()
    .setOutputCol("elmo")
    .setInputCols(["sentence", "normalized"])
)

elmo download started this may take some time.
Approximate size to download 334.1 MB
[OK!]


# Load and Process Test Data

In [32]:
# Read the CoNLL-formatted test set, then run both preprocessing stages
# (normalizer first, ELMo second) over it as a single Spark ML pipeline.
test = CoNLL().readDataset(
    spark,
    '../../../Natural Language Processing/Data/testingTextProcessed.txt',
)
test = Pipeline(stages=[normalizer, elmo]).fit(test).transform(test)
test.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|          normalized|                elmo|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Metabolite identi...|[{document, 0, 71...|[{document, 0, 71...|[{token, 0, 9, Me...|[{pos, 0, 9, NNP,...|[{named_entity, 0...|[{token, 0, 9, Me...|[{word_embeddings...|
|Metabolite identi...|[{document, 0, 79...|[{document, 0, 79...|[{token, 0, 9, Me...|[{pos, 0, 9, NNP,...|[{named_entity, 0...|[{token, 0, 9, Me...|[{word_embeddings...|
|Various computati...|[{document, 0, 86...|[{document, 0, 86...|[{token, 0, 6, Va...|[{pos, 0, 6, JJ, ...|[{named_entity, 0...|[{token, 0, 6, Va...|[{

# Extract an Abstract

In [33]:
# Number the rows 1..N in their current order and keep rows 35 through 60
# (both ends inclusive) as the sample abstract.
# NOTE: the window has no partitionBy, so Spark evaluates it on one partition.
test_sample = (
    test
    .withColumn(
        "id",
        F.row_number().over(Window.orderBy(F.monotonically_increasing_id())),
    )
    .filter(F.col("id").between(35, 60))
)

In [34]:
# Print the full, untruncated text of every sampled row (up to 100 rows).
test_sample.select("text").show(n=100, truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                                                                                                                           |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|contact @ workflow4metabolomics.org.MRMkit : Automated Data Processing Large-Scale Targeted Metabolomics Analysis .                                                                                                            |
|MRMkit open-source software package designed automated processing large-scale targeted mass spe

# Apply the Model to the Extracted Abstract

In [35]:
# Run the loaded NER pipeline on the sampled rows only.
predict = ner_model.transform(test_sample)

# Evaluate Prediction

In [36]:
# Print the untruncated `ner` column for every sampled row (up to 100 rows).
predict.select("ner").show(n=100, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Apply to all Test Data

In [37]:
# Run the loaded NER pipeline on the full preprocessed test set.
predictions = ner_model.transform(test)

In [38]:
# Flatten the model output to one row per token and attach the confidence of
# the higher-scoring label.
# The chain starts from `ner_model.transform(test)` rather than from
# `predictions` itself, so this cell can be re-run without failing on the
# already-flattened frame.
predictions = (
    ner_model.transform(test)
    # Zip the parallel per-sentence arrays and explode them into token rows.
    .select(F.explode(F.arrays_zip('token.result','label.result', 'ner.result', 'ner.metadata')).alias('cols'))
    # Annotation metadata values are strings, so the scores are cast to double:
    # compared as strings, a value such as "1.0E-4" sorts above "0.9999".
    .select(F.col('cols.0').alias('word'),
            F.col('cols.1').alias('Truth'),
            F.col('cols.2').alias('Prediction'),
            F.col('cols.3.O').cast('double').alias('Confidence O'),
            F.col('cols.3.T').cast('double').alias('Confidence T'))
    # Keep whichever of the two scores is numerically larger.
    .withColumn('Confidence', F.when(F.col('Confidence O')>F.col('Confidence T'), F.col('Confidence O')).otherwise(F.col('Confidence T')))
    .select('word','Truth','Prediction','Confidence')
    .dropna()
    # One row per (word, Prediction) pair; which duplicate survives is arbitrary.
    .dropDuplicates(['word', 'Prediction'])
    .orderBy(['word','Truth','Prediction'], ascending=False)
)

In [39]:
# Collect the flattened predictions to the driver as a pandas DataFrame.
predictions_PDF = predictions.toPandas()

In [40]:
# (rows, columns) of the collected frame.
predictions_PDF.shape

(2563, 4)

In [41]:
# Confusion-matrix style tally: number of words for each (Truth, Prediction) pair.
predictions_PDF.groupby(['Truth', 'Prediction'])[['word']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,word
Truth,Prediction,Unnamed: 2_level_1
O,O,2441
O,T,48
T,O,41
T,T,33
