# Imports

In [1]:
import sparknlp
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from sparknlp.annotator import *
from sparknlp.base import *
from sparknlp.common import *
import pyspark.sql.functions as F
from sparknlp.training import CoNLL

# Start a Spark Session 

In [2]:
# Create the Spark session through sparknlp.start, with GPU support turned off (gpu=False).
spark = sparknlp.start(gpu=False)

# Load in the CoNLL Data

In [3]:
# Training split: parse the CoNLL-formatted file into a Spark DataFrame
# and preview the first rows.
train_path = 'Data/trainingTextProcessed.txt'
train = CoNLL().readDataset(spark, train_path)
train.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|To design simple ...|[{document, 0, 14...|[{document, 0, 14...|[{token, 0, 1, To...|[{pos, 0, 1, TO, ...|[{named_entity, 0...|
|To verify perform...|[{document, 0, 12...|[{document, 0, 12...|[{token, 0, 1, To...|[{pos, 0, 1, TO, ...|[{named_entity, 0...|
|The results CompE...|[{document, 0, 15...|[{document, 0, 15...|[{token, 0, 2, Th...|[{pos, 0, 2, DT, ...|[{named_entity, 0...|
|The PC1+PC2 score...|[{document, 0, 84...|[{document, 0, 84...|[{token, 0, 2, Th...|[{pos, 0, 2, DT, ...|[{named_entity, 0...|
|We present web ap...|[{document, 0, 15...|[{document, 0, 15...|[{token, 0, 1, We...|[{pos, 0, 1, PRP,..

In [4]:
# Held-out test split, read with the same CoNLL reader as the training data.
test_path = 'Data/testingTextProcessed.txt'
test = CoNLL().readDataset(spark, test_path)

# Define the Pipeline

In [5]:
# Normalize the text: keep the original casing and strip every character
# that is not a word character, digit, or whitespace.
# The pattern is a raw string so the backslashes reach the regex engine untouched
# (a plain "..." literal with \w, \d, \s triggers Python's invalid-escape warning).
normalizer = (
    Normalizer()
    .setInputCols(['token'])
    .setOutputCol('normalized')
    .setLowercase(False)
    .setCleanupPatterns([r"[^\w\d\s]"])
)

# Get ELMo word embeddings (pretrained model, downloaded on first use)
elmo = (
    ElmoEmbeddings.pretrained()
    .setInputCols("sentence", "normalized")
    .setOutputCol("elmo")
)

# Define the NER deep learning model.
# It consumes the normalized tokens and their ELMo embeddings and learns
# to predict the tags stored in the "label" column.
nerTagger = (
    NerDLApproach()
    .setInputCols(["sentence", "normalized", "elmo"])
    .setLabelColumn("label")
    .setOutputCol("ner")
    .setMaxEpochs(20)
    .setLr(0.01) # learning rate
    .setPo(0.005) # learning rate decay
    .setBatchSize(64)
    .setRandomSeed(555) # fixed seed for repeatable runs
    .setVerbose(1)
    .setValidationSplit(0.2) # hold out 20% of the training data for validation
    .setEvaluationLogExtended(True)
    .setEnableOutputLogs(True) # write the per-epoch training log to disk
    .setIncludeConfidence(True) # include confidence values
    # Parquet copy of the test set; it is written by a later cell and must
    # exist before the pipeline is fitted.
    .setTestDataset('Data/testingTextProcessed.parquet')
)

# Construct the pipeline: normalization -> embeddings -> NER training
ner_pipeline = Pipeline(stages=[normalizer, elmo, nerTagger])

elmo download started this may take some time.
Approximate size to download 334.1 MB
[OK!]


# Apply Elmo to Test Data

In [6]:
# Add the 'normalized' and 'elmo' columns to the test set up front, so the
# parquet copy handed to NerDLApproach.setTestDataset carries the same input
# columns the tagger is configured to read.
normalizer_model = normalizer.fit(test)
test = elmo.transform(normalizer_model.transform(test))
test.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|          normalized|                elmo|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|SECIMTools ( Sout...|[{document, 0, 12...|[{document, 0, 12...|[{token, 0, 9, SE...|[{pos, 0, 9, NNP,...|[{named_entity, 0...|[{token, 0, 9, SE...|[{word_embeddings...|
|SECIMTools levera...|[{document, 0, 14...|[{document, 0, 14...|[{token, 0, 9, SE...|[{pos, 0, 9, NNP,...|[{named_entity, 0...|[{token, 0, 9, SE...|[{word_embeddings...|
|Our method , name...|[{document, 0, 13...|[{document, 0, 13...|[{token, 0, 2, Ou...|[{pos, 0, 2, PRP$...|[{named_entity, 0...|[{token, 0, 2, Ou...|[{

# Write Test Data to Parquet

In [7]:
# Persist the test set (now including the 'normalized' and 'elmo' columns) at the
# path given to NerDLApproach.setTestDataset. mode('overwrite') replaces any
# earlier copy, so this cell can be re-run safely.
test.write.mode('overwrite').parquet("Data/testingTextProcessed.parquet")

# Fit the NER Model

In [8]:
# Fit every pipeline stage (normalizer -> ELMo -> NerDLApproach) on the training data.
# The result is the fitted pipeline model used for prediction below.
ner_model = ner_pipeline.fit(train)

In [9]:
!cat /Users/jonathandekermanjian/annotator_logs/NerDLApproach_5ec37af665c0.log

Name of the selected graph: ner-dl/blstm_10_512_128_120.pb
Training started - total epochs: 20 - lr: 0.01 - batch size: 64 - labels: 2 - chars: 63 - training examples: 642


Epoch 1/20 started, lr: 0.01, dataset size: 642


Epoch 1/20 - 3.95s - loss: 146.65201 - batches: 13
Quality on validation dataset (20.0%), validation examples = 128
time to finish evaluation: 0.55s
label	 tp	 fp	 fn	 prec	 rec	 f1
T	 129	 51	 20	 0.71666664	 0.86577183	 0.7841945
tp: 129 fp: 51 fn: 20 labels: 1
Macro-average	 prec: 0.71666664, rec: 0.86577183, f1: 0.7841945
Micro-average	 prec: 0.71666664, rec: 0.86577183, f1: 0.7841945
Quality on test dataset: 
time to finish evaluation: 0.34s
label	 tp	 fp	 fn	 prec	 rec	 f1
T	 137	 65	 31	 0.6782178	 0.8154762	 0.7405405
tp: 137 fp: 65 fn: 31 labels: 1
Macro-average	 prec: 0.6782178, rec: 0.8154762, f1: 0.7405405
Micro-average	 prec: 0.6782178, rec: 0.8154762, f1: 0.7405405


Epoch 2/20 started, lr: 0.0099502485, dataset size: 642


Epoch 2/20 - 2.78s - loss: 2

# Test the model

In [10]:
# Apply the fitted pipeline model to the test data; this adds the 'ner' column
# that is compared against 'label' below.
# NOTE(review): `test` already carries 'normalized' and 'elmo' from the earlier
# cell; the pipeline presumably recomputes both columns here -- confirm.
predictions = ner_model.transform(test)

In [11]:
# Build a word-level comparison of gold labels and predictions.
# NOTE(review): 'ner' is produced from the 'normalized' tokens, while 'token' and
# 'label' come from the raw tokens. If the normalizer drops tokens, the zipped
# arrays may not line up position by position -- confirm.

# Pair the arrays element-wise; the struct fields are addressed by position
# ('0' .. '3') in the select that follows.
zipped_cols = F.arrays_zip('token.result', 'label.result', 'ner.result', 'ner.metadata')

# One row per zipped element.
per_token = predictions.select(F.explode(zipped_cols).alias('cols'))

comparison = per_token.select(
    F.col('cols.0').alias('word'),
    F.col('cols.1').alias('Truth'),
    F.col('cols.2').alias('Prediction'),
    F.col('cols.3.confidence').alias('Confidence'),
)

# Drop incomplete rows, keep only words whose gold label is not "O", and show
# each (word, prediction) combination once.
(
    comparison
    .dropna()
    .filter('Truth != "O"')
    .dropDuplicates(['word', 'Prediction'])
    .show(70, truncate=False)
)

+-------------------+-----+----------+----------+
|word               |Truth|Prediction|Confidence|
+-------------------+-----+----------+----------+
|INMEX              |T    |T         |0.8518    |
|InterpretMSSpectrum|T    |O         |1.0       |
|DrugViz            |T    |T         |0.8682    |
|MarVis-Filter      |T    |O         |1.0       |
|GridMass           |T    |T         |0.9931    |
|MassyTools         |T    |T         |0.9868    |
|OpenMS             |T    |T         |0.9994    |
|SIRIUS             |T    |O         |1.0       |
|Escher             |T    |T         |0.9967    |
|ProfileDB          |T    |T         |0.6443    |
|INMEX              |T    |O         |1.0       |
|mzMatch-ISO        |T    |O         |1.0       |
|Pathos             |T    |T         |0.9016    |
|Pathos             |T    |O         |1.0       |
|jmzTab             |T    |O         |0.9999    |
|MetaboQuant        |T    |T         |0.8024    |
|MGV                |T    |O         |0.9999    |


# Conclusion

With only 20 epochs, our model is doing pretty decently, with an F1 score of 79%. Looking at the logs, we seem to be having a harder time with false negatives compared to false positives. However, from the table above you can see that some tools, depending on where within a sentence they are found, are predicted both as a tool (T) and as other (O). Running this model for more epochs and tuning the model hyperparameters should result in a good model for detecting metabolomics software tools within text.