# Imports

In [1]:
import sparknlp
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession, SQLContext
from sparknlp.annotator import *
from sparknlp.base import *
from sparknlp.common import *
import pyspark.sql.functions as F
from sparknlp.training import CoNLL

# Start a Spark Session 

In [2]:
# Start the Spark session through Spark NLP with the GPU option turned off.
spark = sparknlp.start(gpu=False)
# Optional, currently disabled: set Spark SQL shuffle partitions to 10.
# sqlContext = SQLContext(spark)
# sqlContext.setConf('spark.sql.shuffle.partitions', '10')

# Load in the CoNLL Data

In [3]:
# Training Data
# Read the CoNLL-formatted training file into a Spark DataFrame and preview it.
train_path = 'Data/trainingTextProcessed.txt'
conll_reader = CoNLL()
train = conll_reader.readDataset(spark, train_path)
train.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Imaging-AMARETTO ...|[{document, 0, 15...|[{document, 0, 15...|[{token, 0, 15, I...|[{pos, 0, 15, NN,...|[{named_entity, 0...|
|The availability ...|[{document, 0, 22...|[{document, 0, 22...|[{token, 0, 2, Th...|[{pos, 0, 2, DT, ...|[{named_entity, 0...|
|Here , present Im...|[{document, 0, 25...|[{document, 0, 25...|[{token, 0, 3, He...|[{pos, 0, 3, RB, ...|[{named_entity, 0...|
|To demonstrate ut...|[{document, 0, 34...|[{document, 0, 34...|[{token, 0, 1, To...|[{pos, 0, 1, TO, ...|[{named_entity, 0...|
|Our results show ...|[{document, 0, 20...|[{document, 0, 20...|[{token, 0, 2, Ou...|[{pos, 0, 2, PRP$..

In [4]:
# Testing Data
# Read the CoNLL-formatted testing file into a Spark DataFrame.
test_path = 'Data/testingTextProcessed.txt'
test = CoNLL().readDataset(spark, test_path)

# Define the Pipeline

In [5]:
# Normalize the text
# Each token has every character removed that is not a word character, digit
# or whitespace (i.e. punctuation); the original casing is kept.
normalizer = (
    Normalizer()
    .setInputCols(['token'])
    .setOutputCol('normalized')
    .setLowercase(False)
    # Raw string: the backslashes belong to the regex, not to Python escape
    # sequences (a plain "..." literal triggers invalid-escape warnings).
    .setCleanupPatterns([r"[^\w\d\s]"])
)

# Get ELMo word embeddings
# Uses the default pretrained ELMo model (downloaded on first use) and embeds
# the normalized tokens of each sentence.
elmo = (
    ElmoEmbeddings.pretrained()
    .setInputCols(["sentence", "normalized"])
    .setOutputCol("elmo")
)

# Define the NER deep learning model
# Trains on the normalized tokens and their ELMo embeddings against the
# CoNLL `label` column.
nerTagger = (
    NerDLApproach()
    .setInputCols(["sentence", "normalized", "elmo"])
    .setLabelColumn("label")
    .setOutputCol("ner")
    # NOTE(review): the Conclusion cell talks about a 20-epoch run — confirm
    # which epoch count produced the reported F1 score.
    .setMaxEpochs(50)
    .setLr(0.01) # learning rate
    .setPo(0.005) # learning rate decay
    .setBatchSize(1024)
    .setRandomSeed(777)
    .setVerbose(1)
    .setValidationSplit(0.1) # Make a validation split
    .setEvaluationLogExtended(True)
    .setEnableOutputLogs(True)
    .setIncludeConfidence(True) # include confidence values
    # Parquet copy of the test set that already contains the `normalized` and
    # `elmo` columns; it is written by the "Write Test Data to Parquet" cell.
    .setTestDataset('Data/testingTextProcessed.parquet')
)

# Construct the pipeline
ner_pipeline = Pipeline(stages=[normalizer, elmo, nerTagger])

elmo download started this may take some time.
Approximate size to download 334.1 MB
[OK!]


# Apply Elmo to Test Data

In [6]:
# Add the normalized tokens and ELMo embeddings to the test set so it can be
# written to Parquet and handed to the NER tagger as its test dataset.
# `test` is reassigned to its own enriched version, so the guard keeps this
# cell idempotent: re-running it will not apply both stages a second time to a
# frame that already carries the `normalized` and `elmo` columns.
if 'elmo' not in test.columns:
    test = normalizer.fit(test).transform(test)
    test = elmo.transform(test)
test.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|          normalized|                elmo|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Pathway Tools ver...|[{document, 0, 88...|[{document, 0, 88...|[{token, 0, 6, Pa...|[{pos, 0, 6, NNP,...|[{named_entity, 0...|[{token, 0, 6, Pa...|[{word_embeddings...|
|Biological system...|[{document, 0, 11...|[{document, 0, 11...|[{token, 0, 9, Bi...|[{pos, 0, 9, JJ, ...|[{named_entity, 0...|[{token, 0, 9, Bi...|[{word_embeddings...|
|Our development P...|[{document, 0, 20...|[{document, 0, 20...|[{token, 0, 2, Ou...|[{pos, 0, 2, PRP$...|[{named_entity, 0...|[{token, 0, 2, Ou...|[{

# Write Test Data to Parquet

In [7]:
# Persist the normalized + ELMo-embedded test set, replacing any previous copy.
# The NER tagger's setTestDataset points at this same path, so this presumably
# needs to run before the pipeline is fitted.
test.write.mode('overwrite').parquet("Data/testingTextProcessed.parquet")

# Fit the NER Model

In [8]:
# Fit the full pipeline (normalizer, ELMo embeddings, NER tagger) on the training data.
ner_model = ner_pipeline.fit(train)

In [None]:
!cat /Users/jonathandekermanjian/annotator_logs/NerDLApproach_5b5f74d78e21.log

# Test the model

In [9]:
# Apply the model onto the test data
# NOTE(review): `test` already carries the `normalized` and `elmo` columns added
# in the "Apply Elmo to Test Data" cell, and the fitted pipeline contains those
# same stages, so they are applied again here — confirm this is intended.
predictions = ner_model.transform(test)

In [10]:
# Per-token view of the predictions, restricted to tokens whose ground-truth
# label is anything other than "O".
# NOTE(review): `ner` is produced from the `normalized` tokens, while this zips
# it positionally with `token` and `label`; if normalization drops tokens the
# arrays will not line up — confirm.
zipped_annotations = F.arrays_zip('token.result', 'label.result', 'ner.result', 'ner.metadata')
per_token = predictions.select(F.explode(zipped_annotations).alias('cols'))

# Name the zipped fields; the confidence is read from the NER metadata entry.
labelled_tokens = per_token.select(
    F.col('cols.0').alias('word'),
    F.col('cols.1').alias('Truth'),
    F.col('cols.2').alias('Prediction'),
    F.col('cols.3.confidence').alias('Confidence'),
)

# Drop incomplete rows, keep non-"O" truths, one row per (word, prediction) pair.
entity_rows = labelled_tokens.dropna().filter('Truth != "O"')
entity_rows = entity_rows.dropDuplicates(['word', 'Prediction'])

entity_rows.orderBy(['word', 'Truth', 'Prediction'], ascending=False).show(100, truncate=False)

+----+-----+----------+----------+
|word|Truth|Prediction|Confidence|
+----+-----+----------+----------+
+----+-----+----------+----------+



In [11]:
# Per-token view of the predictions, restricted to tokens whose ground-truth
# label is anything other than "T", sorted by predicted label (descending).
# NOTE(review): `ner` is produced from the `normalized` tokens, while this zips
# it positionally with `token` and `label`; if normalization drops tokens the
# arrays will not line up — confirm.
zipped_fields = F.arrays_zip('token.result', 'label.result', 'ner.result', 'ner.metadata')
token_rows = predictions.select(F.explode(zipped_fields).alias('cols'))

# Name the zipped fields; the confidence is read from the NER metadata entry.
named_rows = token_rows.select(
    F.col('cols.0').alias('word'),
    F.col('cols.1').alias('Truth'),
    F.col('cols.2').alias('Prediction'),
    F.col('cols.3.confidence').alias('Confidence'),
)

# Drop incomplete rows, keep non-"T" truths, one row per (word, prediction) pair.
non_tool_rows = named_rows.dropna().filter('Truth != "T"')
non_tool_rows = non_tool_rows.dropDuplicates(['word', 'Prediction'])

non_tool_rows.orderBy('Prediction', ascending=False).show(70, truncate=False)

+----+-----+----------+----------+
|word|Truth|Prediction|Confidence|
+----+-----+----------+----------+
+----+-----+----------+----------+



# Save the model

In [None]:
# Save the fitted pipeline model to the "NER_model" path, overwriting any existing copy.
ner_model.write().overwrite().save("NER_model")

# Conclusion

With only 20 epochs, our model is doing pretty decently, with an F1 score of 79%. Looking at the logs, we seem to be having a harder time with false negatives than with false positives. However, from the table above you can see that some tools, depending on where within a sentence they are found, are predicted both as a tool (T) and as other (O). Running this model for more epochs and tuning the model hyperparameters should result in a good model for detecting metabolomics software tools within text.