# Lemmatize PATSTAT using Spark NLP

In [1]:
from sparknlp.base import *
from sparknlp.annotator import *
import sparknlp
from pyspark.ml import Pipeline
import pyspark.sql.functions as F
from pyspark.sql.types import StringType
from pathlib import Path

## 1. Read patents and concatenate the `appln_title` and `appln_abstract` fields

In [2]:
%%time

# Loading patents table text fields, and concatenating them for lemmatization
patents = spark.sql("SELECT appln_id, appln_title, appln_abstract FROM parquet.`/export/ml4ds/IntelComp/Datalake/PATSTAT/2022_Spring/patstat_appln.parquet`")
patents = patents.repartition(numPartitions=20000)
##For development purposes only
#patents = patents.sample(fraction=0.0001)

#Concatenate text fields to lemmatize
patents = (
    patents.withColumn("rawtext",F.concat_ws('. ', "appln_title", "appln_abstract"))
    .drop("appln_title")
    .drop("appln_abstract")
)

print('Number of patents before language filtering:', patents.count())

22/06/08 09:36:25 WARN conf.HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
22/06/08 09:36:25 WARN conf.HiveConf: HiveConf of name hive.stats.retries.wait does not exist
22/06/08 09:36:28 WARN metastore.ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
22/06/08 09:36:28 WARN metastore.ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore jarenas@192.168.148.225
22/06/08 09:36:29 WARN metastore.ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
22/06/08 09:36:29 WARN metastore.ObjectStore: Failed to get database parquet, returning NoSuchObjectException
[Stage 3:>                                                          (0 + 1) / 1]

Number of patents before language filtering: 117081595
CPU times: user 105 ms, sys: 25.3 ms, total: 131 ms
Wall time: 49.2 s


                                                                                

## 2. Filter out patents whose text is not in English

In [4]:
%%time

#Pipeline for language detection
documentAssembler = DocumentAssembler() \
    .setInputCol("rawtext") \
    .setOutputCol("document")

languageDetector = LanguageDetectorDL.pretrained() \
    .setInputCols("document") \
    .setOutputCol("language")

pipeline = Pipeline() \
    .setStages([
      documentAssembler,
      languageDetector
    ])

#Apply language detection pipeline
patents = pipeline.fit(patents).transform(patents)
patents = (
    patents.filter(F.col("language.result")[0]=="en")
    .drop("language")
)

print('Number of patents in English:', patents.count())

ld_wiki_tatoeba_cnn_21 download started this may take some time.
Approximate size to download 7.1 MB
[OK!]


22/06/08 17:11:20 ERROR scheduler.TaskSchedulerImpl: Lost executor 9 on node13.cluster.tsc.uc3m.es: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
22/06/08 17:11:20 WARN scheduler.TaskSetManager: Lost task 115.0 in stage 10.0 (TID 20751) (node13.cluster.tsc.uc3m.es executor 9): ExecutorLostFailure (executor 9 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
22/06/08 17:11:20 WARN scheduler.TaskSetManager: Lost task 79.0 in stage 10.0 (TID 20720) (node13.cluster.tsc.uc3m.es executor 9): ExecutorLostFailure (executor 9 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
22/06/08 17:11:20 WARN scheduler.TaskSetManager: Lost task 118.

Number of patents in English: 78594158
CPU times: user 4.35 s, sys: 2.31 s, total: 6.65 s
Wall time: 12h 7min 59s


                                                                                

## 3. Define and Run Lemmatization Pipeline

   - We work on the documents created in Section 2
   - Sentence detection and tokenization are applied to obtain tokens
   - Lemmatization is carried out
   - Stopwords are removed
   - Tokens are lowercased and punctuation symbols are removed
   - The result is converted back from Spark NLP annotations to string format

In [5]:
%%time 

#Next, we carry out the lemmatization pipeline

sentenceDetector = SentenceDetector() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence")

tokenizer = Tokenizer() \
    .setInputCols(["sentence"]) \
    .setOutputCol("token")

lemmatizer = LemmatizerModel.pretrained() \
    .setInputCols(["token"]) \
    .setOutputCol("lemma")

stopWords = StopWordsCleaner() \
    .setInputCols(["lemma"]) \
    .setOutputCol("cleanlemma")

normalizer = Normalizer() \
    .setInputCols(["cleanlemma"]) \
    .setOutputCol("normalizedlemma") \
    .setLowercase(True) \
    .setCleanupPatterns(["""[^\w\d\s]"""])

finisher = Finisher() \
     .setInputCols(['normalizedlemma'])

pipeline = Pipeline() \
    .setStages([
      sentenceDetector,
      tokenizer,
      lemmatizer,
      stopWords,
      normalizer,
      finisher
])

#We apply pipeline and recover lemmas as string
patents = pipeline.fit(patents).transform(patents)

udf_back2str = F.udf(lambda x:' '.join(list(x)), StringType() )
patents = (
    patents.withColumn("lemmas",udf_back2str(F.col("finished_normalizedlemma")))
    .drop("finished_normalizedlemma")
)

#Show results of validation for n papers
#patents.show(n=10, truncate=120, vertical=True)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[ | ]lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
Download done! Loading the resource.




[ — ]

                                                                                

[OK!]
CPU times: user 135 ms, sys: 0 ns, total: 135 ms
Wall time: 9.24 s


## 4. Save a table with `appln_id`, `rawtext` and `lemmas` to HDFS

In [6]:
%%time

#Save calculated lemmas to HDFS
dir_parquet = Path("/export/ml4ds/IntelComp/Datalake/PATSTAT/2021_Autumn")

patents.coalesce(1000).write.parquet(
    dir_parquet.joinpath(f"patstat_appln_NLP.parquet").as_posix(),
    mode="overwrite",
)

22/06/09 00:42:42 ERROR scheduler.TaskSchedulerImpl: Lost executor 7 on node37.cluster.tsc.uc3m.es: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
22/06/09 00:42:42 WARN scheduler.TaskSetManager: Lost task 28.0 in stage 15.0 (TID 40941) (node37.cluster.tsc.uc3m.es executor 7): ExecutorLostFailure (executor 7 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
22/06/09 00:42:42 WARN scheduler.TaskSetManager: Lost task 5.0 in stage 15.0 (TID 40934) (node37.cluster.tsc.uc3m.es executor 7): ExecutorLostFailure (executor 7 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
22/06/09 00:42:42 WARN scheduler.TaskSetManager: Lost task 67.0 i

CPU times: user 6.52 s, sys: 2.48 s, total: 9 s
Wall time: 15h 11min 27s


I0609 17:47:23.622265 19277 detector.cpp:152] Detected a new leader: (id='7863')
I0609 17:47:23.622398 19277 group.cpp:700] Trying to get '/mesos/json.info_0000007863' in ZooKeeper
I0609 17:47:23.623145 19277 zookeeper.cpp:262] A new leading master (UPID=master@10.0.12.77:5050) is detected
I0609 17:47:23.623587 19312 sched.cpp:345] New master detected at master@10.0.12.77:5050
I0609 17:47:23.623729 19312 sched.cpp:410] Authenticating with master master@10.0.12.77:5050
I0609 17:47:23.623739 19312 sched.cpp:417] Using default CRAM-MD5 authenticatee
I0609 17:47:23.623869 19292 authenticatee.cpp:121] Creating new client SASL connection
W0609 17:47:30.178761 19325 sched.cpp:455] Authentication timed out
I0609 17:47:30.178848 19312 sched.cpp:488] Failed to authenticate with master master@10.0.12.77:5050: Authentication discarded
I0609 17:47:30.178876 19312 sched.cpp:410] Authenticating with master master@10.0.12.77:5050
I0609 17:47:30.178884 19312 sched.cpp:417] Using default CRAM-MD5 authen

## 5. Optional: Check that the generated table looks OK

In [7]:
%%time

#Test that the saved table is correct
patents = spark.sql("SELECT * FROM parquet.`/export/ml4ds/IntelComp/Datalake/PATSTAT/2021_Autumn/patstat_appln_NLP.parquet`")
print('Number of lemmatized patents:', patents.count())
patents.show(n=10, truncate=120, vertical=True)

22/06/09 18:54:58 WARN metastore.ObjectStore: Failed to get database parquet, returning NoSuchObjectException
                                                                                

Number of lemmatized patents: 78594158
-RECORD 0----------------------------------------------------------------------------------------------------------------------------
 appln_id | 25647584                                                                                                                 
 rawtext  | WATER-SOLUBLE AZOE COMPOUND, ITS PRODUCTION AND USE THEREOF AS DYE                                                       
 lemmas   | watersoluble azoe compound production use thereof dye                                                                    
-RECORD 1----------------------------------------------------------------------------------------------------------------------------
 appln_id | 473941040                                                                                                                
 rawtext  | Display screen with a graphical user interface                                                                           
 lemmas   | display scr