# Get Data from Local Storage

In [1]:
# Mount Google Drive at /gdrive so the data-set folder can be read like a local path.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [2]:
sourcePath = "/gdrive/MyDrive/GermanDataSets/CourtRulings/100_SentenceSplits/data/V1/"
!ls {sourcePath.replace(' ', '\ ')} -lha

total 51M
-rw------- 1 root root 2.9M Aug 21 21:26 100_2019-02-19_oldp_cases_textout_SentenceSplitter.csv
-rw------- 1 root root 2.9M Aug 22 16:06 100_array_2019-02-19_oldp_cases_textout.json
-rw------- 1 root root  21M Aug 21 21:39 1k_2019-02-19_oldp_cases_textout_SentenceSplitter.csv
-rw------- 1 root root  25M Aug 22 16:06 1k_array_2019-02-19_oldp_cases_textout.json
drwx------ 2 root root 4.0K Aug 22 20:37 1k_SentenceDetectorDLModel
-rw------- 1 root root 3.2K Aug 21 21:22 abbreviations_relevant_dictionary.txt
-rw------- 1 root root 1.6K Aug 22 15:18 abbreviations_relevant_dictionary_wo_periods.txt


# Setup Spark

In [3]:
# This is only to setup PySpark and Spark NLP on Colab
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash

--2021-08-22 20:38:26--  http://setup.johnsnowlabs.com/colab.sh
Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]
--2021-08-22 20:38:26--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1608 (1.6K) [text/plain]
Saving to: ‘STDOUT’


2021-08-22 20:38:27 (47.4 MB/s) - written to stdout [1608/1608]

setup Colab for PySpark 3.1.2 and Spark NLP 3.2.1
Get:1 https://cloud.r-project.org/bin/li

In [4]:
from pyspark.ml import PipelineModel
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp

# Start the Spark session through Spark NLP, requesting the GPU build.
spark = sparknlp.start(gpu=True)

# Print the versions actually in use so the recorded outputs can be traced back to them.
print("Spark version:", spark.version)
print("Spark NLP version:", sparknlp.version())

Spark version: 3.1.2
Spark NLP version: 3.2.1


# Read Training Data & Train Sentence Detector Model to Recognize Abbreviations

In [5]:
# Training file, resolved relative to sourcePath.
# Swap in the commented-out line to train on the smaller `100_` file instead.
#inputFile = '100_2019-02-19_oldp_cases_textout_SentenceSplitter.csv'
inputFile = '1k_2019-02-19_oldp_cases_textout_SentenceSplitter.csv'

In [6]:
# Reader for the pipe-delimited, UTF-8 training CSV (header row, inferred column types).
csvReader = spark.read.format('com.databricks.spark.csv').options(
    delimiter='|', header='true', inferschema='true', encoding="utf-8"
)

# Keep only the rows flagged as containing an abbreviation and expose their
# text under the lower-case column name `text` that the pipeline reads.
trainingDF = (
    csvReader
    .load(sourcePath + inputFile)
    .where('ContainsAbbr = 1')
    .select('Text')
    .withColumnRenamed('Text', 'text')
)

# Show the resulting schema; the row count is the cell's displayed value.
trainingDF.printSchema()
trainingDF.count()

root
 |-- text: string (nullable = true)



44721

In [7]:
def readFile(fileName, encoding="utf-8"):
  """Read a text file and return its lines without line terminators.

  Args:
    fileName: Path of the file to read.
    encoding: Text encoding used to decode the file. Defaults to UTF-8 so the
      result does not depend on the machine's locale.

  Returns:
    A list with one string per line of the file; an empty list for an empty file.

  Raises:
    OSError: If the file cannot be opened.
    UnicodeDecodeError: If the content is not valid in the given encoding.
  """
  # The context manager closes the handle even if reading or decoding fails.
  with open(fileName, "r", encoding=encoding) as fileObj:
    return fileObj.read().splitlines()

# Load the abbreviation dictionary from the data folder as a list of lines and print it for inspection.
# NOTE: the variable name is misspelled ("abbrevitions"); the pipeline cell
# further down uses the same spelling, so it is kept unchanged here.
abbrevitions  = readFile(sourcePath + 'abbreviations_relevant_dictionary.txt')
print (abbrevitions)

['-drs.', '1. zivilkammer', '10. zivilkammer', '11. zivilkammer', '12. zivilkammer', '13. zivilkammer', '14. zivilkammer', '15. zivilkammer', '16. zivilkammer', '17. zivilkammer', '18. zivilkammer', '19. zivilkammer', '2.\xa0zivilkammer', '20. zivilkammer', '21. zivilkammer', '22. zivilkammer', '23. zivilkammer', '24. zivilkammer', '25. zivilkammer', '26. zivilkammer', '27. zivilkammer', '28. zivilkammer', '29. zivilkammer', '3. zivilkammer', '30. zivilkammer', '31. zivilkammer', '32. zivilkammer', '33. zivilkammer', '34. zivilkammer', '35. zivilkammer', '36. zivilkammer', '37. zivilkammer', '38. zivilkammer', '39. zivilkammer', '4. zivilkammer', '5. zivilkammer', '6. zivilkammer', '7. zivilkammer', '8. zivilkammer', '9  . zivilkammer', '9. zivilkammer', 'a.', 'aao.', 'abbldg.', 'abdr.', 'abl.', 'abs.', 'abt.', 'akad.', 'akt.', 'alt.', 'amtl.', 'anh.', 'anl.', 'anm.', 'art. 1', 'art. 2', 'art. 3', 'art. 4', 'art. 5', 'art. 6', 'art. 7', 'art. 8', 'art. 9', 'art. i', 'art. v', 'aufl.', 

In [8]:
logPath = './logs/'
!mkdir logPath

In [9]:
!rm -r -f {logPath + '*.*'}

See [SentenceDetectorDLApproach API](https://nlp.johnsnowlabs.com/api/python/reference/autosummary/sparknlp.annotator.SentenceDetectorDLApproach.html) for more details.

In [10]:
# Stage 1: turn the raw `text` column into `document` annotations.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: trainable sentence detector. It trains for 3 epochs, holds out 20 %
# of the data for validation, writes its training log to logPath and receives
# the abbreviation list as "impossible penultimates".
sentenceDetector = (
    SentenceDetectorDLApproach()
    .setInputCols(["document"])
    .setOutputCol("sentences")
    .setEpochsNumber(3)
    .setValidationSplit(.2)
    .setOutputLogsPath(logPath)
    .setImpossiblePenultimates(abbrevitions)
)

# Chain both stages into a single trainable pipeline.
pipeline = Pipeline(stages=[documentAssembler, sentenceDetector])

In [11]:
%%time
# Fit the pipeline on the training sentences; %%time reports how long training took.
model = pipeline.fit(trainingDF)

CPU times: user 382 ms, sys: 57.2 ms, total: 439 ms
Wall time: 1min 32s


In [12]:
ls {logPath} -lha

total 12K
drwxr-xr-x 2 root root 4.0K Aug 22 20:41 [0m[01;34m.[0m/
drwxr-xr-x 1 root root 4.0K Aug 22 20:41 [01;34m..[0m/
-rw-r--r-- 1 root root  261 Aug 22 20:42 sentence_detector_dl.log


In [13]:
!head -n 50 {logPath + 'sentence_detector_dl.log'}

Training 3 epochs
Epoch 1/3	47.26s	Loss: 892.9996	ACC: 0.9477721	Validation ACC: 0.97440016
Epoch 2/3	15.54s	Loss: 440.00717	ACC: 0.97704667	Validation ACC: 0.9774851
Epoch 3/3	14.89s	Loss: 377.01813	ACC: 0.9806159	Validation ACC: 0.9803536
Training completed.


In [14]:
# Hand-written German samples containing abbreviations ("gem.", "Aufl.",
# "i.S.d.") followed by ordinary sentence ends, used to eyeball the model.
# Each row is (id, text); keep the value types consistent across rows.
testRows = [
    (1, 'Ich teste mal eine Referenz mit Auflage gem. Hüßtege in Thomas/Putzo ZPO 39. Aufl. § 72a GVGR. Eigentlich sollte das doch funktionieren? Oder nicht?'),
    (2, 'Bin mal gespannt ob das i.S.d. Erfinders ist. Der Erfinder hatte eventuell etwas ganz anderes im Sinn?'),
]
testColumns = ['id', 'text']

testDF = spark.createDataFrame(testRows, testColumns)

# Inspect the inferred schema and a truncated preview of the rows.
testDF.printSchema()
testDF.show()

root
 |-- id: long (nullable = true)
 |-- text: string (nullable = true)

+---+--------------------+
| id|                text|
+---+--------------------+
|  1|Ich teste mal ein...|
|  2|Bin mal gespannt ...|
+---+--------------------+



In [15]:
# Run the fitted pipeline on the samples and print the full, untruncated
# `sentences` annotations.
(
    model.transform(testDF)
    .select('sentences')
    .show(truncate=False)
)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|sentences                                                                                                                                                                                                                                                                            |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[{document, 0, 93, Ich teste mal eine Referenz mit Auflage gem. Hüßtege in Thomas/Putzo ZPO 39. Aufl. § 72a GVGR., {sentence -> 0}, []}, {document, 95, 135, Ei

In [16]:
# Print the type of the fitted pipeline and of each of its two stages.
for fittedObject in (model, model.stages[0], model.stages[1]):
    print (type(fittedObject))

<class 'pyspark.ml.pipeline.PipelineModel'>
<class 'sparknlp.base.DocumentAssembler'>
<class 'sparknlp.annotator.SentenceDetectorDLModel'>


In [17]:
# Persist only the trained sentence-detector stage (stage index 1) so it can be
# re-loaded later as part of a different pipeline. An existing model at the
# target path is overwritten.
# NOTE(review): the folder name hard-codes the "1k_" prefix regardless of which
# inputFile was used for training - confirm this is intended.
trainedDetector = model.stages[1]
modelPath = sourcePath + '1k_SentenceDetectorDLModel'
trainedDetector.write().overwrite().save(modelPath)