# Get Data from Local Storage

In [2]:
from google.colab import drive

# Make Google Drive available under /gdrive so the data set and the saved
# model can be addressed with ordinary file paths.
drive.mount(mountpoint='/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [17]:
sourcePath = "/gdrive/MyDrive/GermanDataSets/CourtRulings/100_SentenceSplits/"
!ls {sourcePath.replace(' ', '\ ')} -lha

total 34M
-rw------- 1 root root 187K Aug 18 12:48 '00 Test Standard Sentence Detection.ipynb'
-rw------- 1 root root 2.8M Aug 18 12:53  100_2019-02-19_oldp_cases_textout.csv
-rw------- 1 root root 2.9M Aug 15 11:48  100_2019-02-19_oldp_cases_textout.json
-rw------- 1 root root  14K Aug 18 13:19 '10 Train Custom Sentence Detection.ipynb'
-rw------- 1 root root  25M Aug 15 09:56  1k_2019-02-19_oldp_cases_textout.json
-rw------- 1 root root 3.1M Aug 22 15:36 '20 Find Abbreviations.ipynb'
-rw------- 1 root root  22K Aug 22 15:36 '30 Train Custom Sentence Detection.ipynb'
-rw------- 1 root root  18K Aug 22 15:55 '40 Test Custom Sentence Detection.ipynb'
-rw------- 1 root root 3.2K Aug 19 12:19  abbreviations_relevant_dictionary.txt
-rw------- 1 root root 148K Aug 19 09:29  abbreviations.txt
drwx------ 3 root root 4.0K Aug 21 13:52  data


# Setup Spark

In [5]:
# This is only to setup PySpark and Spark NLP on Colab
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash

--2021-08-22 15:50:17--  http://setup.johnsnowlabs.com/colab.sh
Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]
--2021-08-22 15:50:18--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1608 (1.6K) [text/plain]
Saving to: ‘STDOUT’


2021-08-22 15:50:18 (34.9 MB/s) - written to stdout [1608/1608]

setup Colab for PySpark 3.1.2 and Spark NLP 3.2.1
Get:1 http://security.ubuntu.com/ubuntu 

In [6]:
# Pipeline is imported explicitly: it is used when the test pipeline is built,
# and should not depend on whatever the star imports below happen to re-export.
from pyspark.ml import Pipeline, PipelineModel
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp

# Session object used by every Spark call in the rest of the notebook.
spark = sparknlp.start()

# Record the versions in the output so results can be tied to an environment.
print("Spark version:", spark.version)
print("Spark NLP version:", sparknlp.version())

Spark version: 3.1.2
Spark NLP version: 3.2.1


# Read Data & Test Split Sentences with Custom Sentence Detector Model

In [18]:
# Load the 1k court-ruling sample from the mounted Drive folder.
rulings_json = sourcePath + '1k_2019-02-19_oldp_cases_textout.json'
df = spark.read.json(rulings_json)

# Inspect the inferred schema; the row count is the cell's displayed result.
df.printSchema()
df.count()

root
 |-- id: long (nullable = true)
 |-- text_Content: string (nullable = true)



1000

In [19]:
# Stage 1: turn the raw "text" column into a "document" annotation column.
document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: the custom sentence detector, loaded from its saved model directory.
sentence_model_path = sourcePath + 'data/V1/1k_SentenceDetectorDLModel'
sentence = (
    SentenceDetectorDLModel.load(sentence_model_path)
    .setInputCols("document")
    .setOutputCol("sentences")
)

# Stage 3: flatten the sentence annotations into a single string column
# (not an array) while keeping the annotation columns in the result.
finisher = (
    Finisher()
    .setInputCols(["sentences"])
    .setOutputCols(["sentences_string"])
    .setOutputAsArray(False)
    .setCleanAnnotations(False)
)

pipeline_stages = [document, sentence, finisher]
test_pipeline = Pipeline(stages=pipeline_stages)

In [20]:
# Keep the id and the ruling text, exposing the text under the column name
# "text" that the pipeline's DocumentAssembler reads from.
sentencesDF = df.select('id', df['text_Content'].alias('text'))

sentencesDF.show()

+------+--------------------+
|    id|                text|
+------+--------------------+
|188482|
Tenor
Als funkti...|
|188452|
Tenor
Der Antrag...|
|188446|
Tenor
Der Antrag...|
|188455|
Tenor
Der Antrag...|
|188454|
Tenor
Die Klage ...|
|188453|
Tenor
- 1.
Die a...|
|180230|
Tenor
Der Antrag...|
|180229|
Tenor
Der Antrag...|
|188461|
Tenor
Das Verfah...|
|188460|
Tenor
Auf die so...|
|188459|
Tenor
Die Beschw...|
|188458|
Tenor
Die Beschw...|
|188457|
Tenor
Die aufsch...|
|188456|
Tenor
Die aufsch...|
|188450|BeginnDoc
Tenor
D...|
|180231|
Tenor
Der Antrag...|
|188463|
Tenor
Der Antrag...|
|188462|
Tenor
Der Beschl...|
|188464|
Tenor
Das angefo...|
|188451|BeginnDoc
Tenor
D...|
+------+--------------------+
only showing top 20 rows



In [21]:
# Build the pipeline model on the input frame, then apply it to the same frame
# to obtain the sentence annotations.
prediction_model = test_pipeline.fit(sentencesDF)
predDF = prediction_model.transform(sentencesDF)

# Show the resulting schema and a preview of the first 20 rows.
predDF.printSchema()
predDF.show(n=20)

root
 |-- id: long (nullable = true)
 |-- text: string (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- sentences: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value:

In [22]:
# Untruncated view of the sentence annotations for the first row.
predDF.select('sentences').show(n=1, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [23]:
# Untruncated view of the Finisher's string output for the first row.
predDF.select('sentences_string').show(n=1, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### Test Sentence Splitting

Note that some German sentences are split at abbreviations like:
- Abs.
- Aufl.
- i.S.d.
- gem.
- VVG 30.

Note that almost all of the abbreviations listed above usually occur (in correct German) inside a sentence and only very rarely at the end of one (e.g. 'Abs.' and 'Aufl.' are followed by a reference number).

So, wrongly splitting sentences at these abbreviations can have a negative impact on the quality of a downstream task such as [Named Entity Recognition](https://www.johnsnowlabs.com/named-entity-recognition-ner-with-bert-in-spark-nlp/) (NER): if such an abbreviation is part of a law citation, that citation would now (wrongly) be spread across 2 sentences.

In [24]:
import pyspark.sql.functions as F

# Start a new line at every '@' (presumably the separator the Finisher puts
# between sentences - confirm) so each detected sentence is printed on its own
# line, prefixed with "@: ".
print_sentences = F.regexp_replace(F.col('sentences_string'), "[@]", "\r\n@: ")

(
    predDF
    .withColumn('print_sentences', print_sentences)
    .select('print_sentences')
    .show(1, False)
)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------