In [1]:
import pandas as pd
import numpy as np
import re
from pyspark.ml import PipelineModel
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.tokenize import sent_tokenize, word_tokenize
# Fetch the NLTK resources this notebook relies on; packages that are already
# present are reported as up-to-date.
nltk.download('tagsets')
nltk.download('punkt')
nltk.download('wordnet')
# Print the tag-set descriptions of the NNP and NN part-of-speech tags.
nltk.help.upenn_tagset('NNP')
nltk.help.upenn_tagset('NN')
nltk.download('maxent_ne_chunker')
nltk.download('words')
# Last expression of the cell: its return value is what the cell displays.
nltk.download('averaged_perceptron_tagger')

NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...
NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...


[nltk_data] Downloading package tagsets to /home/kurubal/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package punkt to /home/kurubal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/kurubal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/kurubal/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/kurubal/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/kurubal/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
# Obtain the Spark session through Spark NLP's start() helper, then echo the
# session object as the cell output.
spark = sparknlp.start()
spark

In [None]:
# Load the raw corpus into `text_data`.
# Only I/O failures (missing file, bad path, permissions) are reported with the
# message below; any other error (e.g. a decoding problem) is left to surface
# with its real traceback instead of being disguised as "file not found".
# NOTE: when the file cannot be read, `text_data` stays undefined and the
# following cells cannot run.
try:
  with open("tr1.txt", "r", encoding="utf-8") as file:
    text_data = file.read()
except OSError:
  print("There is not such a file  or path is incorrect")

In [None]:
# Remove bracketed spans -- anything from an opening "(", "[" or "{" up to the
# nearest closing ")", "]" or "}" on the same line (non-greedy; "." does not
# cross newlines). The pattern is a raw string so that "\(", "\[", ... are
# passed to the regex engine instead of being parsed as (invalid) Python
# string escapes.
text_data_clean_brackets = re.sub(r'[\(\[\{].*?[\)\]\}]', '', text_data)

In [None]:
# Single characters that are deleted outright from the text.
custom_char = ["-","#",":","~","$","*","/","+"]
# One translate() pass removes every listed character.
text_data_clean_brackets = text_data_clean_brackets.translate(
    str.maketrans("", "", "".join(custom_char))
)

In [None]:
# Drop every apostrophe from the cleaned text.
text = text_data_clean_brackets.replace("'", "")

# START

In [1]:
# Pin spark-nlp to the same release as the JVM package loaded through
# spark.jars.packages (3.3.2) so the Python and Scala sides cannot drift apart.
# %pip installs into the environment of the running kernel.
%pip install -q pyspark==3.1.2 spark-nlp==3.3.2

[K     |████████████████████████████████| 212.4 MB 64 kB/s 
[K     |████████████████████████████████| 130 kB 54.6 MB/s 
[K     |████████████████████████████████| 198 kB 61.3 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [2]:
from pyspark.ml import PipelineModel
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from pyspark.ml import Pipeline

In [6]:
from pyspark.sql import SparkSession

# Local Spark session configured for Spark NLP: Kryo serialization and the
# Spark NLP 3.3.2 package pulled in through spark.jars.packages.
builder = (
    SparkSession.builder
    .appName("Spark NLP Licensed")
    .master("local[*]")
    .config("spark.driver.memory", "24G")
    .config("spark.driver.maxResultSize", "2048GB")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer.max", "2000M")
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:3.3.2")
)

spark = builder.getOrCreate()

In [7]:
spark

In [None]:
%%bash

# Download the sample notes mt_oncology_0.txt .. mt_oncology_9.txt into
# ./oncology_notes. The loaded DataFrame shows ten notes (0-9); requesting
# index 10 only produced a failed download that `wget -q` kept silent.
for i in {0..9}
do
  wget -q "https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Healthcare/data/oncology_notes/mt_oncology_${i}.txt" -P oncology_notes
done

## Reading the divided text files into a Spark DataFrame

In [None]:
# Read every file under oncology_notes/ as a whole, one record per file.
textFiles = spark.sparkContext.wholeTextFiles("oncology_notes/*")

# Convert to a DataFrame, naming the two fields 'path' and 'text', and preview it.
df = textFiles.toDF(schema=['path','text'])
df.show()

+--------------------+--------------------+
|                path|                text|
+--------------------+--------------------+
|file:/content/onc...|Sample Type / Med...|
|file:/content/onc...|Sample Type / Med...|
|file:/content/onc...|Sample Type / Med...|
|file:/content/onc...|Sample Type / Med...|
|file:/content/onc...|Sample Type / Med...|
|file:/content/onc...|Sample Type / Med...|
|file:/content/onc...|Sample Type / Med...|
|file:/content/onc...|Sample Type / Med...|
|file:/content/onc...|Sample Type / Med...|
|file:/content/onc...|Sample Type / Med...|
+--------------------+--------------------+



In [None]:
df.select("text").head(1)

[Row(text='Sample Type / Medical Specialty:\nHematology - Oncology\nSample Name:\nDischarge Summary - Mesothelioma - 1\nDescription:\nMesothelioma, pleural effusion, atrial fibrillation, anemia, ascites, esophageal reflux, and history of deep venous thrombosis.\n(Medical Transcription Sample Report)\nPRINCIPAL DIAGNOSIS:\nMesothelioma.\nSECONDARY DIAGNOSES:\nPleural effusion, atrial fibrillation, anemia, ascites, esophageal reflux, and history of deep venous thrombosis.\nPROCEDURES\n1. On August 24, 2007, decortication of the lung with pleural biopsy and transpleural fluoroscopy.\n2. On August 20, 2007, thoracentesis.\n3. On August 31, 2007, Port-A-Cath placement.\nHISTORY AND PHYSICAL:\nThe patient is a 41-year-old Vietnamese female with a nonproductive cough that started last week. She has had right-sided chest pain radiating to her back with fever starting yesterday. She has a history of pericarditis and pericardectomy in May 2006 and developed cough with right-sided chest pain, and

## Repartitioning the DataFrame

The right number of partitions depends on your resources. Increasing it too much does not necessarily make processing faster.

In [None]:
df.rdd.getNumPartitions()

2

In [None]:
# Redistribute the rows over 200 partitions (the frame reported 2 before this
# call). The name `df` is rebound to the repartitioned frame.
df = df.repartition(200)

In [None]:
df.rdd.getNumPartitions()

200

In [8]:
# Stage 1: turn the raw "text" column into the "document" annotation column.
documenter = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: pretrained "sentence_detector_dl" model (language code "xx") that
# splits each document into the "sentences" annotation column.
sentencerDL = (
    SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx")
    .setInputCols(["document"])
    .setOutputCol("sentences")
)

# Chain both stages into one pipeline model.
model = PipelineModel(stages=[documenter, sentencerDL])

sentence_detector_dl download started this may take some time.
Approximate size to download 514.9 KB
[OK!]


In [None]:
# Run both pipeline stages over the notes; the resulting frame carries the
# extra "document" and "sentences" columns.
result = model.transform(df)

In [None]:
result.show()

+--------------------+--------------------+--------------------+--------------------+
|                path|                text|            document|           sentences|
+--------------------+--------------------+--------------------+--------------------+
|file:/content/onc...|Sample Type / Med...|[{document, 0, 35...|[{document, 0, 54...|
|file:/content/onc...|Sample Type / Med...|[{document, 0, 51...|[{document, 0, 54...|
|file:/content/onc...|Sample Type / Med...|[{document, 0, 26...|[{document, 0, 54...|
|file:/content/onc...|Sample Type / Med...|[{document, 0, 56...|[{document, 0, 54...|
|file:/content/onc...|Sample Type / Med...|[{document, 0, 44...|[{document, 0, 54...|
|file:/content/onc...|Sample Type / Med...|[{document, 0, 52...|[{document, 0, 54...|
|file:/content/onc...|Sample Type / Med...|[{document, 0, 23...|[{document, 0, 54...|
|file:/content/onc...|Sample Type / Med...|[{document, 0, 39...|[{document, 0, 54...|
|file:/content/onc...|Sample Type / Med...|[{document,

In [None]:
# Keep only the file path and the sentence strings (the `result` field of each
# sentence annotation) and bring them into a pandas DataFrame for inspection.
pd_res = result.select("path","sentences.result").toPandas()
pd_res

Unnamed: 0,path,result
0,file:/content/oncology_notes/mt_oncology_3.txt,[Sample Type / Medical Specialty:\nHematology ...
1,file:/content/oncology_notes/mt_oncology_0.txt,[Sample Type / Medical Specialty:\nHematology ...
2,file:/content/oncology_notes/mt_oncology_1.txt,[Sample Type / Medical Specialty:\nHematology ...
3,file:/content/oncology_notes/mt_oncology_2.txt,[Sample Type / Medical Specialty:\nHematology ...
4,file:/content/oncology_notes/mt_oncology_4.txt,[Sample Type / Medical Specialty:\nHematology ...
5,file:/content/oncology_notes/mt_oncology_6.txt,[Sample Type / Medical Specialty:\nHematology ...
6,file:/content/oncology_notes/mt_oncology_8.txt,[Sample Type / Medical Specialty:\nHematology ...
7,file:/content/oncology_notes/mt_oncology_9.txt,[Sample Type / Medical Specialty:\nHematology ...
8,file:/content/oncology_notes/mt_oncology_5.txt,[Sample Type / Medical Specialty:\nHematology ...
9,file:/content/oncology_notes/mt_oncology_7.txt,[Sample Type / Medical Specialty:\nHematology ...


In [None]:
import pandas as pd
# Widen pandas' column display so long sentence strings are not cut off.
# NOTE(review): pandas documents None as the "no limit" value for this option;
# confirm that 0 behaves as intended on the installed pandas version.
pd.set_option("display.max_colwidth", 0)

In [None]:
pd_res.loc[0].result

['Sample Type / Medical Specialty:\nHematology - Oncology',
 'Sample Name:\nParathyroid Adenoma Excision',
 'Description:\nExcision of right superior parathyroid adenoma, seen on sestamibi parathyroid scan and an ultrasound.',
 '(Medical Transcription Sample Report)',
 'PREOPERATIVE DIAGNOSIS:\nRight superior parathyroid adenoma.',
 'POSTOPERATIVE DIAGNOSIS:\nRight superior parathyroid adenoma.',
 'PROCEDURE:\nExcision of right superior parathyroid adenoma.',
 'ANESTHESIA:\nLocal with 1% Xylocaine and anesthesia standby with sedation.',
 'CLINICAL HISTORY:\nThis 80-year-old woman has had some mild dementia.',
 'She was begun on Aricept but could not tolerate that because of strange thoughts and hallucinations.',
 'She was found to be hypercalcemic.',
 'Intact PTH was mildly elevated.',
 'A sestamibi parathyroid scan and an ultrasound showed evidence of a right superior parathyroid adenoma.',
 'FINDINGS AND PROCEDURE:\nThe patient was placed on the operating table in the supine position

In [None]:
# `sd_model` is not defined by any earlier cell, so this cell raised NameError
# on a fresh kernel. Wrap the sentence-detection pipeline in a LightPipeline,
# which can annotate a plain Python string without building a DataFrame.
sd_model = LightPipeline(model)

# fullAnnotate returns one result per input text; take the first (only) one and
# keep the text of each annotation in its "sentences" output.
sent_list = [anno.result for anno in sd_model.fullAnnotate(text)[0]["sentences"]]

In [None]:
# Lower-cased copy of every detected sentence, in the original order.
sentence_lower = [sentence.lower() for sentence in sent_list]

In [None]:
# One row per lower-cased sentence; the single column gets the default label 0.
sent_token_df = pd.DataFrame(sentence_lower)

In [None]:
# Give the default column 0 a descriptive name.
sent_token_df = sent_token_df.rename(columns={0:"sentence"})

In [None]:
# Delete every character that is neither a word character nor whitespace,
# i.e. strip punctuation from each sentence. The pattern is a raw string so
# "\w" and "\s" reach the regex engine rather than being parsed as (invalid)
# Python string escapes.
sent_token_df.sentence = sent_token_df.sentence.apply(
    lambda x: re.sub(pattern=r"[^\w\s]", repl="", string=x)
)

In [None]:
def space(sentence):
    """Re-tokenize ``sentence`` with ``word_tokenize`` and join the tokens with single spaces."""
    return " ".join(word_tokenize(sentence))

In [None]:
# Normalise the spacing of every sentence via space().
sent_token_df.sentence = sent_token_df.sentence.apply(space)

In [None]:
# Remove, in place, the rows whose sentence is the empty string.
is_empty = sent_token_df.sentence == ""
sent_token_df.drop(index=sent_token_df[is_empty].index, inplace=True)

In [None]:
# Number of occurrences of each distinct sentence, sorted from most to least
# frequent.
sent_count= sent_token_df.sentence.value_counts().sort_values(ascending=False)

In [None]:
# Two-column frame: the sentence text and how many times it occurs.
# Both column names are set explicitly because the labels value_counts()
# produces differ between pandas releases: 1.x yields "index"/"sentence" after
# reset_index(), 2.x yields "sentence"/"count". The former rename map only
# matched the 1.x labels and mislabelled the text column as "frequency" on 2.x.
sent_count_df = sent_count.rename_axis("sentence").reset_index(name="frequency")

In [None]:
# Total number of sentence occurrences across all distinct sentences.
total_frequency = sent_count_df["frequency"].sum()

In [None]:
# Share of all occurrences taken by each sentence, in percent.
sent_count_df["ratio"] = sent_count_df["frequency"].div(total_frequency).mul(100)

In [None]:
# Running total of the percentage column, in the current row order.
sent_count_df["cumul_ratio"] = sent_count_df["ratio"].cumsum()

In [None]:
def sentence_lenght(sentence):
    """Return the space-joined tokens of ``sentence`` when it has at most ten tokens.

    Sentences with more than ten tokens are replaced by the marker string
    "sentence is bigger than ten word", which callers use to filter them out.
    """
    tokens = word_tokenize(sentence)
    if len(tokens) > 10:
        return "sentence is bigger than ten word"
    return " ".join(tokens)

In [None]:
# Replace every sentence longer than ten tokens with the marker string.
sent_count_df["sentence"] = sent_count_df["sentence"].apply(sentence_lenght)

In [None]:
# Discard, in place, the rows that were marked as too long, then renumber the
# remaining rows from 0.
too_long = sent_count_df["sentence"] == "sentence is bigger than ten word"
sent_count_df.drop(index=sent_count_df[too_long].index, inplace=True)
sent_count_df.reset_index(inplace=True, drop=True)

In [None]:
# Persist the short-sentence frequency table; the row index is not written.
sent_count_df.to_csv("Sentence_Tokenize1.csv", index=False)