### This file is parsing i2b2 2010 training data and annotating it with the CoNLL BIO scheme

In [1]:
import os
from nltk import pos_tag, RegexpParser
import pandas as pd
import numpy as np

In [2]:
!nltk.version

/bin/sh: 1: nltk.version: not found


In [3]:
a_ids = []
e_ids = []

for filename in os.listdir("./data/annotations"):
    if filename[0] != ".":  # ignore hidden files
        a_ids.append(int(filename))
for filename in os.listdir("./data/entries"):
    if filename[0] != ".": 
        e_ids.append(int(filename))
    
a_ids = tuple(sorted(a_ids)) 
e_ids = tuple(sorted(e_ids))

intersection = list(set(a_ids) & set(e_ids))
if len(intersection) == len(a_ids):
    print("Success: all anotations have a corresponding entry.", len(intersection))

Success: all anotations have a corresponding entry. 73


## Build corpora

In [4]:
# build annotation and entry corpora

a_corpus = []
e_corpus = []
entries_txt = []
entries_list = []

# only annotations and corresponding files
for file in a_ids:
    path = "./data/annotations/" + str(file)
    with open(path) as f:
        content = f.read().splitlines()
        a_corpus.append(content)

    path = "./data/entries/" + str(file)
    with open(path) as f:
        #content = f.readlines()
        file_read = f.read()
        entries_list.append((file, file_read))
        entries_txt.append(file_read)
        
        content = file_read.splitlines()
        e_corpus.append(content)
    

In [5]:
e_corpus[0][:30]

['Admission Date :',
 '2018-10-25',
 'Discharge Date :',
 '2018-10-31',
 'Date of Birth :',
 '1951-06-15',
 'Sex :',
 'M',
 'Service : ',
 'CARDIOTHORACIC',
 'Allergies :',
 'Patient recorded as having No Known Allergies to Drugs',
 'Attending : Michael D. Christensen , M.D.',
 'Chief Complaint :',
 'Shortness of Breath',
 'Major Surgical or Invasive Procedure :',
 'Coronary Artery Bypass Graft x3 ( Left internal mammary -> left anterior descending , saphaneous vein graft -> obtuse marginal , saphaneous vein graft -> posterior descending artery ) 2018-10-25',
 'History of Present Illness :',
 '67 y/o male with worsening shortness of breath. Had abnormal ETT and referred for cath .',
 'Cath revealed severe 3 vessel disease .',
 'Then referred for surgical intervention .',
 'Past Medical History :',
 "Carpal tunnel syndrome , Hypertension , Hyperlipidemia , Arthritis , h/o Bell's Palsy , HOH , s/p Tonsillectomy",
 'Social History :',
 'Denies ETOH , rare Tobacco .',
 'Electrician .',
 'F

# POS_tagger with SparkNLP

In [6]:
import json
import os
import pandas as pd
from pyspark.ml import Pipeline
from pyspark.sql import functions as F
from sparknlp.base import *
from sparknlp.annotator import *
import sparknlp
from pyspark.sql.window import Window
from pyspark.sql import SparkSession


In [7]:
def start(gpu=False):
    builder = SparkSession.builder \
        .appName("Spark NLP - CoNLL parser") \
        .master("local[*]") \
        .config("spark.driver.memory", "10G") \
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")\
        .config("spark.kryoserializer.buffer.max", "1000M")
    if gpu:
        builder.config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp-gpu_2.11:2.4.3")
    else:
        builder.config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.3")

    return builder.getOrCreate()

In [8]:
import os
os.environ['PYSPARK_PYTHON'] = '/home/francesco/anaconda3/bin/python'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/home/francesco/anaconda3/bin/python'

In [9]:
spark = start()

In [10]:
# to connect to the spark cluster
# spark = SparkSession.builder \
# .master('spark://path/to/spark/cluster)') \
# .appName('Spark NLP').getOrCreate()

In [11]:
print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

Spark NLP version:  2.5.3
Apache Spark version:  2.4.5


In [12]:
def get_ann_pipeline():
    
    document_assembler = DocumentAssembler()\
        .setInputCol("text")\
        .setOutputCol('document')

    sentence = SentenceDetector()\
        .setInputCols(['document'])\
        .setOutputCol('sentence')\
        .setCustomBounds(['\n'])\
        .setUseCustomBoundsOnly(True)

    tokenizer = Tokenizer() \
        .setInputCols(["sentence"]) \
        .setOutputCol("token")
    
    normalizer = Normalizer() \
        .setInputCols(["token"]) \
        .setOutputCol("normalized")\
        .setLowercase(True)\
        .setCleanupPatterns(["[.]"])

    pos = PerceptronModel.pretrained() \
        .setInputCols(["sentence", "normalized"]) \
        .setOutputCol("pos")
    
    pipeline = Pipeline(
        stages = [
            document_assembler,
            sentence,
            tokenizer,
            normalizer,
            pos
        ]
    )

    #empty_data = spark.createDataFrame([[""]]).toDF("text")

    #pipelineFit = pipeline.fit(empty_data)
    
    #lp_pipeline = LightPipeline(pipelineFit)

    print ("Spark NLP Pipeline is created")

    return pipeline

In [13]:
df = pd.DataFrame(entries_list, columns=['id', 'text'])

In [14]:
df.head()

Unnamed: 0,id,text
0,13,Admission Date :\n2018-10-25\nDischarge Date :...
1,14,Admission Date :\n2011-03-10\nDischarge Date :...
2,15,Admission Date :\nDischarge Date :\n2014-01-24...
3,16,Admission Date :\n2015-10-28\nDischarge Date :...
4,17,Admission Date:\n2011-02-08\nDischarge Date :\...


In [15]:
data = spark.createDataFrame(df).toDF('id', 'text')

In [16]:
conll_pipeline = get_ann_pipeline()

pos_anc download started this may take some time.
Approximate size to download 4.3 MB
[OK!]
Spark NLP Pipeline is created


In [17]:
pipeline = conll_pipeline.fit(data)

In [18]:
parsed = pipeline.transform(data)

In [19]:
parsed_pos_tag = parsed.select('id', F.explode(F.arrays_zip('normalized.result', 'pos.result', 'normalized.begin', 'normalized.end')).alias("cols")) \
                        .select(F.expr("id"),
                                F.expr("cols['0']").alias("normalized"),
                                F.expr("cols['1']").alias("pos_tag"),
                                F.expr("cols['2']").alias("normalized_begin"),
                                F.expr("cols['3']").alias("normalized_end"))

In [20]:
token_end_list = parsed_pos_tag.select('id', 'normalized_end').withColumn('row', F.lit(0)).toPandas().values.tolist()

In [21]:
parsed_sentence = parsed.select('id', F.explode(F.arrays_zip('sentence.result', 'sentence.begin','sentence.end')).alias('cols'))\
                                        .select('id',
                                                F.expr("cols['0']").alias("sentence"),
                                                F.expr("cols['1']").alias("begin"),
                                                F.expr("cols['2']").alias("end"))

In [22]:
windowSpec = Window.partitionBy(parsed_sentence['id']).orderBy(parsed_sentence['id'])

In [23]:
parsed_sentence_row = parsed_sentence.withColumn('row', F.row_number().over(windowSpec)).orderBy('id', 'row')

In [24]:
parsed_sentence_row.select('sentence', 'begin', 'end' ).show(20, truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+---+
|sentence                                                                                                                                                                                          |begin|end|
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+---+
|Admission Date :                                                                                                                                                                                  |0    |15 |
|2018-10-25                                                                                                                                                                 

In [25]:
parsed_pos_df = parsed_pos_tag.toPandas()

In [26]:
sentence_end_list = parsed_sentence_row.select('id', 'end', 'row').toPandas().values.tolist()

In [27]:
id_counter_token = 0

for i in range(len(sentence_end_list)):
    
    while(id_counter_token < len(token_end_list) and token_end_list[id_counter_token][0] == sentence_end_list[i][0]):
        if(token_end_list[id_counter_token][1] <= sentence_end_list[i][1]):
            token_end_list[id_counter_token][2] = sentence_end_list[i][2]
            id_counter_token += 1
            
        else:
            break

In [28]:
row_df = pd.DataFrame(token_end_list, columns= ['id', 'normalized_end', 'row'])

In [29]:
parsed_pos_tag_merged = parsed_pos_df.merge(row_df, on = ['id', 'normalized_end'], how='left')

In [30]:
parsed_pos_tag_merged['offset'] = parsed_pos_tag_merged.groupby(['id', 'row']).cumcount()

In [31]:
parsed_pos_tag_merged[450:500]
#parsed_pos_tag_final[-10000:-9950]
#parsed_pos_tag_final[-10800:-10750]
#parsed_pos_tag_merged[70:120]

Unnamed: 0,id,normalized,pos_tag,normalized_begin,normalized_end,row,offset
450,13,2018-10-25,JJ,2526,2535,62,0
451,13,11:15,NN,2537,2541,62,1
452,13,am,VBP,2543,2544,62,2
453,13,blood,NN,2546,2550,62,3
454,13,pt,NN,2552,2553,62,4
455,13,-,:,2555,2555,62,5
456,13,133,CD,2557,2559,62,6
457,13,*,NN,2562,2562,62,7
458,13,ptt,NN,2564,2566,62,8
459,13,-,:,2568,2568,62,9


# NER_tagger on i2b2 dataset

In [32]:
#docs = list(nlp.pipe(entries_txt, disable=["tagger", "parser", "ner"]))

## Set up dataframe

In [33]:
entries_cols = ["id", "row", "offset", "token"]
entries_df = pd.DataFrame(columns=entries_cols)
entries_df.head()

Unnamed: 0,id,row,offset,token


In [34]:
annotations_cols = ["id", "NER_tag", "row", "offset", "length"]
annotations_df = pd.DataFrame(columns=annotations_cols)
annotations_df.head()

Unnamed: 0,id,NER_tag,row,offset,length


## Build annotations data frame

In [35]:
annotations_df = pd.DataFrame(columns=annotations_cols)  # reset df
tmp_list = []

for i, document in enumerate(a_corpus):
    
    for row in document:
        row = row.split("||")
        # print(row, "\n")
        
        label = row[1].split("=")[1][1:-1] # label = treatment
        
        tag = row[0].split("=")
        if ":" in tag[1]:
            tag_row_a = tag[1].split(" ")[-2:][0].split(":")[0] # line_a i.e. 80
            tag_row_b = tag[1].split(" ")[-2:][1].split(":")[0] # line_b i.e. 80

            # some annotations have non-standard formatting (losing 64 instances)
            try:
                tag_offset_a = int(tag[1].split(" ")[-2:][0].split(":")[1]) # word_a i.e. 4
                tag_offset_b = int(tag[1].split(" ")[-2:][1].split(":")[1]) # word_b i.e. 7
                length = tag_offset_b - tag_offset_a + 1

                # 1 row = 1 token with a tag
                first = True
                BIO_tag = "B-"
                if length > 1 and tag_row_a == tag_row_b:
                    for offset in range(tag_offset_a, tag_offset_b+1):
                        if first: 
                            tag_label = BIO_tag + label
                            first = False
                        else:
                            tag_label = tag_label.replace("B-", "I-")
                        tmp_list.append([a_ids[i], tag_label, tag_row_a, offset, 1])
                # TODO: tags over line breaks
                else:
                    tmp_list.append([a_ids[i], BIO_tag + label, tag_row_a, tag_offset_a, length])
            except:
                pass             

annotations_df = pd.DataFrame(tmp_list, columns=annotations_cols)
annotations_df.reset_index(inplace=True)
                        

In [36]:
annotations_df = annotations_df.drop(columns=["index", "length"])
annotations_df.shape

(20848, 4)

In [37]:
annotations_df.head(10)

Unnamed: 0,id,NER_tag,row,offset
0,13,B-test,65,25
1,13,B-treatment,115,4
2,13,I-treatment,115,5
3,13,I-treatment,115,6
4,13,I-treatment,115,7
5,13,B-problem,115,0
6,13,I-problem,115,1
7,13,I-problem,115,2
8,13,B-treatment,84,18
9,13,B-treatment,69,2


## Joing entries and annotations

In [38]:
# ensure correct dtypes
annotations_df[['id', 'row', 'offset']] = annotations_df[['id', 'row', 'offset']].apply(pd.to_numeric)
annotations_df['NER_tag'] = annotations_df["NER_tag"].astype(str)

In [39]:
parsed_pos_tag_merged[['id', 'row', 'offset']] = parsed_pos_tag_merged[['id', 'row', 'offset']].apply(pd.to_numeric)
parsed_pos_tag_merged["normalized"] = parsed_pos_tag_merged["normalized"].astype(str)

In [40]:
result_df = pd.merge(parsed_pos_tag_merged, annotations_df, how="left", on=['id', 'row', 'offset'])

In [41]:
# replace NaNs with "O"
print("columns with missing data:\n", result_df.isna().any())
result_df = result_df.fillna("O")

columns with missing data:
 id                  False
normalized          False
pos_tag             False
normalized_begin    False
normalized_end      False
row                 False
offset              False
NER_tag              True
dtype: bool


In [42]:
print("columns with missing data:\n", result_df.isna().any())

columns with missing data:
 id                  False
normalized          False
pos_tag             False
normalized_begin    False
normalized_end      False
row                 False
offset              False
NER_tag             False
dtype: bool


In [43]:
result_df.shape

(84726, 8)

In [44]:
indices = result_df.loc[result_df['offset'] == 0].index.tolist() # get your blank rows.
rows_ = dict.fromkeys(result_df.columns.tolist(),'') 

In [45]:
result_df_new = pd.DataFrame(np.insert(result_df.values, [x for x in indices],
                   values=list(rows_.values()), 
                   axis=0), columns=rows_.keys())

In [46]:
result_df_final = result_df_new [['normalized', 'pos_tag', 'pos_tag', 'NER_tag']]
result_df_final.columns = ['normalized', 'pos_tag', 'chunk_tag', 'NER_tag']

In [47]:
result_df_final[:50]

Unnamed: 0,normalized,pos_tag,chunk_tag,NER_tag
0,,,,
1,admission,NN,NN,O
2,date,NN,NN,O
3,:,:,:,O
4,,,,
5,2018-10-25,NN,NN,O
6,,,,
7,discharge,NN,NN,O
8,date,NN,NN,O
9,:,:,:,O


### insert -DOCSTART- row

In [48]:
first_row = []
first_row.insert(0, {'normalized': '-DOCSTART-', 'pos_tag': '-X-', 'chunk_tag': 'O', 'NER_tag' : 'O'})

In [49]:
result = pd.concat([pd.DataFrame(first_row), result_df_final], ignore_index=True)

In [54]:
result[20:50]

Unnamed: 0,normalized,pos_tag,chunk_tag,NER_tag
20,,,,
21,sex,NN,NN,O
22,:,:,:,O
23,,,,
24,m,NN,NN,O
25,,,,
26,service,NN,NN,O
27,:,:,:,O
28,,,,
29,cardiothoracic,NN,NN,O


In [51]:
ner_counter = [1 for i in result_df["NER_tag"] if "B-" in i]
print(len(ner_counter), "named entities")

10295 named entities


In [None]:
result_df_final.to_csv('./conll.train', sep= ' ', index=False, header=None)  

In [53]:
spark.stop()