In [1]:
import os

In [2]:
path_original = "..//data//external//cadec//original"
dir_list_original = os.listdir(path_original)
dir_list_original = sorted(dir_list_original, key = lambda x: (x.split('.')[0], int(x.split('.')[1])))
len(dir_list_original)

1250

In [3]:
path_text = "..//data//external//cadec//text"
dir_list_text = os.listdir(path_text)
dir_list_text = sorted(dir_list_text, key = lambda x: (x.split('.')[0], int(x.split('.')[1])))
len(dir_list_text)

1250

In [4]:
import pandas as pd

def process_original_to_df(path):
    cadec_df = pd.read_csv(path, sep='\t', names=['tag_number', 'entity_type_offsets', 'text'])
    cadec_df["entity_type"] = cadec_df.entity_type_offsets.str.split(n=1).str.get(0)        # Get only entity type from entity_type_offsets
    cadec_df["offsets"] = cadec_df.entity_type_offsets.str.split(n=1).str.get(1)            # Get only offsets from entity_type_offsets
    cadec_df = cadec_df.drop("entity_type_offsets", axis=1)                                 # Rename column
    cadec_df = cadec_df[ -cadec_df.tag_number.str.startswith('#')]                          # Remove annotation notes
    cadec_df = cadec_df.reset_index(drop=True)                                              # Reset index

    return cadec_df

In [5]:

FILE = 'ARTHROTEC.6'
PATH_ORIGINAL = f'..\\data\\external\\cadec\\original\\{FILE}.ann'
PATH_TEXT = f'..\\data\\external\\cadec\\text\\{FILE}.txt'

cadec_df = pd.read_csv(PATH_ORIGINAL, sep='\t', names=['tag_number', 'entity_type_offsets', 'text'])
cadec_df["entity_type"] = cadec_df.entity_type_offsets.str.split(n=1).str.get(0)        # Get only entity type from entity_type_offsets
cadec_df["offsets"] = cadec_df.entity_type_offsets.str.split(n=1).str.get(1)            # Get only offsets from entity_type_offsets
cadec_df = cadec_df.drop("entity_type_offsets", axis=1)                                 # Rename column
cadec_df = cadec_df[ -cadec_df.tag_number.str.startswith('#')]                          # Remove annotation notes
cadec_df = cadec_df.reset_index(drop=True)                                              # Reset index
cadec_df.head(5)

Unnamed: 0,tag_number,text,entity_type,offsets
0,T1,stomach pain,ADR,14 26
1,T2,slight nausea,ADR,32 45
2,T3,reflux,ADR,52 58
3,T6,abdominal cramps and pain,ADR,152 161;166 181
4,T4,abdominal gas,ADR,152 161;162 165


In [6]:
def create_list_of_entities(df):
    entities = []
    offsets = str(df.iloc[3]['offsets']).split(';')

    for id in range(df.shape[0]):
        offsets = str(df.iloc[id]['offsets']).split(';')
        
        for o in offsets:
            start = int(o.split(' ')[0])
            end = int(o.split(' ')[1])
            entities.append(
                {
                    "type": {
                        "tag": df.iloc[id]['entity_type'], 
                        "tag number": df.iloc[id]['tag_number']
                        },
                    "start": start,
                    "end": end,
                }
            )

    return sorted(entities, key=lambda d: (d['start'], d['end']))

In [7]:
entities = []
offsets = str(cadec_df.iloc[3]['offsets']).split(';')

for id in range(cadec_df.shape[0]):
    offsets = str(cadec_df.iloc[id]['offsets']).split(';')
    
    for o in offsets:
        start = int(o.split(' ')[0])
        end = int(o.split(' ')[1])
        entities.append(
            {
                "type": {
                    "tag": cadec_df.iloc[id]['entity_type'], 
                    "tag number": cadec_df.iloc[id]['tag_number']
                    },
                "start": start,
                "end": end,
           }
        )


entities = sorted(entities, key=lambda d: (d['start'], d['end']))

In [16]:
def create_datataset_of_word_tags(path, entities):
    
    words = []

    with open(path) as f:

        word_start = 0
        char_pos = 0
        word = ""
        labels = []
        
        while True:
            char = f.read(1)
        
            if not char:
                break

            if not (char.isalpha() or char.isdigit() or char == '\''):
                if word:

                    if not (word[0].isalpha() or word[0].isdigit()):
                        word = word[1:]

                    for entity in entities:
                        if (
                            word_start >= entity["start"]
                            and char_pos <= entity["end"]
                        ):
                            
                            labels.append(entity["type"])
                    if word_start != char_pos:
                        words.append(
                            {
                                "word": word,
                                "start": word_start,
                                "end": char_pos,
                                "label": labels
                            }
                        )

                    word = "" 
                    labels = []

                word_start = char_pos + 1

            word += char
            char_pos += 1

        f.close()

    return words

In [9]:
words = []

with open(PATH_TEXT) as f:

    word_start = 0
    char_pos = 0
    word = ""
    labels = []
    
    while True:
        char = f.read(1)
      
        if not char:
            break

        if not (char.isalpha() or char.isdigit() or char == '\''):
            if word:

                if not (word[0].isalpha() or word[0].isdigit()):
                    word = word[1:]

                for entity in entities:
                    if (
                        word_start >= entity["start"]
                        and char_pos <= entity["end"]
                    ):
                        
                        labels.append(entity["type"])

                if word_start != char_pos:
                    words.append(
                        {
                            "word": word,
                            "start": word_start,
                            "end": char_pos,
                            "label": labels
                        }
                    )

                word = "" 
                labels = []

            word_start = char_pos + 1

        word += char
        char_pos += 1

    f.close()

In [10]:
for word in words[30:40]:
    print(word)

{'word': 'cramps', 'start': 166, 'end': 172, 'label': [{'tag': 'ADR', 'tag number': 'T6'}]}
{'word': 'and', 'start': 173, 'end': 176, 'label': [{'tag': 'ADR', 'tag number': 'T6'}]}
{'word': 'pain', 'start': 177, 'end': 181, 'label': [{'tag': 'ADR', 'tag number': 'T6'}]}
{'word': 'would', 'start': 182, 'end': 187, 'label': []}
{'word': 'be', 'start': 188, 'end': 190, 'label': []}
{'word': 'with', 'start': 191, 'end': 195, 'label': []}
{'word': 'me', 'start': 196, 'end': 198, 'label': []}
{'word': 'all', 'start': 199, 'end': 202, 'label': []}
{'word': 'day', 'start': 203, 'end': 206, 'label': []}
{'word': 'I', 'start': 208, 'end': 209, 'label': []}


In [13]:
for file_original, file_text in zip(dir_list_original, dir_list_text):
    path_original = f'..\\data\\external\\cadec\\original\\{file_original}'
    path_text = f'..\\data\\external\\cadec\\text\\{file_text}'

    try:
        df_original = process_original_to_df(path_original)
        print(df_original)
        entities = create_list_of_entities(df_original)
        create_datataset_of_word_tags(path_text, entities)
    except IndexError as e:
        print(f'{e}')
    

  tag_number                   text entity_type  offsets
0         T1             bit drowsy         ADR     9 19
1         T2  little blurred vision         ADR    29 50
2         T3              Arthrotec        Drug   93 102
3         T5              arthritis     Disease  179 188
4         T6                  agony     Symptom  260 265
5         T4       gastric problems         ADR    62 78
6         T7                  pains     Symptom  412 417
7         T8       feel a bit weird         ADR  437 453
  tag_number                                          text entity_type  \
0         T1                                          pain     Symptom   
1         T2                                     heartburn         ADR   
2         T3                                        nausea         ADR   
3         T4                              voracious hunger         ADR   
4         T5  sharp unbearable cramping pains in lower gut         ADR   
5         T6                           pain

In [17]:
import json

def save_to_jsonl(data, file_path):
    with open(file_path, 'w+') as f:
        for entry in data:
            f.write(json.dumps(entry) + '\n')

In [18]:
for file_original, file_text in zip(dir_list_original, dir_list_text):
    path_original = f'..\\data\\external\\cadec\\original\\{file_original}'
    path_text = f'..\\data\\external\\cadec\\text\\{file_text}'

    try:
        df_original = process_original_to_df(path_original)
        print(file_original)
        entities = create_list_of_entities(df_original)
        word_tags = create_datataset_of_word_tags(path_text, entities)

        output_json_file_path = f'..\\data\\processed\\word tags format\\{file_text}.jsonl'

        save_to_jsonl(word_tags, output_json_file_path)
    except IndexError as e:
        print(e)

ARTHROTEC.1.ann
ARTHROTEC.2.ann
ARTHROTEC.3.ann
ARTHROTEC.4.ann
ARTHROTEC.5.ann
single positional indexer is out-of-bounds
ARTHROTEC.6.ann
ARTHROTEC.7.ann
ARTHROTEC.8.ann
single positional indexer is out-of-bounds
ARTHROTEC.9.ann
ARTHROTEC.10.ann
single positional indexer is out-of-bounds
ARTHROTEC.11.ann
single positional indexer is out-of-bounds
ARTHROTEC.12.ann
single positional indexer is out-of-bounds
ARTHROTEC.13.ann
ARTHROTEC.14.ann
single positional indexer is out-of-bounds
ARTHROTEC.15.ann
ARTHROTEC.16.ann
ARTHROTEC.17.ann
ARTHROTEC.18.ann
ARTHROTEC.19.ann
ARTHROTEC.20.ann
ARTHROTEC.21.ann
ARTHROTEC.22.ann
ARTHROTEC.23.ann
ARTHROTEC.24.ann
ARTHROTEC.25.ann
ARTHROTEC.26.ann
ARTHROTEC.27.ann
ARTHROTEC.28.ann
ARTHROTEC.29.ann
ARTHROTEC.30.ann
ARTHROTEC.31.ann
ARTHROTEC.32.ann
single positional indexer is out-of-bounds
ARTHROTEC.33.ann
single positional indexer is out-of-bounds
ARTHROTEC.34.ann
single positional indexer is out-of-bounds
ARTHROTEC.35.ann
ARTHROTEC.36.ann
ARTHROTEC.