In [2]:
import pandas as pd
import numpy as np
from data.relation import read_csv, read_json, write_list_to_file, write_json_lists_to_file
import re
import json

In [16]:
filename = 'data/water/dev.tsv'
datadf = pd.read_csv(filename, sep='\t', names=['e1', 'relationship', 'e2', 'text'])

def process_data(row):
    e1 = row['e1']
    e2 = row['e2']
    text = row['text']
    entities = sorted([(e1, "e1"), (e2, "e2")], key=lambda x: len(x[0]), reverse=True)
    for entity, tag in entities:
        text = re.sub(re.escape(entity), f"<{tag}>{entity}</{tag}>", text, count=1)  # Count=1 ensures only the first occurrence is replaced
    return text

datadf['text_formatted'] = datadf.apply(process_data, axis=1)

datadf['id'] = range(len(datadf))

In [17]:
json_str = datadf.to_dict(orient='records')
# Write the JSON string to a .json file
f = 'data/water/dev_initial.json'
with open(f, 'w') as f:
    json.dump(json_str, f, ensure_ascii=False, indent=4)

In [56]:
json_str[0]

'{'

In [19]:
counter = 0
filename = 'data/water/dev_initial.json'
json_list = read_json(filename=filename)
for id, row in enumerate(json_list):
    id = row['id']
    # print(id)
    e1 = re.search(r'<e1>(.*?)</e1>', row['text_formatted']).group(1)
    e2 = re.search(r'<e2>(.*?)</e2>', row['text_formatted']).group(1)
    if not e1: print(f"no e1: {id}")
    if not e2: print(f'no e2: {id}')
    
    if '<e2' in e1 or '<e1' in e1:
        print(f'First problem: {id}')
        
    if '<e1' in e2 or '<e2'in e2:
        print(f"second problem: {id}")
        

### Data preparation for training

In [5]:
data_type = ['dev', 'test', 'train']

relation_set = set()
for dt in data_type:
    initial_filename = f"data/water/{dt}_initial.json"
    json_list = read_json(filename=initial_filename)
    for js in json_list:
        relation_set.add(js['relationship'])
        
relation2id = {r:i for i,r in enumerate(list(relation_set))}
# write_json_lists_to_file('data/water/dev', relation2id)
print(relation2id)
for dt in data_type:
    initial_filename = f"data/water/{dt}_initial.json"
    json_list = read_json(filename=initial_filename)
    
    text_list = []
    label_list = []
    
    for js in json_list:
        text_list.append(js['text_formatted'])
        label_id = relation2id[js['relationship']]
        assert label_id != None
        assert label_id != ''
        label_list.append(label_id)
        
    write_json_lists_to_file(f'data/water/{dt}/relation2id.json', relation2id)
    write_list_to_file(f'data/water/{dt}/{dt}_sentence.json', text_list)
    write_list_to_file(f'data/water/{dt}/{dt}_label_id.json', label_list)
        
    
    

{'interacts with': 0, 'associated with': 1, 'diagnoses': 2, 'predisposes': 3, 'augments': 4, 'location of': 5, 'manifestation of': 6, 'is a': 7, 'prevents': 8, 'occurs': 9, 'part of': 10, 'risk to': 11, 'to': 12, 'disrupts': 13, 'produces': 14, 'coexists with': 15, 'threaten': 16, 'type of': 17, 'impacts': 18, 'process of': 19, 'affects': 20, 'treats': 21, 'cause': 22}
