### Network-Data

Data Preparation

In [17]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder 
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from time import time

In [18]:
% matplotlib inline
import json

### Get all data

#### Get T, E, and R data:

In [19]:
"""============================================================
This class helps to read data from 'sample.ann' files to create
nodes and edges for network and graph analysis

By: Sigdel D.       Date: 2018-Jan-10
==============================================================="""



class DataReader(object):
    """Read a '.ann' annotation file and turn it into graph nodes and edges.

    The file is read line by line and each line is split on whitespace.
    Lines are dispatched on the first character of their first token:

    * ``T`` - text-bound term: ``T<id> <type> <start> <end> <text...>``
    * ``E`` - event: ``E<id> <type>:<T-id> ...``; the event replaces its
      trigger T-term in the node list
    * ``R`` - binary relation: ``R<id> <relation> <role>:<id> <role>:<id>``
    * ``*`` - symmetric relation over a list of ids:
      ``* <relation> <id> <id> ...``

    Any other line (including blank lines) is ignored.

    Attributes:
        input_file: path of the '.ann' file to read.
        Llist: list of whitespace-split lines of the file.
        t_dict: T-id -> [type, text].
        e_dict: E-id -> [type, text of the trigger T-term].
        r_dict: R-id -> [relation, source id, target id].
        node_dict: T-id or E-id -> [type, text]; used to resolve edge ends.
        Nodes: list of node dicts (keys: id, type, group, name).
        Edges: list of edge dicts (keys: id, name, value, source,
            source_name, source_group, target, target_name, target_group).
        group: known entity types; a node's group is its 1-based position.
        relations: known relation names; an edge's value is its 0-based
            position plus 3.
    """

    def __init__(self, input_file):
        # Input file should be provided
        self.input_file = input_file
        self.Llist = []
        self.t_dict = {}
        self.e_dict = {}
        self.r_dict = {}
        self.node_dict = {}
        self.Nodes = []
        self.Edges = []

        self.group = ['Coreference', 'Severity', 'Occupation', 'Other_event',
                      'Frequency', 'Gender', 'Biological_structure', 'Volume',
                      'Biological_attribute', 'Age', 'Sign_symptom', 'Date',
                      'History', 'Time', 'Shape', 'Lab_value', 'Nonbiological_location',
                      'Administration', 'Distance', 'Other_entity', 'Height',
                      'Activity', 'Family_history', 'Dosage', 'Ethnicity',
                      'Subject', 'Detailed_description', 'Qualitative_concept',
                      'Texture', 'Area', 'Weight', 'Quantitative_concept',
                      'Therapeutic_procedure', 'Disease_disorder', 'Duration',
                      'Color', 'Medication', 'Diagnostic_procedure']

        self.relations = ['BEFORE', 'MODIFY', 'SUB_PROCEDURE', 'IDENTICAL', 'AFTER',
                          'OVERLAP', 'CAUSE', 'CO-OCCURANCE', 'DECREASE_TO',
                          'DECREASE_FROM', 'INCREASE_FROM', 'INCREASE_TO']

    def sum_chr_items(self, items):
        """Concatenate ``items`` into one string, each preceded by a space.

        The result therefore starts with a space when ``items`` is non-empty
        and is ``""`` when it is empty.
        """
        return "".join(" " + item for item in items)

    def line_to_list(self):
        """(Re)load ``input_file`` into ``Llist`` as whitespace-split lines.

        The list is replaced rather than extended, so calling this more than
        once does not duplicate the file's content.
        """
        with open(self.input_file) as f:
            self.Llist = [line.split() for line in f]

    def find_group(self, g):
        """Return the 1-based position of ``g`` in ``group``, or None if absent."""
        try:
            return self.group.index(g) + 1
        except ValueError:
            return None

    def find_edg_value(self, eg):
        """Return the 0-based position of ``eg`` in ``relations`` plus 3.

        Returns None when ``eg`` is not a known relation.
        """
        try:
            return self.relations.index(eg) + 3
        except ValueError:
            return None

    def _lines_starting_with(self, prefix):
        """Yield the parsed lines whose first token starts with ``prefix``.

        Empty lines (``[]`` after splitting) are skipped so that a blank line
        in the file cannot raise IndexError.
        """
        for item in self.Llist:
            if item and item[0].startswith(prefix):
                yield item

    def _text_tokens(self, item):
        """Return the text tokens of a T line, skipping all offset tokens.

        A contiguous span is ``<start> <end>``, so the text starts at token 4.
        A discontinuous span is written ``<start> <end>;<start> <end>``; every
        offset token containing ';' announces one more offset token, which
        must not end up in the entity text.
        """
        idx = 3
        while idx < len(item) and ";" in item[idx]:
            idx += 1
        return item[idx + 1:]

    def _make_edge(self, edge_id, rln, fm_id, to_id):
        """Build one edge dict between two known nodes.

        Raises KeyError when either id is missing from ``node_dict``.
        """
        fm_type, fm_name = self.node_dict[fm_id][:2]
        to_type, to_name = self.node_dict[to_id][:2]
        return {"id": edge_id,
                "name": rln,
                "value": self.find_edg_value(rln),
                "source": fm_id,
                "source_name": fm_name,
                "source_group": self.find_group(fm_type),
                "target": to_id,
                "target_name": to_name,
                "target_group": self.find_group(to_type)}

    def set_tdata(self):
        """Register every T line in ``t_dict``/``node_dict`` and add its node."""
        for item in self._lines_starting_with("T"):
            t_key = item[0]
            t_type = item[1]
            t_val = self.sum_chr_items(self._text_tokens(item))

            self.t_dict[t_key] = [t_type, t_val]
            self.node_dict[t_key] = [t_type, t_val]
            self.Nodes.append({"id": t_key,
                               "type": t_type,
                               "group": self.find_group(t_type),
                               "name": t_val})

    def set_edata(self):
        """Register every E line and replace its trigger T node by an E node.

        Must run after ``set_tdata``: the event text is looked up in
        ``t_dict`` (KeyError if the trigger is unknown). The resulting node
        list holds the remaining T nodes followed by the E nodes, both in
        file order.
        """
        trigger_ids = set()
        event_nodes = []
        for item in self._lines_starting_with("E"):
            e_key = item[0]
            e_type, t_key = item[1].split(':')[:2]
            e_val = self.t_dict[t_key][1]

            self.e_dict[e_key] = [e_type, e_val]
            self.node_dict[e_key] = [e_type, e_val]

            trigger_ids.add(t_key)
            event_nodes.append({"id": e_key,
                                "type": e_type,
                                "group": self.find_group(e_type),
                                "name": e_val})

        # Build a new list instead of removing while iterating, which would
        # skip the element following each removed one.
        kept = [node for node in self.Nodes if node["id"] not in trigger_ids]
        self.Nodes = kept + event_nodes

    def set_rdata(self):
        """Register every R line in ``r_dict`` and add the matching edge."""
        for item in self._lines_starting_with("R"):
            r_key = item[0]
            rln = item[1]
            fm_id = item[2].split(":")[1]
            to_id = item[3].split(":")[1]

            edge = self._make_edge(r_key, rln, fm_id, to_id)
            self.r_dict[r_key] = [rln, fm_id, to_id]
            self.Edges.append(edge)

    def set_sdata(self):
        """Add one "ST" edge per unordered pair of ids listed on each '*' line."""
        for item in self._lines_starting_with("*"):
            rln = item[1]
            ids = item[2:]
            for i, fm_id in enumerate(ids):
                for to_id in ids[i + 1:]:
                    self.Edges.append(self._make_edge("ST", rln, fm_id, to_id))

    def data_dumper(self, file_name, data):
        """Write ``data`` as JSON to ``file_name`` (overwriting it)."""
        with open(file_name, 'w') as fp:
            json.dump(data, fp)

    def get_nodes(self, dump=False):
        """Parse the file and return the list of node dicts.

        All node-related state is rebuilt from scratch, so the method can be
        called repeatedly without producing duplicates. With ``dump=True``
        the list is also written to 'nodes.json'.
        """
        self.t_dict = {}
        self.e_dict = {}
        self.node_dict = {}
        self.Nodes = []

        self.line_to_list()
        self.set_tdata()
        self.set_edata()
        if dump:
            self.data_dumper(file_name='nodes.json', data=self.Nodes)
        return self.Nodes

    def get_edges(self, dump=False):
        """Return the list of edge dicts built from the R and '*' lines.

        Relies on the lines and nodes loaded by ``get_nodes``; if that has not
        run yet there is nothing to scan and the result is empty. The edge
        state is rebuilt on every call. With ``dump=True`` the list is also
        written to 'edges.json'.
        """
        self.r_dict = {}
        self.Edges = []

        self.set_rdata()
        self.set_sdata()
        if dump:
            self.data_dumper(file_name='edges.json', data=self.Edges)
        return self.Edges

    def get_data(self, fname, dump=False):
        """Return ``{'nodes': [...], 'links': [...]}`` for the input file.

        With ``dump=True`` the dictionary is also written as JSON to ``fname``.
        """
        all_data = {}
        nodes = self.get_nodes()
        edges = self.get_edges()

        all_data.update({'nodes': nodes, 'links': edges})

        if dump:
            self.data_dumper(file_name=fname, data=all_data)

        return all_data

In [20]:
# Parse the annotation file of one article and dump the whole graph
# (nodes + links) to "<PMID>.json".
PMID = "18561524"
DR = DataReader("{}.ann".format(PMID))
DR.get_data("{}.json".format(PMID), dump=True)

{'links': [{'id': 'R2',
   'name': 'MODIFY',
   'source': 'T8',
   'source_group': 16,
   'source_name': ' 40',
   'target': 'E4',
   'target_group': 38,
   'target_name': ' body mass index',
   'value': 4},
  {'id': 'R3',
   'name': 'MODIFY',
   'source': 'T32',
   'source_group': 16,
   'source_name': ' 7.45',
   'target': 'E22',
   'target_group': 38,
   'target_name': ' pH',
   'value': 4},
  {'id': 'R4',
   'name': 'MODIFY',
   'source': 'T36',
   'source_group': 16,
   'source_name': ' 3.55 kPa',
   'target': 'E23',
   'target_group': 38,
   'target_name': ' pCO2',
   'value': 4},
  {'id': 'R5',
   'name': 'MODIFY',
   'source': 'T37',
   'source_group': 16,
   'source_name': ' 7.76 kPa',
   'target': 'E24',
   'target_group': 38,
   'target_name': ' paO2',
   'value': 4},
  {'id': 'R6',
   'name': 'MODIFY',
   'source': 'T38',
   'source_group': 16,
   'source_name': ' −5.1 mmol/l',
   'target': 'E25',
   'target_group': 38,
   'target_name': ' BE',
   'value': 4},
  {'id': 'R7'

In [21]:
import pandas as pd

In [22]:
# Tabulate the node dicts (one row per node); dump=True also writes the
# node list to 'nodes.json'.
Nodes = pd.DataFrame(DR.get_nodes(dump=True))

In [23]:
Nodes

Unnamed: 0,group,id,name,type
0,10,T1,18-year-old,Age
1,6,T2,male,Gender
2,16,T8,40,Lab_value
3,13,T9,overweight,History
4,24,T13,900 mg daily,Dosage
5,24,T17,54 mg daily,Dosage
6,17,T21,local hospital,Nonbiological_location
7,2,T22,severe,Severity
8,16,T29,120/80 mmHg,Lab_value
9,16,T32,7.45,Lab_value


In [24]:
# Tabulate the edge dicts (one row per edge); dump=True also writes the
# edge list to 'edges.json'.
Edges = pd.DataFrame(DR.get_edges(dump =True))

In [25]:
Edges

Unnamed: 0,id,name,source,source_group,source_name,target,target_group,target_name,value
0,R2,MODIFY,T8,16,40,E4,38,body mass index,4
1,R3,MODIFY,T32,16,7.45,E22,38,pH,4
2,R4,MODIFY,T36,16,3.55 kPa,E23,38,pCO2,4
3,R5,MODIFY,T37,16,7.76 kPa,E24,38,paO2,4
4,R6,MODIFY,T38,16,−5.1 mmol/l,E25,38,BE,4
5,R7,SUB_PROCEDURE,E22,38,pH,E21,38,arterial blood gas,5
6,R8,SUB_PROCEDURE,E23,38,pCO2,E21,38,arterial blood gas,5
7,R9,SUB_PROCEDURE,E24,38,paO2,E21,38,arterial blood gas,5
8,R10,SUB_PROCEDURE,E25,38,BE,E21,38,arterial blood gas,5
9,R11,MODIFY,T40,28,not elevated,E26,38,C-reactive protein,4
