### Network-Data

Data Preparation

In [10]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder 
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from time import time

In [11]:
% matplotlib inline
import json

### Get all data

In [12]:
import glob
# glob.glob returns matches in arbitrary (filesystem-dependent) order; sort
# them so that positional picks such as ann_files[2] select the same file on
# every run and on every machine.
ann_files = sorted(glob.glob("../data/*.ann"))
len(ann_files)

200

#### Extract T (term), E (event) and R (relation) records

In [28]:
"""============================================================
This class helps to read data from 'sample.ann' files to create
nodes and edges for network and graph analysis

By: Sigdel D.       Date: 2018-Jan-10
==============================================================="""



class DataReader(object):
    """Read one '.ann' annotation file and build graph nodes and edges from it.

    Every line of the file is split on whitespace.  Only lines whose first
    token starts with ``T`` (term), ``E`` (event) or ``R`` (relation) are
    used; all other lines, including blank ones, are ignored.  The layout
    looks like the brat standoff format, but only the fields read by the
    methods below are relied on.

    Attributes:
        input_file: path of the annotation file.
        Llist: list of token lists, one per non-blank line of the file.
        t_dict: maps a T id to ``[type, text]``.
        e_dict: maps an E id to ``[type, text of the referenced T record]``.
        r_dict: maps an R id to ``[relation, source id, target id]``.
        node_dict: every T and E id mapped to ``[type, text]``.
        Nodes: list of node dictionaries (keys id/type/group/name).
        Edges: list of edge dictionaries (see ``set_rdata``).
        group: known T/E type names; the 1-based position is the group number.
        relations: known relation names; position + 3 is the edge value.
    """

    def __init__(self, input_file):
        """Store the file path and create empty containers; no file I/O here."""
        # Input file should be provided
        self.input_file = input_file
        self.Llist = []
        self.t_dict = {}
        self.e_dict = {}
        self.r_dict = {}
        self.node_dict = {}
        self.Nodes = []
        self.Edges = []

        self.group = ['Coreference', 'Severity', 'Occupation', 'Other_event',
                      'Frequency', 'Gender', 'Biological_structure', 'Volume',
                      'Biological_attribute', 'Age', 'Sign_symptom', 'Date',
                      'History', 'Time', 'Shape', 'Lab_value',
                      'Nonbiological_location', 'Administration', 'Distance',
                      'Other_entity', 'Height', 'Activity', 'Family_history',
                      'Dosage', 'Ethnicity', 'Subject', 'Detailed_description',
                      'Qualitative_concept', 'Texture', 'Area', 'Weight',
                      'Quantitative_concept', 'Therapeutic_procedure',
                      'Disease_disorder', 'Duration', 'Color', 'Medication',
                      'Diagnostic_procedure']

        # NOTE: 'AFTER' and 'OVERLAP' must be separate entries.  A missing
        # comma used to fuse them into 'AFTEROVERLAP', which left both
        # relations without an edge value and shifted every later one by one.
        self.relations = ['BEFORE', 'MODIFY', 'SUB_PROCEDURE', 'IDENTICAL',
                          'AFTER', 'OVERLAP', 'CAUSE', 'CO-OCCURANCE',
                          'DECREASE_TO', 'DECREASE_FROM', 'INCREASE_FROM',
                          'INCREASE_TO']

    def _records(self, prefix):
        """Yield the token lists in ``Llist`` whose first token starts with *prefix*."""
        for item in self.Llist:
            # Empty token lists (blank lines) have no first token to inspect.
            if item and item[0].startswith(prefix):
                yield item

    def sum_chr_items(self, items):
        """Join *items* into one string, each item preceded by a single space.

        The leading space is kept on purpose: existing node names (for
        example ``' pain'``) are produced this way.
        """
        return "".join(" " + item for item in items)

    def line_to_list(self):
        """(Re)load ``input_file`` into ``Llist``, one token list per line.

        Blank lines are skipped, and the list is replaced rather than
        extended so that loading twice does not duplicate the content.
        """
        with open(self.input_file) as f:
            self.Llist = [tokens for tokens in (line.split() for line in f)
                          if tokens]

    def find_group(self, g):
        """Return the 1-based position of *g* in ``group``, or None if unknown."""
        try:
            return self.group.index(g) + 1
        except ValueError:
            return None

    def find_edg_value(self, eg):
        """Return position-in-``relations`` + 3 for *eg*, or None if unknown."""
        try:
            return self.relations.index(eg) + 3
        except ValueError:
            return None

    def set_tdata(self):
        """Register every T record in ``t_dict``/``node_dict`` and append a node.

        Token 0 is the id, token 1 the type, and tokens from index 4 onward
        are joined to form the name (tokens 2 and 3 are not used).
        """
        for item in self._records("T"):
            t_key = item[0]
            t_type = item[1]
            t_val = self.sum_chr_items(item[4:])

            self.t_dict[t_key] = [t_type, t_val]
            self.node_dict[t_key] = [t_type, t_val]

            self.Nodes.append({"id": t_key,
                               "type": t_type,
                               "group": self.find_group(t_type),
                               "name": t_val})

    def set_edata(self):
        """Register every E record in ``e_dict``/``node_dict`` and append a node.

        Token 1 has the form ``<type>:<T id>``; the node name is the text of
        the referenced T record, so ``set_tdata`` must have run first.

        Raises:
            KeyError: if the referenced T id is not in ``t_dict``.
        """
        for item in self._records("E"):
            e_key = item[0]
            type_and_ref = item[1].split(':')
            e_type = type_and_ref[0]
            t_key = type_and_ref[1]
            e_val = self.t_dict[t_key][1]

            self.e_dict[e_key] = [e_type, e_val]
            self.node_dict[e_key] = [e_type, e_val]

            self.Nodes.append({"id": e_key,
                               "type": e_type,
                               "group": self.find_group(e_type),
                               "name": e_val})

    def set_rdata(self):
        """Register every R record in ``r_dict`` and append an edge.

        Token 1 is the relation name; tokens 2 and 3 have the form
        ``<label>:<node id>`` and give the source and target nodes, which
        must already be present in ``node_dict``.

        Raises:
            KeyError: if a source or target id is not in ``node_dict``.
        """
        for item in self._records("R"):
            r_key = item[0]
            rln = item[1]
            fm_id = item[2].split(":")[1]
            to_id = item[3].split(":")[1]
            fm_type, fm_name = self.node_dict[fm_id][:2]
            to_type, to_name = self.node_dict[to_id][:2]

            self.r_dict[r_key] = [rln, fm_id, to_id]

            self.Edges.append({"id": r_key,
                               "name": rln,
                               "value": self.find_edg_value(rln),
                               "source": fm_id,
                               "source_name": fm_name,
                               "source_group": self.find_group(fm_type),
                               "target": to_id,
                               "target_name": to_name,
                               "target_group": self.find_group(to_type)})

    def data_dumper(self, file_name, data):
        """Write *data* as JSON to *file_name*, overwriting any existing file."""
        with open(file_name, 'w') as fp:
            json.dump(data, fp)

    def get_nodes(self, dump=False):
        """Read the file and return the list of T and E nodes.

        Node state is rebuilt from scratch on every call, so repeated calls
        return the same nodes instead of accumulating duplicates.

        Args:
            dump: when true, also write the nodes to 'nodes.json'.
        """
        self.line_to_list()
        self.t_dict = {}
        self.e_dict = {}
        self.node_dict = {}
        self.Nodes = []
        self.set_tdata()
        self.set_edata()
        if dump:
            self.data_dumper(file_name='nodes.json', data=self.Nodes)
        return self.Nodes

    def get_edges(self, dump=False):
        """Return the list of R edges built from the already loaded lines.

        ``get_nodes`` must have been called first; otherwise no lines are
        loaded and the result is empty.  Edge state is rebuilt on every call
        so repeated calls do not accumulate duplicates.

        Args:
            dump: when true, also write the edges to 'edges.json'.
        """
        self.r_dict = {}
        self.Edges = []
        self.set_rdata()
        if dump:
            self.data_dumper(file_name='edges.json', data=self.Edges)
        return self.Edges

    def get_data(self, dump=False):
        """Return ``{'nodes': [...], 'links': [...]}`` for the input file.

        Args:
            dump: when true, also write the combined dictionary to
                'data.json' (the separate node and edge files are not
                written).
        """
        all_data = {'nodes': self.get_nodes(), 'links': self.get_edges()}

        if dump:
            self.data_dumper(file_name='data.json', data=all_data)

        return all_data

In [29]:
# Create a reader for one annotation file (the entry at index 2 of ann_files).
# The constructor only stores the path; the file is read by the get_* methods.
DR = DataReader(ann_files[2])

In [30]:
# Parse the file into {'nodes': [...], 'links': [...]}; because dump=True the
# same dictionary is also written to 'data.json' in the working directory.
DR.get_data(dump=True)

{'links': [{'id': 'R1',
   'name': 'MODIFY',
   'source': 'T10',
   'source_group': 7,
   'source_name': ' abdominal',
   'target': 'E5',
   'target_group': 11,
   'target_name': ' pain',
   'value': 4},
  {'id': 'R2',
   'name': 'MODIFY',
   'source': 'T12',
   'source_group': 7,
   'source_name': ' stomal',
   'target': 'E7',
   'target_group': 11,
   'target_name': ' diarrhoea',
   'value': 4},
  {'id': 'R3',
   'name': 'MODIFY',
   'source': 'E2',
   'source_group': 35,
   'source_name': ' 4 year',
   'target': 'E1',
   'target_group': 34,
   'target_name': " Crohn's disease",
   'value': 4},
  {'id': 'R4',
   'name': 'MODIFY',
   'source': 'E8',
   'source_group': 35,
   'source_name': ' two day',
   'target': 'E3',
   'target_group': 11,
   'target_name': ' malaise',
   'value': 4},
  {'id': 'R5',
   'name': 'MODIFY',
   'source': 'E8',
   'source_group': 35,
   'source_name': ' two day',
   'target': 'E4',
   'target_group': 11,
   'target_name': ' fever',
   'value': 4},
  {'id