In [2]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Aug  8 01:49:00 2019

@author: Yuguang Wang & Yanan Wang
"""

import pandas as pd
import numpy as np
import scipy.io as sio
import os
from scipy.sparse import csr_matrix

In [None]:
def _select_feature_columns(factor_flag, csv_path):
    """Return the positional column indices of the per-cell features to extract.

    Parameters
    ----------
    factor_flag : str
        'full', 'selected_old' or 'selected_new'.
    csv_path : str
        Path of the per-participant CSV. For 'selected_new' its third
        "/"-separated component names the dataset folder ('stomach_csv_1' or
        'stomach_csv_2'), which decides the column layout.

    Raises
    ------
    ValueError
        If the flag / dataset folder combination is not recognised. Failing
        here avoids silently reusing the column set of a previous participant.
    """
    if factor_flag == 'full':
        # from "DAPI (DAPI) Nucleus Intensity" to end
        return np.arange(22, 60)
    if factor_flag == 'selected_old':
        return np.array([11, 12, 18, 35, 36, 37, 38, 39], dtype=int)
    if factor_flag == 'selected_new':
        parts = csv_path.split("/")
        dataset = parts[2] if len(parts) > 2 else None
        if dataset == 'stomach_csv_1':
            return np.concatenate((np.arange(20, 45), np.arange(50, 60)))
        if dataset == 'stomach_csv_2':
            return np.concatenate((np.arange(32, 57), np.arange(62, 72)))
        raise ValueError(
            "cannot choose feature columns for %r: expected the third path "
            "component to be 'stomach_csv_1' or 'stomach_csv_2'" % (csv_path,))
    raise ValueError("unknown factor_flag %r" % (factor_flag,))


def _build_graph(xc, yc, critical):
    """Build one weighted proximity graph from cell-centre coordinates.

    Two distinct cells are connected when their Euclidean distance is strictly
    below `critical`; the edge weight is `critical / distance`.

    Parameters
    ----------
    xc, yc : 1-D float arrays of equal length
        Centre coordinates of the cells of this graph.
    critical : float
        Distance threshold, in the same unit as the coordinates.

    Returns
    -------
    adj : scipy.sparse.csr_matrix, shape (n, n)
        Symmetric weighted adjacency matrix.
    coor : ndarray, shape (n, 2)
        One (x, y) row per node.
    edge_index : ndarray of int, shape (2E, 2)
        Local node indices; every undirected edge appears as (k, j) directly
        followed by (j, k), pairs ordered by k then j with k < j.
    edge_attr : ndarray, shape (2E, 1)
        Weight of each directed edge, aligned with `edge_index`.
    edge_coor : ndarray, shape (2E, 4)
        (x_src, y_src, x_dst, y_dst) of each directed edge.
    """
    num = len(xc)
    dx = xc[:, None] - xc[None, :]
    dy = yc[:, None] - yc[None, :]
    dist = np.sqrt(dx**2 + dy**2)
    # Strict upper triangle: every unordered pair exactly once and never a
    # node with itself (which would divide by a zero distance).
    src, dst = np.nonzero(np.triu(dist < critical, k=1))
    # NOTE: two distinct cells with identical centres still yield an infinite
    # weight here, exactly as a direct critical/dist evaluation would.
    weight = critical / dist[src, dst]

    A = np.zeros([num, num])
    A[src, dst] = weight
    A[dst, src] = weight

    num_directed = 2 * len(src)
    # column_stack pairs x with y per node; reshaping a (2, n) array to (n, 2)
    # would instead interleave consecutive x values.
    coor = np.column_stack((xc, yc))
    edge_index = np.column_stack((src, dst, dst, src)).reshape(num_directed, 2).astype(int)
    edge_attr = np.repeat(weight, 2).reshape(num_directed, 1)
    edge_coor = np.column_stack(
        (xc[src], yc[src], xc[dst], yc[dst],
         xc[dst], yc[dst], xc[src], yc[src])
    ).reshape(num_directed, 4).astype(np.float64)
    return csr_matrix(A), coor, edge_index, edge_attr, edge_coor


def create_samples(file_name, factor_flag='selected_new'):
    """Convert per-participant cell tables into fixed-size graphs and save them.

    The spreadsheet `file_name` is read with `pd.read_excel`; each row must
    provide 'file_path' (CSV of one participant's cells), 'prognosis_label'
    and '组织编码' (tissue code). Each CSV must contain the bounding-box columns
    'XMin', 'XMax', 'YMin', 'YMax' plus the feature columns selected through
    `factor_flag`. The cells of a participant are split, in file order, into
    consecutive chunks of 100 cells (the final chunk may be smaller); every
    chunk becomes one graph whose edges link cells closer than the critical
    distance.

    Results are written as .mat files into
    "data/<factor_flag>_<file_name without extension>/", named
    "graph<G>_node100_weighted_<kind>.mat" where G is the number of graphs and
    <kind> is one of adj_csr, feature, label, pid, pid_name, nodecoor,
    edge_index, edge_attr, edge_coor.

    Parameters
    ----------
    file_name : str
        Path of the Excel sheet listing the participants.
    factor_flag : str, optional
        Feature selection scheme: 'selected_new' (default), 'selected_old'
        or 'full'.

    Raises
    ------
    ValueError
        If the feature columns cannot be determined for a participant.
    """
    subdir = '_{}/'.format(file_name.split(".")[0])
    # critical distance 20 \mu m, pix width 0.5 \mu m
    critical = 20/0.5
    num_node = 100
    adj = []
    feature = []
    label = []
    pid = []
    pid_name = []
    coor = []
    edge_index = []
    edge_coor = []
    edge_attr = []
    sv_dir = "data/" + factor_flag + subdir
    os.makedirs(sv_dir, exist_ok=True)

    print('****** Processing data for %s ******' % file_name)
    df_diagnosis = pd.read_excel(file_name)

    for kid, row in df_diagnosis.iterrows():
        ld_csv = row['file_path']
        label1 = row['prognosis_label']
        tissue_id = row['组织编码']
        df_csv = pd.read_csv(ld_csv)

        num_cell = len(df_csv)
        if num_cell == 0:
            # Nothing to build; the column-wise max below is undefined on an
            # empty table.
            print('skipping %s: no cells' % ld_csv)
            continue

        # extract the features and scale every column by its maximum
        lv = _select_feature_columns(factor_flag, ld_csv)
        feature_all = df_csv.take(lv, axis=1).values
        feature_all = feature_all/(feature_all.max(axis=0)+0.00000000000001)

        # centre coordinates of the cells
        xc = (np.array(df_csv['XMin'])+np.array(df_csv['XMax']))/2
        yc = (np.array(df_csv['YMin'])+np.array(df_csv['YMax']))/2

        # One graph per chunk of num_node cells. The last chunk goes through
        # the very same code path, so it gets its own cells, edges and features.
        num_graph = (num_cell + num_node - 1) // num_node
        for g in range(num_graph):
            start = g*num_node
            stop = min(start + num_node, num_cell)
            A, coor1, edge_index_1, edge_attr_1, edge_coor_1 = _build_graph(
                xc[start:stop], yc[start:stop], critical)
            adj.append(A)
            pid_name.append(tissue_id)
            pid.append(kid)
            label.append(label1)
            coor.append(coor1)
            edge_coor.append(edge_coor_1)
            edge_index.append(edge_index_1)
            edge_attr.append(edge_attr_1)
            feature.append(feature_all[start:stop, ])

    #%% save data
    prefix = sv_dir + 'graph' + str(len(label)) + '_node' + str(num_node) + '_weighted'
    outputs = (
        ('_adj_csr', 'adj', adj),                  # adjacency matrices
        ('_feature', 'feature', feature),          # node features
        ('_label', 'label', label),                # prognosis label per graph
        ('_pid', 'pid', pid),                      # spreadsheet row index per graph
        ('_pid_name', 'pid_name', pid_name),       # tissue code per graph
        ('_nodecoor', 'coor', coor),               # coordinates of nodes
        ('_edge_index', 'edge_index', edge_index), # index of the two ends of edges
        ('_edge_attr', 'edge_attr', edge_attr),    # weight on each edge
        ('_edge_coor', 'edge_coor', edge_coor),    # coordinates of the two ends of edges
    )
    for suffix, key, data in outputs:
        sio.savemat(prefix + suffix + '.mat', mdict={key: data})
        
        
        
        
        
# Alternative input (disabled): the plain test split.
#create_samples("data_file/test_data.xlsx")

# Generate the graph files for the survival test split.
create_samples(file_name="data_file/test_data_surv.xlsx")

# Disabled: regenerate the five cross-validation folds instead.
# for i in [0,1,2,3,4]:
#     create_samples("data_file/val_data_fold_{}.xlsx".format(i))
#     create_samples("data_file/train_data_fold_{}.xlsx".format(i))

    
    
    
    

****** Processing data for data_file/test_data_surv.xlsx ******


