In [13]:
import os
import numpy as np
import pandas as pd
import scipy.sparse as sp
import torch
import matplotlib.pyplot as plt
from torch_geometric.data import Data
from torch_geometric.utils.convert import to_networkx
import networkx as nx
from sklearn.preprocessing import MinMaxScaler

# Root folder holding the NK input CSVs. Set the NK_DATA_DIR environment
# variable to point somewhere else; the default matches the original location.
data_dir = os.environ.get('NK_DATA_DIR', '/data/NK/')

In [14]:
# Load the PPI table (used as an adjacency matrix below) and the signaling-layer
# table, the latter indexed by its gene_id column.
data_adj = pd.read_csv(os.path.join(data_dir, 'ppi_of_NK.csv'), sep=",")
signaling = pd.read_csv(
    os.path.join(data_dir, 'signalingLayer_of_NK.csv'),
    sep=",",
    index_col='gene_id',
)
data_adj.head(10)

Unnamed: 0,1,368,8754,5290,3172,3164,204,14,847,183,...,375611,90139,9331,283358,8708,9227,124961,2529,117156,5251
1,0,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
368,1,0,0,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8754,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5290,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3172,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3164,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
204,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
847,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
183,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Convert the adjacency matrix into a sparse matrix of the COO form, which
# exposes the (row, col) positions of its non-zero entries.
ed = sp.coo_matrix(data_adj)
indices = np.vstack((ed.row, ed.col))
index = torch.tensor(indices, dtype=torch.long)
values = torch.tensor(ed.data, dtype=torch.float)
# Build a torch sparse tensor, coalesce it, and keep only the coordinate
# matrix (2 x number of stored entries).
edge_index = torch.sparse_coo_tensor(index, values, ed.shape).coalesce().indices()
edge_index

tensor([[   0,    0,    0,  ..., 8508, 8509, 8510],
        [   1,    2,    3,  ..., 3769, 8474, 6051]])

In [16]:
# Per-cell metadata table (donor, age, sex, cell type, ...).
information_of_cells = pd.read_csv(
    os.path.join(data_dir, 'information_of_NK.csv'),
    sep=",",
)
information_of_cells.head(10)

Unnamed: 0,donor_id,age,sex,cell_type,orig.ident
meta100,689_690,59,male,natural killer cell,onek1k
meta101,689_690,59,male,natural killer cell,onek1k
meta102,689_690,59,male,natural killer cell,onek1k
meta103,689_690,59,male,natural killer cell,onek1k
meta104,689_690,59,male,natural killer cell,onek1k
meta105,689_690,59,male,natural killer cell,onek1k
meta106,689_690,59,male,natural killer cell,onek1k
meta107,689_690,59,male,natural killer cell,onek1k
meta108,689_690,59,male,natural killer cell,onek1k
meta109,689_690,59,male,natural killer cell,onek1k


In [17]:
def read_single_csv(input_path, chunksize=3000):
    """Read a comma-separated file in row batches and return one DataFrame.

    Parameters
    ----------
    input_path : str or path-like
        Location of the CSV file.
    chunksize : int, default 3000
        Number of rows parsed per batch.

    Returns
    -------
    pandas.DataFrame
        All batches concatenated in file order.
    """
    # The chunked reader is used as a context manager so the underlying file
    # handle is released even when parsing fails part-way through the file.
    with pd.read_csv(input_path, sep=",", chunksize=chunksize) as reader:
        res_chunk = list(reader)
    res_df = pd.concat(res_chunk)
    return res_df

In [18]:
# Expression table for the NK cells, read in batches.
# NOTE(review): the variable name mentions naive CD4 but the file loaded is the
# NK one; the name is left unchanged because later cells reference it.
Log_normalized_matrix_of_naive_cd4 = read_single_csv(
    os.path.join(data_dir, 'expression_of_NK.csv')
)
Log_normalized_matrix_of_naive_cd4.head(5)

Unnamed: 0,1,368,8754,5290,3172,3164,204,14,847,183,...,375611,90139,9331,283358,8708,9227,124961,2529,117156,5251
meta100,0.0,0.0,0.0,0.0,0.0,0.0,0.096977,0.204885,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
meta101,0.190978,0.0,0.0,0.206233,0.0,0.101229,0.188599,0.297983,0.101229,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
meta102,0.10591,0.0,0.0,0.0,0.0,0.087403,0.096477,0.206901,0.088311,0.0,...,0.0,0.0,0.103732,0.0,0.0,0.0,0.0,0.0,0.0,0.0
meta103,0.190978,0.0,0.0,0.0,0.0,0.060198,0.112436,0.302726,0.088311,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
meta104,0.090321,0.0,0.0,0.0,0.0,0.0,0.133303,0.195215,0.247647,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Min-max scale each column to the [0, 1] range, then wrap the resulting array
# back into a DataFrame carrying the original row and column labels.
transfer = MinMaxScaler(feature_range=(0, 1))
data_i = transfer.fit(Log_normalized_matrix_of_naive_cd4).transform(
    Log_normalized_matrix_of_naive_cd4
)
Log_normalized_matrix_of_naive_cd4 = pd.DataFrame(
    data_i,
    index=Log_normalized_matrix_of_naive_cd4.index,
    columns=Log_normalized_matrix_of_naive_cd4.columns,
)
Log_normalized_matrix_of_naive_cd4.head(5)

Unnamed: 0,1,368,8754,5290,3172,3164,204,14,847,183,...,375611,90139,9331,283358,8708,9227,124961,2529,117156,5251
meta100,0.0,0.0,0.0,0.0,0.0,0.0,0.140142,0.233978,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
meta101,0.294112,0.0,0.0,0.356121,0.0,0.262787,0.272547,0.340296,0.138441,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
meta102,0.163104,0.0,0.0,0.0,0.0,0.226895,0.139419,0.236281,0.120774,0.0,...,0.0,0.0,0.346153,0.0,0.0,0.0,0.0,0.0,0.0,0.0
meta103,0.294112,0.0,0.0,0.0,0.0,0.156272,0.162482,0.345712,0.120774,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
meta104,0.139097,0.0,0.0,0.0,0.0,0.0,0.192637,0.222935,0.338683,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Number of rows (cells) in the scaled expression table.
Log_normalized_matrix_of_naive_cd4.shape[0]

11452

In [21]:
# Attach each cell's age as the last column. The column is selected by name
# rather than by position so a reordered metadata file cannot silently swap in
# a different field.
cells = pd.concat(
    [Log_normalized_matrix_of_naive_cd4, information_of_cells['age']],
    axis=1,
)
# concat aligns on the row index; labels that do not match would add rows
# and/or leave NaN ages, which would later be cast to meaningless integer labels.
assert len(cells) == len(Log_normalized_matrix_of_naive_cd4), \
    "metadata index does not match the expression index"
assert cells.iloc[:, -1].notna().all(), "some cells have no age after alignment"
cells.head(5)

Unnamed: 0,1,368,8754,5290,3172,3164,204,14,847,183,...,90139,9331,283358,8708,9227,124961,2529,117156,5251,age
meta100,0.0,0.0,0.0,0.0,0.0,0.0,0.140142,0.233978,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,59
meta101,0.294112,0.0,0.0,0.356121,0.0,0.262787,0.272547,0.340296,0.138441,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,59
meta102,0.163104,0.0,0.0,0.0,0.0,0.226895,0.139419,0.236281,0.120774,0.0,...,0.0,0.346153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,59
meta103,0.294112,0.0,0.0,0.0,0.0,0.156272,0.162482,0.345712,0.120774,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,59
meta104,0.139097,0.0,0.0,0.0,0.0,0.0,0.192637,0.222935,0.338683,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,59


In [22]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column
# of `cells`, including the appended age column.
cells.describe()

Unnamed: 0,1,368,8754,5290,3172,3164,204,14,847,183,...,90139,9331,283358,8708,9227,124961,2529,117156,5251,age
count,11452.0,11452.0,11452.0,11452.0,11452.0,11452.0,11452.0,11452.0,11452.0,11452.0,...,11452.0,11452.0,11452.0,11452.0,11452.0,11452.0,11452.0,11452.0,11452.0,11452.0
mean,0.100811,0.001269,0.030494,0.098143,0.001499,0.060871,0.180552,0.181779,0.164394,0.000742,...,0.008377,0.046287,0.011506,0.000165,0.00393,0.014115,0.049619,0.000606,0.00191,66.457475
std,0.131556,0.031028,0.106064,0.138048,0.033351,0.133637,0.165444,0.149131,0.154798,0.025336,...,0.062114,0.131219,0.059529,0.012521,0.051987,0.077618,0.102299,0.022086,0.037966,15.866244
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.086913,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.161024,0.150697,0.146433,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,69.0
75%,0.175879,0.0,0.0,0.187482,0.0,0.0,0.28487,0.273721,0.26408,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,78.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,97.0


In [23]:
data_list = []
age_list = []  # not populated in this cell; kept so any later reference still resolves
# type(Data) = <class 'torch_geometric.data.data.Data'>
# Pull the values out of the frame once instead of rebuilding a DataFrame for
# every row; the last column is the label, everything before it is the features.
expression_values = cells.iloc[:, :-1].to_numpy()
age_values = cells.iloc[:, -1].to_numpy()
# Iterate over each row (one graph per cell). The loop names deliberately avoid
# `index`, which already holds the edge-coordinate tensor built earlier.
for cell_expression, cell_age in zip(expression_values, age_values):
    # Node features as a column vector: one row per feature column, one feature per node.
    x = torch.tensor(cell_expression, dtype=torch.float).unsqueeze(1)
    # Graph-level label as a 0-dim int64 tensor.
    y = torch.as_tensor(cell_age).type(torch.int64)
    # Convert to pyg data format; every graph shares the same edge_index tensor.
    data_pyg = Data(x=x, y=y, edge_index=edge_index)
    data_list.append(data_pyg)


In [24]:
# Serialize the whole list of graph objects into a single file. The path is
# relative, so the file is written to the kernel's current working directory.
torch.save(data_list,'NK_pyg.pt') #Save the processed data file