In [25]:
import os
import sys
import numpy as np
import scipy.sparse as sp;
import torch

#Timing functions for standard stats output
# from proc.utils.timing import tic, toc;
# from proc.utils.printlog import printlog, start_log, stop_log;

def randomwalk_propagate(vector, source_indices, target_indices, weights, c = 0.01, max_cycles = 100, normalize_input = True, normalize_output = True, normalize_connections = True, eps = 1.0e-6):
    """
    Random Walk based propagation algorithm. 

    Parameters
    ----------
    vector : 1D float array or convertable
        Starting values for nodes.
    
    source_indices : 1D int array or convertable
        Indices of source nodes for edges.
    
    target_indices : 1D int array or convertable
        Indices of target nodes for edges. Must be of the same size as source_indices.
    
    weights : 1D float array or convertable
        Weight for each edge. Must match the sizes of source_indices and target_indices
    
    c : float (0.0-1.0), default - 0.01
        Restart rate constant for random walk. 0.0 means no returns to the start. 1.0 means no propagation. The smaller c - the further signal propagates.
    
    max_cycles : int, default - 100
        Limit maximum number of cycles
    
    normalize_input : bool
        Normalize input vector to a sum of 1.0. Default: True
        
    normalize_output : bool
        Normalize output vector to a sum of 1.0. Default: True
        
    normalize_connections : bool
        Normalize weights of the outgoing connections to a sum of 1.0 to reflect transition probability. Default: True
        
    eps : float
        Tolerance for the convergence of the algorithm, default: 1.0e-6.



    Returns
    -------
    tuple of 1D numpy float64 array containing propagated vector values,
    number of cycles passed and last eps between cycles.

    """
    
    #Assure correct type and convert if needed
    p_start = np.array(vector, dtype = np.float64);
    source_indices = np.array(source_indices, dtype = np.uint32);
    target_indices = np.array(target_indices, dtype = np.uint32);
    weights = np.array(weights, dtype = np.float64);
    
    #Check that array sizes match
    assert(len(source_indices) == len(target_indices))
    assert(len(source_indices) == len(weights))
    
    #Check indexing is within range
    assert(np.max(source_indices) < len(vector));
    assert(np.max(target_indices) < len(vector));

    sources = {};
    for i in range(len(weights)):
        target = sources.setdefault(source_indices[i], {'indices' : [], 'weights' : []});
        sindices = target['indices'];
        sweights = target['weights'];
        sindices.append(target_indices[i]);
        sweights.append(weights[i]);
        
    for source in sources.keys():
        sources[source]['indices'] = np.array(sources[source]['indices']);
        sweights = np.array(sources[source]['weights']);
    
    if normalize_connections:
        for source in sources.keys():
            sweights = sources[source]['weights'];
            sources[source]['weights'] = sweights / np.sum(sweights);
        
    if normalize_input:
        p_start = p_start / np.sum(p_start);

    p_prev = np.zeros(p_start.shape, dtype = np.float64);
    p_current = np.copy(p_start);
    
    eval_eps = np.max(np.abs(np.subtract(p_prev, p_current)));
    
    n_cycle = 0;
    
    while ((eval_eps > eps) and (n_cycle < max_cycles)):
        p_prev[:] = p_current[:];
        p_current[:] = 0.0;
        
        for source in sources.keys():
            target = sources[source];
            p_current[target['indices']] += p_prev[source] * target['weights'];
        
        p_current = p_current * (1.0 - c) + c * p_start;
        
        eval_eps = np.max(np.abs(np.subtract(p_prev, p_current)));
        
        n_cycle += 1;
        
    # if n_cycle == max_cycles:
        # printlog('Warning! Random walk propagation did not converge!');

        
    if normalize_output:
        p_current = p_current / np.sum(p_current);
        
    return p_current, n_cycle, eval_eps;
    


def randomwalk_sparse_matrix(vector, source_indices, target_indices, weights, c = 0.01, max_cycles = 100, normalize_input = True, normalize_output = True, normalize_connections = True, eps = 1.0e-6):
    """
    Random Walk based propagation algorithm. 

    Parameters
    ----------
    vector : 2D or 1D float array or convertable: (N_samples, N_nodes) or (N_nodes)
        Starting values for nodes.
    
    source_indices : 1D int array or convertable
        Indices of source nodes for edges.
    
    target_indices : 1D int array or convertable
        Indices of target nodes for edges. Must be of the same size as source_indices.
    
    weights : 1D float array or convertable
        Weight for each edge. Must match the sizes of source_indices and target_indices
    
    c : float (0.0-1.0), default - 0.01
        Restart rate constant for random walk. 0.0 means no returns to the start. 1.0 means no propagation. The smaller c - the further signal propagates.
    
    max_cycles : int, default - 100
        Limit maximum number of cycles
    
    normalize_input : bool
        Normalize input vector to a sum of 1.0. Default: True
        
    normalize_output : bool
        Normalize output vector to a sum of 1.0. Default: True
        
    normalize_connections : bool
        Normalize weights of the outgoing connections to a sum of 1.0 to reflect transition probability. Default: True
        
    eps : float 
        Tolerance for the convergence of the algorithm, default: 1.0e-6.



    Returns
    -------
    tuple of 1D numpy float64 array containing propagated vector(s) values,
    number of cycles passed and last eps between cycles. eps is either a float or
    1D array of floats depending upon the input vector

    """
    
    #Assure correct type and convert if needed
    p_start = np.array(vector, dtype = np.float64);
    
    if len(p_start.shape) != 1 and len(p_start.shape) != 2:
        raise ValueError('Input vector should be 1D (N_nodes) or 2D (N_samples, N_nodes) array!');
    
    if len(p_start.shape) == 1:
        p_start = p_start.reshape((1, -1));
        
    source_indices = np.array(source_indices, dtype = np.uint32);
    target_indices = np.array(target_indices, dtype = np.uint32);
    weights = np.array(weights, dtype = np.float64);
    
    #Check that array sizes match
    assert(len(source_indices) == len(target_indices))
    assert(len(source_indices) == len(weights))
    
    #Check indexing is within range
    assert(np.max(source_indices) < p_start.shape[1]);
    assert(np.max(target_indices) < p_start.shape[1]);

    #Normalize the input vector to a sum of 1.0    
    if normalize_input:
        p_start = np.divide(p_start, np.sum(p_start, axis = 1).reshape(-1, 1));

    #p_prev will hold vector values from a previous cycle
    p_prev = np.zeros(p_start.shape, dtype = np.float64);
    
    #p_current will hold vector values from a current cycle    
    p_current = np.copy(p_start);
    
    #Calculate discrepancy eps between vectors from two cycles
    eval_epses = np.max(np.abs(np.subtract(p_prev, p_current)), axis = 1);
    eval_eps = np.max(eval_epses);
    
    n_cycle = 0;
    
    #Adjucency sparse matrix is generated from input vectors as coo_matrix for fast creation
    A = sp.coo_matrix((weights, (source_indices, target_indices)), shape = (p_start.shape[1], p_start.shape[1]), dtype = np.float64);
    
    #Convert adjucency matrix to compressed sparse column format for fast operations. It is a bit faster than compressed sparse row one in our tests, but can be interchanged.
    A = sp.csc_matrix(A);
    
    #Normalize matrix outgoing weights, so that they sum to 1.0 to reflect probability of transition.
    if normalize_connections:
        outgoing_weights_sum = A.sum(axis = 1);
        #Correct for zero sums where there are no outgoing connections for the node
        outgoing_weights_sum[outgoing_weights_sum == 0.0] = 1.0;
        #Normalize the matrix via dot product with a diagonal matrix of scaling coefficients
        outgoing_weights_sum = 1.0 / outgoing_weights_sum;
        A = sp.diags([outgoing_weights_sum.flatten()], offsets = [0], shape = A.shape).dot(A);
    
    #Normally the cycle would use new_vector = dot(current_vector, A) * (1.0 - c) + c * starting_vector 
    #operation to calculate transition probabilities,
    #but since numpy does not work well with sparse matrices, it has to be a work-around:
    #dot(vector, A) = A.T.dot(vector.T).T
    #So we will pre-transpose the matrix
    A = A.transpose();
    
    #Continue cycling unless the error is less than tolerance or the number of cycles is exceeded
    while ((eval_eps > eps) and (n_cycle < max_cycles)):
        #copy current vector into previous one without memory re-allocation
        p_prev[:] = p_current[:];
        #calculate new current vector
        p_current = np.add(A.dot(p_prev.T).T * (1.0 - c), c * p_start);        
        #Re-calculate error
        eval_epses = np.max(np.abs(np.subtract(p_prev, p_current)), axis = 1);
        eval_eps = np.max(eval_epses);
        #advance cycle counter
        n_cycle += 1;

    # if n_cycle == max_cycles:
        # printlog('Warning! Random walk propagation did not converge!');
    
    #Normalize final vector to a sum of 1.0
    if normalize_output:
        p_current = np.divide(p_current, np.sum(p_current, axis = 1).reshape(-1, 1));
    
    if p_current.shape[0] == 1:
        return p_current.flatten(), n_cycle, eval_eps;
    else:
        return p_current, n_cycle, eval_epses;

In [26]:
import pandas as pd
df1 = pd.read_csv(r'df.csv')

drug = pd.read_csv('matrix_drug_targets_high_conf.csv')

drug = drug.iloc[: , 1:]
drug = drug.T



In [27]:

import pandas as pd
df = pd.read_csv(r'gene_connections.csv')
df_sort=df.sort_values(["STRING"],ascending=False)
gene_with_high_score=df_sort.loc[df_sort['STRING']>=900]
import numpy as np
import pandas as pd
gene1=np.array(gene_with_high_score["Gene1"])
gene2=np.array(gene_with_high_score["Gene2"])
weights = np.ones((547620,), dtype=int)



#Match food molecules with the location of target genes.
import pandas as pd

df1 = pd.read_csv(r'compounds_full.csv')
df1['prim'] = df1['Primary_ID'].apply(lambda x:x[:3]) 
df1 = df1[df1['prim']=='FDB'] 
df2 = pd.read_csv(r'AllSTITCH_multi.csv')
df_1 = df1[df1['InChIKey'].isin(df2['InChIKey'])] 
df_2 = df2[df2['InChIKey'].isin(df1['InChIKey'])] 
df = pd.merge(df_1,df_2,on='InChIKey').drop(['prim'],axis=1).set_index('Index') 
#Select out food molecules with the location of targets. These food molecules all have high score (>=700).
import numpy as np
import pandas as pd

food_molecules = df
def to_vector(food_molecules):
    series = pd.Series([0 for i in range(20256)])
    for i in food_molecules:
        try:
            series[int(i)]  = 1
        except:
            pass
    return np.array(series.values)

def split_vector(food_molecules):
    food_molecules_with_high_score = []
    for i in str(food_molecules).split('|'):
        try:
            if int(i.split(':')[1])>=700:
                food_molecules_with_high_score.append(i.split(':')[0])
        except:
                food_molecules_with_high_score.append(i.split(':')[0])
    return food_molecules_with_high_score

food_molecules['Genes_index'] = food_molecules['Genes'].apply(split_vector)

food_molecules['Genes_index'] = food_molecules['Genes_index'].apply(to_vector)
food_molecules['Genes_index']

loc=food_molecules['Genes_index']
loc.index=range(len(loc))
food_loc = pd.DataFrame([list(i) for i in loc]).fillna(int(0)).astype(float)
food_loc

foodmol_name=df["Primary_ID"]
foodmol_name.index=range(len(foodmol_name))
highscore_foodloc = pd.concat([foodmol_name,food_loc], axis=1,ignore_index=True)
highscore_foodloc=highscore_foodloc.transpose()
new_header = highscore_foodloc. iloc[0] #grab the first row for the header
highscore_foodloc = highscore_foodloc[1:] #take the data less the header row
highscore_foodloc. columns = new_header #set the header row as the df header
highscore_foodloc.index=range(len(highscore_foodloc))


foodloc_listscreen = highscore_foodloc.apply(lambda x:np.sum(x))
foodloc_listscreen = foodloc_listscreen[foodloc_listscreen>0]
foodmol_screen = highscore_foodloc[foodloc_listscreen.index].transpose()
foodmol_screen['Foodmol_name'] = foodmol_screen.index
foodmol_target = foodmol_screen.set_index('Foodmol_name')

foodmol_target1 = foodmol_target.to_numpy()
output2 = randomwalk_sparse_matrix(foodmol_target1, gene1, gene2, weights, 
                         c = 0.3, max_cycles = 100, 
                         normalize_input = True, normalize_output = True, 
                         normalize_connections = True, eps = 1.0e-6)

p_current2 = pd.DataFrame(output2[0]).transpose()

In [259]:
drug

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20246,20247,20248,20249,20250,20251,20252,20253,20254,20255
ABT-888,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZD1775,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BEZ-235,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DINACICLIB,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GELDANAMYCIN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
L778123,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MK-2206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MK-4827,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MK-5108,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MK-8669,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [257]:
drug1 = drug.to_numpy()
output3 = randomwalk_sparse_matrix(drug1, gene1, gene2, weights, 
                         c = 0.3, max_cycles = 100, 
                         normalize_input = True, normalize_output = True, 
                         normalize_connections = True, eps = 1.0e-6)

p_current3 = pd.DataFrame(output3[0]).transpose()

p_current3

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,36
0,9.854590e-05,5.930287e-06,0.000092,0.000020,0.000113,8.120768e-06,0.000350,0.000025,0.000007,0.000056,...,0.000121,0.000073,0.000080,0.000068,0.001225,0.000359,1.063829e-04,0.000259,3.113974e-05,0.000116
1,1.905194e-05,7.479770e-06,0.000076,0.000021,0.000071,6.194754e-06,0.000114,0.000011,0.000007,0.000025,...,0.000048,0.000175,0.000057,0.000018,0.000044,0.000078,2.796580e-05,0.000145,3.863210e-05,0.000038
2,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000e+00,0.000000
3,5.912831e-05,6.967789e-05,0.000053,0.000053,0.000380,5.765467e-05,0.000053,0.000055,0.000025,0.000146,...,0.000044,0.000054,0.000037,0.000050,0.000064,0.000070,3.579380e-05,0.000074,6.310125e-05,0.000029
4,6.664396e-07,6.444709e-07,0.000007,0.000001,0.000002,2.893963e-07,0.000010,0.000001,0.000001,0.000002,...,0.000003,0.000002,0.000001,0.000003,0.000001,0.000002,4.504396e-07,0.000002,7.138936e-07,0.000002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20251,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000e+00,0.000000
20252,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000e+00,0.000000
20253,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000e+00,0.000000
20254,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000e+00,0.000000


In [28]:
p_current2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3306,3307,3308,3309,3310,3311,3312,3313,3314,3315
0,1.370238e-05,4.802517e-05,1.370238e-05,0.0,1.845597e-03,4.991442e-05,0.000005,3.786381e-04,0.000742,0.000606,...,0.000009,2.070063e-04,9.490490e-06,1.132411e-05,0.000064,9.621332e-06,1.608315e-05,3.890777e-06,0.000152,0.0
1,7.744300e-06,1.633276e-05,7.744300e-06,0.0,8.604565e-06,2.229942e-05,0.001013,5.476734e-05,0.000060,0.000076,...,0.000007,2.564880e-06,1.530068e-05,1.079084e-05,0.000032,2.261673e-05,9.391991e-05,9.797322e-05,0.000014,0.0
2,0.000000e+00,0.000000e+00,0.000000e+00,0.0,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000,0.000000,...,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.0
3,8.367087e-06,1.506542e-05,8.367087e-06,0.0,9.299019e-06,1.554092e-05,0.000022,9.515720e-06,0.000054,0.000377,...,0.000016,1.775075e-05,7.851113e-05,6.762714e-05,0.000064,1.098514e-04,9.908927e-05,5.226859e-06,0.000051,0.0
4,2.222578e-07,3.236692e-07,2.222578e-07,0.0,4.441312e-08,8.144690e-08,0.000002,6.481903e-08,0.000004,0.000017,...,0.000011,8.841498e-08,1.970614e-07,1.720319e-07,0.000003,2.937617e-07,7.911728e-07,2.118390e-07,0.000001,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20251,0.000000e+00,0.000000e+00,0.000000e+00,0.0,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000,0.000000,...,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.0
20252,0.000000e+00,0.000000e+00,0.000000e+00,0.0,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000,0.000000,...,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.0
20253,0.000000e+00,0.000000e+00,0.000000e+00,0.0,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000,0.000000,...,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.0
20254,0.000000e+00,0.000000e+00,0.000000e+00,0.0,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000,0.000000,...,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.0


In [320]:
top = pd.read_csv(r'top112genefd.csv',index_col = 0)

top['food'].value_counts()[0:12]

Urolithin A                          3
N-Chloroacetyl-2,6-diethylaniline    3
3-Hydroxyquinine                     3
Hyodeoxycholic acid                  3
Murocholic acid                      3
Liensinine                           3
(S)-[10]-Gingerol                    3
Curdione                             3
Tauro-beta-muricholic acid           3
Asiminacin                           2
(-)-vestitol                         2
3-Methylcyclopentadecanone           2
Name: food, dtype: int64

In [319]:
top1 = pd.read_csv(r'top112functionfd.csv',index_col = 0)
top1['food'].value_counts()[0:10]

ent-Gallocatechin 3-gallate       5
Epigallocatechin 3-gallate        5
ent-Epigallocatechin 3-gallate    5
Gallocatechin 3-gallate           5
Quercetin                         4
Capsaicin                         4
Genistein                         4
Simvastatin                       3
Progesterone                      3
Platinum                          2
Name: food, dtype: int64

In [31]:
top2 = pd.read_csv(r'dfsc245gene.csv',index_col = 0)
top2['food'].value_counts()

5-Phenylvaleric acid           7
Calcium carbonate              5
6-Dimethylaminopurine          5
3,5-Diiodothyronine            5
L-Homoserine                   5
                              ..
Captafol                       2
Methanethiol                   2
3-Methylhistamine              2
N-Methyl-1-deoxynojirimycin    1
4-Methylpentanoic acid         1
Name: food, Length: 89, dtype: int64

In [331]:
top3 = pd.read_csv(r'dfsc245function.csv',index_col = 0)
top3 ['drug'].value_counts()

SUNITINIB           12
DOXORUBICIN         12
ETOPOSIDE            9
GEMCITABINE          8
BORTEZOMIB           8
LAPATINIB            7
METFORMIN            7
GELDANAMYCIN         7
ERLOTINIB            7
ZOLINZA              7
L778123              6
PD325901             6
MK-4827              6
BEZ-235              6
MRK-003              6
DASATINIB            6
TEMOZOLOMIDE         6
MK-8776              6
VINBLASTINE          6
PACLITAXEL           6
SORAFENIB            6
DINACICLIB           6
TOPOTECAN            6
MK-8669              6
CYCLOPHOSPHAMIDE     6
MK-2206              6
5-FU                 6
METHOTREXATE         6
MITOMYCINE           6
AZD1775              6
MK-5108              6
VINORELBINE          6
OXALIPLATIN          6
ABT-888              6
SN-38                6
CARBOPLATIN          6
DEXAMETHASONE        5
Name: drug, dtype: int64

In [313]:
df1[df1.Name =='ent-Gallocatechin 3-gallate'].iat[0,0]
#Urolithin A    FDB02999                                       3
#N-Chloroacetyl-2,6-diethylaniline   FDB010832FDB010832                    3
#3-Hydroxyquinine                 FDB022421FDB022421                       3
#Hyodeoxycholic acid      FDB022211FDB022211                               3
#Murocholic acid FDB022259FDB022259

'FDB000095'

In [180]:
df1

Unnamed: 0,Primary_ID,Label,Name,InChIKey,prim
1891,FDB017141,?,"Proanthocyanidin B2 3,3'-digallate",KTLUHRSHFRODPS-RIQPQZJCSA-N,FDB
1892,FDB017417,?,Davanone,FJKKZNIYYVEYOL-UHFFFAOYSA-N,FDB
1893,FDB000803,?,Phenylmethanethiol,UENWRTRMUIOCKN-UHFFFAOYSA-N,FDB
1894,FDB001286,?,Glabrin A,VORPREYJNTUAGI-UHFFFAOYSA-N,FDB
1895,FDB030789,?,"D-myo-inositol (2,3,4) trisphosphate",GKDKOMAJZATYAY-UHFFFAOYSA-H,FDB
...,...,...,...,...,...
9848,FDB030478,?,4-(2'-carboxyphenyl)-4-oxobutyryl-CoA,KVAQAPQXOXTRAE-UHFFFAOYSA-I,FDB
9849,FDB016763,?,(E)-11-Hexadecenoic acid,JGMYDQCXGIMHLL-AATRIKPKSA-N,FDB
9850,FDB016124,?,Piperonyl isobutyrate,RQULTIASPCVEFO-UHFFFAOYSA-N,FDB
9851,FDB019935,?,Isoamyl butyrate,PQLMXFQTAMDXIZ-UHFFFAOYSA-N,FDB


In [215]:
foodmol_target.loc[['FDB012370','FDB029999','FDB000095']]
#selenomethionine  codein 'Arsenite','Ammonium chloride ((NH4)Cl)'L-Menthyl 1,2-propylene glycol carbonate

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,20246,20247,20248,20249,20250,20251,20252,20253,20254,20255
Foodmol_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
FDB012370,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FDB029999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FDB000095,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


2

In [321]:
foodmol_target1 = foodmol_target.loc[[ 'FDB029999','FDB000095']].to_numpy()
output2 = randomwalk_sparse_matrix(foodmol_target1, gene1, gene2, weights, 
                         c =0.0000085, max_cycles = 100, 
                         normalize_input = True, normalize_output = True, 
                         normalize_connections = True, eps = 1.0e-6)
p_current2 = pd.DataFrame(output2[0]).transpose()

In [322]:
p_current2 

Unnamed: 0,0,1
0,0.000042,0.000045
1,0.000057,0.000058
2,0.000000,0.000000
3,0.000084,0.000083
4,0.000016,0.000016
...,...,...
20251,0.000000,0.000000
20252,0.000000,0.000000
20253,0.000000,0.000000
20254,0.000000,0.000000


In [226]:
genename = pd.read_csv(r'genes.csv',index_col = 0)

In [187]:
genename

Unnamed: 0_level_0,UniProt,Name,Ensembl_G,Ensembl_P,Ensembl_T,Full_name_from_nomenclature_authority,GSEA,Gene_ID,HGNC,Symbol_from_nomenclature_authority,Type_of_gene,BioGene_Confidence,Description,Sequence
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,Q6NR85|SODC_HUMAN,SOD1,ENSG00000142168,ENSP00000270142,ENST00000270142,,SOD1,6647,11179|SOD1,,,3.0,,MATKAVCVLKGDGPVQGIINFEQKESNGPVKVWGSIKGLTEGLHGF...
1,PFKAM_HUMAN|Q16815|Q6ZTT1|P08237-2|J3KNX3|Q168...,PFKM,,ENSP00000448177|ENSP00000345771,ENST00000551804,,PFKM,5213,PFKM|8877,,,3.0,,MTHEEHHAAKTLGIGKAIAVLTSGGDAQGMNAAVRAVVRVGIFTGA...
2,MEGF8_HUMAN,MEGF8,,ENSP00000334219,,,MEGF8,,,,,0.0,,MALGKVLAMALVLALAVLGSLSPGARAGDCKGQRQVLREAPGFVTD...
3,Q6NT75|Q96CY0|Q9UBN7-1|O94975|Q9UBN7|HDAC6_HUM...,PPP1R90|CPBHM|JM21|HDAC6|HD6,ENSG00000094631,ENSP00000365804|-|ENSP00000334061|ENSP00000392815,ENST00000334136|ENST00000376619|ENST0000042394...,histone deacetylase 6,HDAC6,10013,14064|HDAC6,HDAC6,protein-coding,3.0,histone deacetylase 6,MTSTGQDSTTTRQRRSRQNPQSPPQDSSVTSKRNIKKGAVPRSIPN...
4,Q9Y450-1|Q4VX90|Q5T7G3|B7Z365|Q9Y450|Q4VX89|HB...,ERFS|eRF3c|HBS1|HSPC276|EF-1a|HBS1L,ENSG00000112339,ENSP00000356811|ENSP00000356796|ENSP00000356800,ENST00000367826|ENST00000367837|ENST00000367822,HBS1 like translational GTPase,HBS1L,10767,4834|HBS1L,HBS1L,protein-coding,3.0,HBS1 like translational GTPase,MARHRNVRGYNYDEDFEDDDLYGQSVEDDYCISPSTAAQFIYSRRD...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20251,Q07812-3|Q9UCZ6|Q9UCZ7|Q07812-2|BAX_HUMAN|Q9UQD6,BAX,,ENSP00000426328|ENSP00000293288,ENST00000515540,,BAX,581,959|BAX,,,2.0,,MDGSGEQPRGGGPTSSEQIMKTGALLLQGFIQDRAGRMGGEAPELA...
20252,SIG16_HUMAN|A6NMB1,SIGLEC16,,,,,SIGLEC16,400709,24851|SIGLEC16,,,1.0,,MLLLPLLLPVLGAGSLNKDPSYSLQVQRQVPVPEGLCVIVSCNLSY...
20253,P85298-3|Q9NSG0|RHG08_HUMAN|Q9UH20|Q9NXL1|P852...,ARHGAP8,,ENSP00000374424|ENSP00000262731,ENST00000389774,,ARHGAP8,23779,677|ARHGAP8,,,3.0,,MAGQDPALSTSHPFYDVARHGILQVAGDDRFGRRVVTFSCCRMPPS...
20254,RREB1_HUMAN|Q9Y474|Q92766-2|Q6BEP6|Q6BEP8|C9JU...,FINB|RREB-1|Zep-1|RREB1|HNT|LZ321,ENSG00000124782,ENSP00000305560|ENSP00000369270|ENSP0000036926...,ENST00000379938|ENST00000349384|ENST0000037993...,ras responsive element binding protein 1,RREB1,6239,RREB1|10449,RREB1,protein-coding,3.0,ras responsive element binding protein 1,MTSSSPAGLEGSDLSSINTMMSAVMSVGKVTENGGSPQGIKSPSKP...


In [145]:
selenomethionine = pd.DataFrame(zip(genename['GSEA'],p_current2[0]))
selenomethionine.columns =['gene','score']
selenomethionine.to_csv('selenomethionine.rnk',header=True,index=False,sep='\t')
selenomethionine.to_csv('selenomethionine.csv',header=True,index=False,sep='\t')
selenomethionine = sele.drop(sele[sele['gene'].isna()].index)
selenomethionine = sele.dropna()

In [155]:
#ent-Gallocatechin 3-gallate 

In [4]:
selenomethionine

Unnamed: 0,gene,score
0,SOD1,0.000043
1,PFKM,0.000057
2,MEGF8,0.000000
3,HDAC6,0.000084
4,HBS1L,0.000016
...,...,...
19381,BAX,0.000000
19382,SIGLEC16,0.000000
19383,ARHGAP8,0.000000
19384,RREB1,0.000000


In [129]:
pathway_score = pd.read_csv("aug13/my_analysis.GseaPreranked.1628856590773/gsea_report_for_na_pos_1628856590773.tsv",delimiter="\t") 
top_pathway = pathway_score[pathway_score["FWER p-val"]<=0.05] 

top_genes = pd.DataFrame()

for i, row in top_pathway.iterrows():
    top_pathway_name = top_pathway["NAME"][i] 
    pathway1 = pd.read_csv("aug13/my_analysis.GseaPreranked.1628856590773/" + top_pathway_name +".tsv" ,delimiter="\t") 
    pathway1 = pathway1[pathway1["CORE ENRICHMENT"]=="Yes"] 
    tempdf = selenomethionine[selenomethionine['gene'].isin(pathway1['SYMBOL'])].sort_values(by='score',ascending=False)[0:9]
    tempdf['class'] = i
    top_genes = top_genes.append(tempdf)
    
top_genes.columns = ["Label","weight",'class']
top_genes.index.name='Id'
top_genes

top_genes.to_csv('topgenes.csv')

In [170]:
PPI = pd.read_csv('gene_connections.csv')

PPI

Unnamed: 0,Index,Gene1,Gene2,STRING,BioPlex_Confidence
0,0,6114,3205,260,0
1,1,6114,3583,164,0
2,2,6114,17170,159,0
3,3,6114,1240,194,0
4,4,6114,19006,164,0
...,...,...,...,...,...
11428347,11428347,10031,13310,0,9
11428348,11428348,13310,10618,0,9
11428349,11428349,10618,13310,0,9
11428350,11428350,13310,6501,0,9


In [171]:
PPI_1 = PPI[PPI['Gene1'].isin(top_genes.index)]
PPI_selenomethionine = PPI_1[PPI_1['Gene2'].isin(top_genes.index)]
PPI_selenomethionine = PPI_selenomethionine[['Gene1','Gene2']]
PPI_selenomethionine.columns = ['Source','Target']
PPI_selenomethionine.to_csv('PPI_selenomethionine.csv',index=False)

In [323]:
UA = pd.DataFrame(zip(genename['GSEA'],p_current2[0]))
UA.columns =['gene','score']
UA = UA.dropna()
UA.to_csv('UA.rnk',header=True,index=False,sep='\t')
UA.to_csv('UA.csv',header=True,index=False,sep='\t')
UA

Unnamed: 0,gene,score
0,SOD1,0.000042
1,PFKM,0.000057
2,MEGF8,0.000000
3,HDAC6,0.000084
4,HBS1L,0.000016
...,...,...
20251,BAX,0.000000
20252,SIGLEC16,0.000000
20253,ARHGAP8,0.000000
20254,RREB1,0.000000


In [325]:
pathway_score = pd.read_csv("aug13/my_analysis.GseaPreranked.UA/gsea_report_for_na_pos_1629379219451.tsv",delimiter="\t") 
top_pathway = pathway_score[pathway_score["FDR q-val"]<=0.05] 

top_genes = pd.DataFrame()

for i, row in top_pathway.iterrows():
    top_pathway_name = top_pathway["NAME"][i] 
    pathway1 = pd.read_csv("aug13/my_analysis.GseaPreranked.UA/" + top_pathway_name +".tsv" ,delimiter="\t") 
    pathway1 = pathway1[pathway1["CORE ENRICHMENT"]=="Yes"] 
    tempdf = selenomethionine[selenomethionine['gene'].isin(pathway1['SYMBOL'])].sort_values(by='score',ascending=False)[0:9]
    tempdf['class'] = i
    top_genes = top_genes.append(tempdf)
    
top_genes.columns = ["Label","weight",'class']
top_genes.index.name='Id'
top_genes

top_genes.to_csv('topgenes_UA.csv')

In [326]:
PPI_1 = PPI[PPI['Gene1'].isin(top_genes.index)]
PPI_UA = PPI_1[PPI_1['Gene2'].isin(top_genes.index)]
PPI_UA = PPI_UA[['Gene1','Gene2']]
PPI_UA.columns = ['Source','Target']
PPI_UA.to_csv('PPI_UA.csv',index=False)

In [324]:
tea = pd.DataFrame(zip(genename['GSEA'],p_current2[1]))
tea.columns =['gene','score']
tea = tea.dropna()
tea.to_csv('tea.rnk',header=True,index=False,sep='\t')
tea.to_csv('tea.csv',header=True,index=False,sep='\t')
tea

Unnamed: 0,gene,score
0,SOD1,0.000045
1,PFKM,0.000058
2,MEGF8,0.000000
3,HDAC6,0.000083
4,HBS1L,0.000016
...,...,...
20251,BAX,0.000000
20252,SIGLEC16,0.000000
20253,ARHGAP8,0.000000
20254,RREB1,0.000000


In [329]:
pathway_score = pd.read_csv("aug13/my_analysis.GseaPreranked.tea/gsea_report_for_na_pos_1629379312067.tsv",delimiter="\t") 
top_pathway = pathway_score[pathway_score["FDR q-val"]<=0.05] 

top_genes = pd.DataFrame()

for i, row in top_pathway.iterrows():
    top_pathway_name = top_pathway["NAME"][i] 
    pathway1 = pd.read_csv("aug13/my_analysis.GseaPreranked.tea/" + top_pathway_name +".tsv" ,delimiter="\t") 
    pathway1 = pathway1[pathway1["CORE ENRICHMENT"]=="Yes"] 
    tempdf = selenomethionine[selenomethionine['gene'].isin(pathway1['SYMBOL'])].sort_values(by='score',ascending=False)[0:9]
    tempdf['class'] = i
    top_genes = top_genes.append(tempdf)
    
top_genes.columns = ["Label","weight",'class']
top_genes.index.name='Id'
top_genes

top_genes.to_csv('topgenes_tea.csv')

In [330]:
PPI_1 = PPI[PPI['Gene1'].isin(top_genes.index)]
PPI_tea = PPI_1[PPI_1['Gene2'].isin(top_genes.index)]
PPI_tea = PPI_tea[['Gene1','Gene2']]
PPI_tea.columns = ['Source','Target']
PPI_tea.to_csv('PPI_tea.csv',index=False)

In [274]:
MK2206 = pd.DataFrame(zip(genename['GSEA'],p_current3[6]))
MK2206.columns =['gene','score']
MK2206 = MK2206.dropna()
MK2206.to_csv('MK2205.rnk',header=True,index=False,sep='\t')
MK2206.to_csv('MK2206.csv',header=True,index=False,sep='\t')
MK2206

Unnamed: 0,gene,score
0,SOD1,0.000350
1,PFKM,0.000114
2,MEGF8,0.000000
3,HDAC6,0.000053
4,HBS1L,0.000010
...,...,...
20251,BAX,0.000000
20252,SIGLEC16,0.000000
20253,ARHGAP8,0.000000
20254,RREB1,0.000000


In [267]:
pathway_score = pd.read_csv("aug13/my_analysis.GseaPreranked.1629156543436/gsea_report_for_na_pos_1629156543436.tsv",delimiter="\t") 
top_pathway = pathway_score[pathway_score["FWER p-val"]<=0.05] 

top_genes = pd.DataFrame()

for i, row in top_pathway.iterrows():
    top_pathway_name = top_pathway["NAME"][i] 
    pathway1 = pd.read_csv("aug13/my_analysis.GseaPreranked.1629156543436/" + top_pathway_name +".tsv" ,delimiter="\t") 
    pathway1 = pathway1[pathway1["CORE ENRICHMENT"]=="Yes"] 
    tempdf = selenomethionine[selenomethionine['gene'].isin(pathway1['SYMBOL'])].sort_values(by='score',ascending=False)[0:9]
    tempdf['class'] = i
    top_genes = top_genes.append(tempdf)
    
top_genes.columns = ["Label","weight",'class']
top_genes.index.name='Id'
top_genes

top_genes.to_csv('topgenes_mk2206.csv')

ValueError: Length mismatch: Expected axis has 0 elements, new values have 3 elements

In [271]:
pathway_score

Unnamed: 0,NAME,GS<br> follow link to MSigDB,GS DETAILS,SIZE,ES,NES,NOM p-val,FDR q-val,FWER p-val,RANK AT MAX,LEADING EDGE,Unnamed: 11
0,KEGG_RIBOSOME,KEGG_RIBOSOME,Details ...,83,0.960236,1.266553,0.003,0.874997,0.571,659,"tags=93%, list=4%, signal=96%",
1,KEGG_MTOR_SIGNALING_PATHWAY,KEGG_MTOR_SIGNALING_PATHWAY,Details ...,48,0.969729,1.264512,0.012,0.503998,0.628,341,"tags=65%, list=2%, signal=66%",
2,KEGG_ACUTE_MYELOID_LEUKEMIA,KEGG_ACUTE_MYELOID_LEUKEMIA,Details ...,56,0.965258,1.260110,0.009,0.423665,0.714,378,"tags=59%, list=2%, signal=60%",
3,KEGG_LINOLEIC_ACID_METABOLISM,KEGG_LINOLEIC_ACID_METABOLISM,Details ...,27,0.970783,1.258080,0.013,0.346249,0.752,565,"tags=89%, list=3%, signal=92%",
4,KEGG_DRUG_METABOLISM_OTHER_ENZYMES,KEGG_DRUG_METABOLISM_OTHER_ENZYMES,Details ...,46,0.960633,1.256658,0.013,0.295399,0.780,331,"tags=43%, list=2%, signal=44%",
...,...,...,...,...,...,...,...,...,...,...,...,...
167,KEGG_OLFACTORY_TRANSDUCTION,KEGG_OLFACTORY_TRANSDUCTION,,371,0.595294,0.778076,1.000,1.000000,1.000,1545,"tags=2%, list=8%, signal=2%",
168,KEGG_O_GLYCAN_BIOSYNTHESIS,KEGG_O_GLYCAN_BIOSYNTHESIS,,26,0.525550,0.684951,0.999,1.000000,1.000,8826,"tags=96%, list=48%, signal=183%",
169,KEGG_GLYCOSPHINGOLIPID_BIOSYNTHESIS_LACTO_AND_...,KEGG_GLYCOSPHINGOLIPID_BIOSYNTHESIS_LACTO_AND_...,,24,0.520337,0.674956,0.999,1.000000,1.000,8846,"tags=88%, list=48%, signal=167%",
170,KEGG_AMINOACYL_TRNA_BIOSYNTHESIS,KEGG_AMINOACYL_TRNA_BIOSYNTHESIS,,22,0.496928,0.642507,1.000,1.000000,1.000,9354,"tags=91%, list=50%, signal=183%",


In [266]:
mtfm = pd.DataFrame(zip(genename['GSEA'],p_current3[28]))
mtfm.columns =['gene','score']
mtfm = mtfm.dropna()
mtfm.to_csv('mtfm.rnk',header=True,index=False,sep='\t')
mtfm.to_csv('mtfm.csv',header=True,index=False,sep='\t')