In [6]:
import pandas as pd
import numpy as np
from src.data_collection import deg_utils, stringdb_api
from src.analysis import clustering_utils
import networkx as nx
from src.graph.graph_util import *
from src.graph import RWR
import config.config as config
from src.visualization.plots import plot_dist
from cdlib import algorithms
from IPython.display import Image, display
import copy

import dill
#dill.load_session("03_session.pkl")

# Diffusion analysis

Is it possible to study the effects of the up and down regulated genes over the whole network?

Would this provide meaningful insights? How should the analysis be structured?

**IPOTESI DA TESTARE**: cercare network motifs interessanti come positive/negative feedback loops e bifans nel directed graph una volta buildato. 
Se ad esempio individuo un grande feedback loop, posso farne il functional enrichment e vedere se è legato a una specifica funzione, oltre che a generare ipotesi sulla possibile evoluzione dati i livelli di fold change e propagazione.

To address this, i am going to build a directed graph, including activation/inhibition informations over all the edges and then propagate the fold changes of the seed genes into the network.

https://pmc.ncbi.nlm.nih.gov/articles/PMC8664198/

One possible application of network propagation is the imputation of missing values [2]. For example, shotgun proteomics measurements often do not quantify all proteins in a sample. One might use network propagation to impute those missing values by utilizing measured protein levels of neighboring proteins in the network. In order to test this idea and in order to further validate the plausibility of network propagation results, we imputed expression fold changes (young versus old) for missing proteins and mRNAs with the goal to recover known ageing-associated proteins and transcripts.

To have a gold-standard i can test this hypotesis by keeping the hub nodes for DEG network of old rats used as control, then propagate the fold changes and check wether it matches the ground truth.

In [7]:
ground_truth_fold_change = deg_utils.preprocess_bulk_rna_seq_data(0.05, 0.585, contrast=["group", "Aging", "Young"], file=config.young_vs_aging_PROCESSED_BULK_RNA_SEQ_FILE)
ground_truth_fold_change

Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
6,Ctbs,179.670618,1.068018,0.309369,3.452250,5.559331e-04,4.851144e-03
8,Jak3,276.376494,1.009759,0.359391,2.809635,4.959775e-03,2.603925e-02
11,Atp6v0d2,285.854525,3.264207,0.996693,3.275038,1.056479e-03,8.057966e-03
12,Cdk10,180.061851,-0.949429,0.298157,-3.184330,1.450894e-03,1.024216e-02
14,Fgg,69476.512553,1.388223,0.403716,3.438610,5.847094e-04,5.044564e-03
...,...,...,...,...,...,...,...
15579,Ggnbp1,52.531966,-1.300966,0.363964,-3.574439,3.509799e-04,3.347909e-03
15580,Klhl11,11.765046,2.796721,0.770730,3.628665,2.848902e-04,2.849275e-03
15589,Ccdc80,98.677296,2.029478,0.511423,3.968299,7.238740e-05,9.565095e-04
15594,Tle2,207.998228,-1.676675,0.384850,-4.356703,1.320359e-05,2.534758e-04


In [8]:
ground_truth_diffusion_graph = get_connected_seed_genes_graph(ground_truth_fold_change, "ground_truth_diffusion_graph")

Processing seed gene: Ctbs
added seed node Ctbs 1.068017594588506 0.000555933066725 0.0048511437760768
Processing seed gene: Jak3
added seed node Jak3 1.0097585910739342 0.0049597753674412 0.0260392472163358
Processing seed gene: Atp6v0d2
added seed node Atp6v0d2 3.264207193581633 0.0010564794410694 0.0080579656319846
Processing seed gene: Cdk10
added seed node Cdk10 -0.949429168522395 0.0014508936659679 0.0102421550092518
Processing seed gene: Fgg
added seed node Fgg 1.3882231937659577 0.0005847093546772 0.005044564257255
Processing seed gene: Tgs1
added seed node Tgs1 0.60085778674308 0.0096855890197351 0.0424896405918938
Processing seed gene: nan
added seed node nan 1.5338158938553774 0.0101662121936273 0.0440535861723851
Processing seed gene: Snrpb
added seed node Snrpb 0.8189105444309265 0.0072661939756376 0.0344185761813102
Processing seed gene: Idh3b
added seed node Idh3b -0.641791453754649 0.005968752353598 0.0298574560539537
Processing seed gene: Erap1
added seed node Erap1 0.

In [None]:
diffusion_graph = copy.deepcopy(ground_truth_diffusion_graph)

# Lets set to 0 the fold change of nodes with less than 2-fold change so we can check if the diffusion works
for node in diffusion_graph.nodes(data=True):
    if abs(float(node[1]["fold_change"])) < 2.0:
        node[1]["fold_change"] = 0.0

In [14]:
deg_df = deg_utils.preprocess_bulk_rna_seq_data(0.05, 0.585, contrast=["group", "302b", "Aging"], file=config.aging_vs_302_PROCESSED_BULK_RNA_SEQ_FILE)  # log2fc of 0.585 corresponds to a 1.5-fold change
downregulated_genes = deg_utils.get_downregulated_genes(deg_df)
downregulated_graph = get_gene_graph(downregulated_genes.to_dict(orient="records"), config.LIVER_DOWNREGULATED_GRAPH_FILE_NAME)

Loaded graph '/home/emiliano/Desktop/Università/Network Analysis/Project/Code/data/graphs/liver_downregulated_graph': 6858 nodes, 21862 edges


In [22]:
for node in downregulated_graph.nodes(data=True):
    if abs(float(node[1]["fold_change"])) < 2.0:
        node[1]["fold_change"] = 0.0

In [None]:
dill.dump_session("03_session.pkl")

In [5]:
import pandas as pd
import numpy as np
from src.data_collection import deg_utils, stringdb_api
from src.analysis import clustering_utils
import networkx as nx
from src.graph.graph_util import *
from src.graph import RWR
import config.config as config
from src.visualization.plots import plot_dist
from cdlib import algorithms
from IPython.display import Image, display
import copy

import dill
A = np.array([
    [0, 1, 1, 0, 0],
    [1, 0, 0, 0, 0],
    [1, 0, 0, 1, 0],
    [0, 0, 1, 0, 1],
    [0, 0, 0, 1, 0]
])

fc_vector = pd.Series({
    0: 10.0,
    1: 0.0,
    2: 0.0,
    3: 0.0,
    4: 0.0
})

restart_probability = 0.3
n_iter = 5
diffusion_weight = 0.5

fc_vector = RWR.random_walk_with_restart(adjacency_matrix=A, fc_vector=fc_vector, restart_probability=restart_probability, n_iter=n_iter, diffusion_weight=diffusion_weight)
fc_vector


-------------------------------------------------- 

Selected node to visit: 0
Fold change vector:  [10.0, 0.0, 0.0, 0.0, 0.0]
Selected node neighbors vector: [0 1 1 0 0]
Diffusion result:  0.0
Updated fold change vector:  [10.0, 0.0, 0.0, 0.0, 0.0]
Updated set of Visited nodes: {0}
Selected node neighbors indices: [np.int64(1), np.int64(2)]
Unvisited neighbors indices: {np.int64(1), np.int64(2)}

-------------------------------------------------- 


-------------------------------------------------- 


RESTARTING DIFFUSION


-------------------------------------------------- 


-------------------------------------------------- 

Selected node to visit: 0
Fold change vector:  [10.0, 0.0, 0.0, 0.0, 0.0]
Selected node neighbors vector: [0 1 1 0 0]
Diffusion result:  0.0
Updated fold change vector:  [10.0, 0.0, 0.0, 0.0, 0.0]
Updated set of Visited nodes: {0}
Selected node neighbors indices: [np.int64(1), np.int64(2)]
Unvisited neighbors indices: {np.int64(1), np.int64(2)}

------------

0    12.50
1    11.25
2     0.00
3     0.00
4     0.00
dtype: float64