In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats.stats import pearsonr
from scipy.stats import spearmanr
import time as tm
from collections import OrderedDict
# Custom libraries
import CCM_GAH_lib as ccm
import data_handle_lib as dh

In [2]:
def converter_space(instr):
    """Parse a delimited numeric string into a 1-D NumPy array.

    The first and last characters of ``instr`` are dropped (presumably the
    enclosing brackets -- confirm against the CSV), and the remainder is
    parsed as comma-separated numbers.

    NOTE(review): despite the name, this converter splits on commas
    (``sep=','``), not on spaces.

    Parameters
    ----------
    instr : str
        Raw cell text, e.g. ``"[0.1, 0.2, 0.3]"``.

    Returns
    -------
    numpy.ndarray
        The parsed values (``np.fromstring``'s default float dtype).
    """
    payload = instr[1:-1]
    return np.fromstring(payload, sep=',')
def converter_comma(instr):
    """Parse a delimited numeric string into a 1-D NumPy array.

    The first and last characters of ``instr`` are dropped (presumably the
    enclosing brackets -- confirm against the CSV), and the remainder is
    parsed as whitespace-separated numbers.

    NOTE(review): despite the name, this converter splits on whitespace
    (``sep=' '``), not on commas.

    Parameters
    ----------
    instr : str
        Raw cell text, e.g. ``"[10 20 30]"``.

    Returns
    -------
    numpy.ndarray
        The parsed values (``np.fromstring``'s default float dtype).
    """
    payload = instr[1:-1]
    return np.fromstring(payload, sep=' ')

#### Generates nodes and edges from a given data set, keeping only the rows that pass the chosen thresholds (mainly on the correlation coefficients)

In [3]:
# Path of the data file to analyze.
# NOTE(review): the input name contains "M_gut" while the outputs below use
# "M3_gut" -- confirm this is intended.
data_path = 'mgp93_M_gut_spearman_20190126_024548.csv'

# Timestamp captured once here; it is embedded in both output file names.
timestr = tm.strftime("%Y%m%d_%H%M%S")
output_edges_name = "".join(["mgp93_M3_gut_edges_", timestr, ".csv"])
output_nodes_name = "".join(["mgp93_M3_gut_nodes_", timestr, ".csv"])

In [4]:
# Load the results table. Two columns are parsed into NumPy arrays on read:
# "pearson_coeff" via converter_space (comma-separated) and "L" via
# converter_comma (whitespace-separated).
df_data_analysis = pd.read_csv(
    data_path,
    converters={
        "pearson_coeff": converter_space,
        "L": converter_comma,
    },
)

In [5]:
# Conditions for causality: keep only rows whose Spearman p-value is below
# 0.05 AND whose "pearson_coeff_last" value is above 0.5.
# The filtered frame replaces df_data_analysis, so later cells see only the
# retained rows.
df_data_analysis = df_data_analysis.loc[
    df_data_analysis["spearman_coeff_p"].lt(0.05)
    & df_data_analysis["pearson_coeff_last"].gt(0.5)
]

In [6]:
# Nodes
# Build one (ID, label) table per side of each retained pair. Both tables use
# the same column names so they can be stacked below.
df_nodes_temp_1 = df_data_analysis[["x_ID", "x_name"]].rename(
    columns={"x_ID": "ID", "x_name": "label"}
)
df_nodes_temp_2 = df_data_analysis[["y_ID", "y_name"]].rename(
    columns={"y_ID": "ID", "y_name": "label"}
)

# Stack both sides, keep each distinct (ID, label) row once, and renumber the
# rows from 0. Chained calls are used instead of inplace=True so the cell
# always rebuilds df_nodes from its inputs.
df_nodes = (
    pd.concat([df_nodes_temp_1, df_nodes_temp_2])
    .drop_duplicates()
    .reset_index(drop=True)
)

# Replace the whole column rather than writing through `.loc[:, "ID"] = ...`:
# in pandas >= 2.0 the `.loc[:, col]` form sets values in place and keeps the
# column's existing dtype, so the integer cast would be silently lost for a
# float column.
df_nodes["ID"] = df_nodes["ID"].astype(int)

In [7]:
# Edges
# One row per retained pair: "source" is taken from y_ID and "target" from
# x_ID, both cast to int.
# NOTE(review): the y -> x direction is as written in the original; confirm it
# matches the intended edge orientation.
df_edges = pd.DataFrame(columns=["source", "target"])
df_edges["source"] = df_data_analysis["y_ID"].astype(int)
df_edges["target"] = df_data_analysis["x_ID"].astype(int)

In [8]:
# Output to CSV
# NOTE(review): timestr is refreshed here, but output_nodes_name and
# output_edges_name are not rebuilt in this cell, so this does not change the
# file names used below.
timestr = tm.strftime("%Y%m%d_%H%M%S")
# index=False leaves the DataFrame row index out of the written files.
df_nodes.to_csv(path_or_buf=output_nodes_name, index=False)
df_edges.to_csv(path_or_buf=output_edges_name, index=False)