In [1]:
import matplotlib.pyplot as plt
from matplotlib import cm

import seaborn as sns
import pandas as pd
import numpy as np
from plotnine import * 
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from itertools import combinations_with_replacement
import glob, os
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
from matplotlib.colors import rgb2hex, colorConverter


def symmetrize(a):
    """Return ``a`` mirrored across its main diagonal.

    The result is ``a + a.T`` with the diagonal of ``a`` subtracted once, so
    diagonal entries keep their original value while every off-diagonal entry
    becomes the sum of the two mirrored entries. For a triangular input this
    yields the corresponding symmetric matrix. ``a`` itself is not modified.
    """
    diagonal_only = np.diag(np.diagonal(a))
    mirrored_sum = a + a.T
    # The diagonal was counted twice in the sum above; remove one copy.
    return mirrored_sum - diagonal_only

# --- Input / output locations ------------------------------------------------
# pandas expands a leading '~' by itself, but not every consumer of these paths
# does (os.chdir and glob do not), so the directory paths are expanded once here.
information_frame = pd.read_csv('~/FlowRepository_FR-FCM-ZYVH_files/attachments/Metadata_DC.csv')
path_LDA_to_analyse = os.path.expanduser('~/Microbiota/Genus_LDA/Data_frame_multiple_run_LDA')
path_store_figre = os.path.expanduser('~/Microbiota/Genus_LDA/Figures_LDA_microbiota')


# --- Genus abundance table ---------------------------------------------------
genus_table = pd.read_table('~/FlowRepository_FR-FCM-ZYVH_files/attachments/GenusAbundance_DiseaseCohort_nature24460.txt', index_col=0, header=0)
# Each row divided by its own total, i.e. per-row relative abundances.
genus_comp = genus_table.div(genus_table.sum(axis=1), axis=0)
# Column-wise sum of the relative abundances computed above.
sumgenus_table = genus_comp.sum(axis=0)


# --- Run parameters ----------------------------------------------------------
# The LDA result files are looked up relative to this directory further down.
os.chdir(path_LDA_to_analyse)
topic_to_test = [8]  # numbers of topics whose result files are analysed
r = 2                # size of the patient tuples counted (2 = pairs)
runs = 40            # run count embedded in the result-file names; also used to normalise counts


# Count, over all LDA runs, how often two patients carry the same
# 'Patient Statut' value, and store the resulting patient x patient table.

# glob does not expand '~', so resolve the directory explicitly; matching on the
# full path also makes the lookup independent of the current working directory.
lda_directory = os.path.expanduser(path_LDA_to_analyse)
individuals = information_frame['Individual'].to_list()
n_individuals = len(individuals)

for topic in topic_to_test:
    # Row/column 0 is padding so that patient "DC<k>" can be addressed as index k.
    array_to_add = np.zeros((n_individuals + 1, n_individuals + 1))

    for file in glob.glob(os.path.join(lda_directory, f"*{runs}_runs_{topic}_topics.csv")):
        dataframe_topic = pd.read_csv(file, index_col=0)
        for top in range(1, topic + 1):
            # Patients assigned the value `top` in this run (missing names skipped).
            in_topic = dataframe_topic['Patient Statut'] == top
            list_of_patient = dataframe_topic.loc[in_topic, 'Individual'].dropna().to_list()
            list_of_patient_number_only = [int(s.replace("DC", "")) for s in list_of_patient]
            # Every unordered tuple of size r, including a patient with itself,
            # gets one count per run; with r == 2 the tuple indexes one matrix cell.
            for patient in combinations_with_replacement(list_of_patient_number_only, r):
                array_to_add[patient] += 1

    # A pair may have been counted as (i, j) in one run and (j, i) in another;
    # symmetrize adds both halves together, then the padding row/column is dropped.
    array_to_add = symmetrize(array_to_add)[1:, 1:]

    # NOTE(review): labelling assumes the k-th metadata row is patient "DC<k>" — confirm.
    df_for_network = pd.DataFrame(array_to_add, columns=individuals, index=individuals)
    # Remove patients that never co-occur with anyone (all-zero columns, then all-zero rows).
    dataframe_to_use = df_for_network.loc[:, (df_for_network != 0).any(axis=0)]
    dataframe_to_use = dataframe_to_use.loc[~(dataframe_to_use == 0).all(axis=1)]
    dataframe_to_use.to_csv(f'~/Microbiota/Genus_LDA/Dataframe_for_network_{topic}_topic_{runs}_run.csv')




In [2]:
# Links whose normalised weight falls below this value are set to zero.
threshold_links = 0.1
dataframe_four_tops = pd.read_csv(f'~/Microbiota/Genus_LDA/Dataframe_for_network_8_topic_{runs}_run.csv', index_col=0)
# Scale the stored counts by the number of runs, then zero out the weak entries.
test = dataframe_four_tops.div(runs)
test = test.mask(test < threshold_links, 0)


In [None]:
import networkx as nx
import matplotlib.pyplot as plt


plt.figure(figsize=(5, 5))

# nx.from_numpy_matrix was removed in NetworkX 3.0; nx.from_numpy_array is its
# replacement and builds the weighted graph from the 2-D array of `test`.
# NOTE(review): non-zero diagonal entries of `test` become self-loops — confirm intended.
G = nx.from_numpy_array(test.values)
# Nodes come out numbered 0..n-1; rename them to the column labels of `test`.
G = nx.relabel_nodes(G, dict(enumerate(test.columns)))
# Fixed seed keeps the layout reproducible; my_pos is reused by the later plotting cells.
my_pos = nx.spring_layout(G, seed=100)
nx.draw_networkx(G, pos=my_pos)
plt.tight_layout()


In [None]:
# Draw the patient network with nodes coloured by 'Health status'.

# Integer code per health-status label, used below to index the colour list.
information_frame['Health status binary'] = LabelEncoder().fit_transform(information_frame['Health status'])
N_colors = 2
cm_dis = np.linspace(0, 0.8, N_colors)
colors = [cm.RdBu(x) for x in cm_dis]
color_edges = []  # despite the name: one colour per node, in G's node order
fig, axes = plt.subplots(figsize=(30, 20))

for node in G:
    # Metadata row(s) of the individual this node stands for.
    temp = information_frame.loc[information_frame['Individual'] == node]

    # .iloc[0] extracts the scalar explicitly; calling int() on a one-element
    # Series is deprecated in recent pandas.
    color = colors[int(temp['Health status binary'].iloc[0])]
    if color not in color_edges:
        # Empty scatter: draws nothing, only registers a legend entry the first
        # time a colour is encountered.
        axes.scatter([], [], color=color, label=temp['Health status'].values[0])
    color_edges.append(color)

# Edge width grows with the 4th power of the weight so strong links stand out.
weights = [20 * weight ** 4 for _, _, weight in G.edges(data='weight')]

# Node size is proportional to node degree.
d = dict(G.degree)
nx.draw(G, pos=my_pos, ax=axes, with_labels=False, node_color=color_edges,
        node_size=[v * 100 for v in d.values()], width=weights)
axes.legend(loc='lower left', markerscale=4, scatterpoints=1, bbox_to_anchor=(0, 0), ncol=1, prop={'size': 45})
plt.tight_layout()
# Expand a leading '~' before handing the path over for writing (no-op for absolute paths).
plt.savefig(os.path.expanduser(path_store_figre) + '/Network_of_patient_healthy_stats.svg', format='svg', bbox_inches='tight')

In [None]:
# Draw the patient network with nodes coloured by 'Enterotype'.

# Integer code per enterotype label, used below to index the colour list.
information_frame['Enterotype binary'] = LabelEncoder().fit_transform(information_frame['Enterotype'])

N_colors = 4
cm_dis = np.linspace(0, 0.8, N_colors)
colors = [cm.tab20c(x) for x in cm_dis]
color_edges = []  # despite the name: one colour per node, in G's node order
fig, axes = plt.subplots(figsize=(30, 20))

for node in G:
    # Metadata row(s) of the individual this node stands for.
    temp = information_frame.loc[information_frame['Individual'] == node]

    # .iloc[0] extracts the scalar explicitly; calling int() on a one-element
    # Series is deprecated in recent pandas.
    color = colors[int(temp['Enterotype binary'].iloc[0])]
    if color not in color_edges:
        # Empty scatter: draws nothing, only registers a legend entry the first
        # time a colour is encountered.
        axes.scatter([], [], color=color, label=temp['Enterotype'].values[0])
    color_edges.append(color)

# Edge width grows with the 4th power of the weight so strong links stand out.
weights = [20 * weight ** 4 for _, _, weight in G.edges(data='weight')]
# Node size is proportional to node degree.
d = dict(G.degree)


nx.draw(G, pos=my_pos, ax=axes, with_labels=False, node_color=color_edges, width=weights,
        node_size=[v * 100 for v in d.values()])
axes.legend(loc='lower left', markerscale=6, scatterpoints=1, bbox_to_anchor=(0, 0), ncol=1, prop={'size': 50})
plt.tight_layout()
# Expand a leading '~' before handing the path over for writing (no-op for absolute paths).
plt.savefig(os.path.expanduser(path_store_figre) + '/Network_of_patient_microbiota_stats.svg', format='svg', bbox_inches='tight')