In [79]:
import networkx as nx
import matplotlib.pyplot as plt
from collections import defaultdict
import community
from networkx.algorithms.community import greedy_modularity_communities
import collections
import csv

In [80]:
# Locations of the two network exports (GML format) to compare.
pre_fname = '../data/RectumMicrobiome_PrePost/stool_network_pre.gml.txt.network'
post_fname = '../data/RectumMicrobiome_PrePost/stool_network_post.gml.txt.network'

# NOTE(review): nx.info() was removed in NetworkX 3.0; on a current release
# `print(G)` is the replacement. Kept here so the printed summary is unchanged.

# pre_wG: the "pre" network as a networkx graph; label='id' makes the GML
# `id` field the node key.
pre_wG = nx.read_gml(pre_fname, label='id')
print("pre-network:")
pre_summary = nx.info(pre_wG)
print(pre_summary)

print('-----------------------')

# post_wG: the "post" network, loaded the same way.
post_wG = nx.read_gml(post_fname, label='id')
print("post-network:")
post_summary = nx.info(post_wG)
print(post_summary)

pre-network:
Name: 
Type: Graph
Number of nodes: 357
Number of edges: 5605
Average degree:  31.4006
-----------------------
post-network:
Name: 
Type: Graph
Number of nodes: 256
Number of edges: 1789
Average degree:  13.9766


In [81]:
# Look up the 'OTUID' node attribute of both graphs in one pass.
# Each result is a dict of the form: OTUID[vertex_id] = OTUID
pre_OTUID, post_OTUID = (
    nx.get_node_attributes(graph, 'OTUID') for graph in (pre_wG, post_wG)
)

In [82]:
# Detect communities in each network with greedy modularity maximisation.
# Result format: [community : (v1, v2, ..., vn)], i.e. one collection of vertex
# ids per community -- this encodes the information we need downstream.
pre_cluster = greedy_modularity_communities(pre_wG)
print("Number of clusters in the pre-network: {}".format(len(pre_cluster))) # 8

post_cluster = greedy_modularity_communities(post_wG)
# This line reports the post-network result (the label previously said "pre-network").
print("Number of clusters in the post-network: {}".format(len(post_cluster))) # 9

Number of clusters in the pre-network: 8
Number of clusters in the pre-network: 9


In [83]:
# pre_cluster / post_cluster hold one community per entry, each a collection of
# vertex ids. The per-cluster files written later are keyed by OTU id, so every
# vertex id is translated through the matching OTUID lookup here.
#
# Resulting layout: {cluster_number_0: [OTUID1, OTUID2, ...], cluster_number_1: [...], ...}

# pre-network
pre_cluster_otuid = defaultdict(list, {
    idx: [pre_OTUID[vertex] for vertex in members]
    for idx, members in enumerate(pre_cluster)
    if members  # an empty community contributes no key
})

# post-network
post_cluster_otuid = defaultdict(list, {
    idx: [post_OTUID[vertex] for vertex in members]
    for idx, members in enumerate(post_cluster)
    if members  # an empty community contributes no key
})

In [84]:
def _load_total_counts(csv_path):
    """Read a two-column count table into a ``{OTUID: count}`` dict.

    The first row is treated as a header and skipped. Empty rows, which
    ``csv.reader`` yields for blank lines, are ignored.

    Parameters
    ----------
    csv_path : str
        Path to a comma-separated file whose first column is the OTU id and
        whose second column is an integer count.

    Returns
    -------
    dict
        Mapping from OTU id (str) to count (int).

    Raises
    ------
    ValueError
        If a count cell cannot be parsed as an integer.
    IndexError
        If a non-empty row has fewer than two columns.
    """
    counts = {}
    # `with` guarantees the handle is closed even if a row fails to parse;
    # newline='' is what the csv module expects of file objects it reads.
    with open(csv_path, 'r', newline='') as count_fo:
        reader = csv.reader(count_fo, delimiter=',')
        next(reader, None)  # skip the header row without loading the whole file
        for row in reader:
            if row:  # blank lines come back as empty lists
                counts[row[0]] = int(row[1])
    return counts


# Kept as dicts for fast lookup of each OTU's count.
# format: {OTUID: counts}
pre_total_count = _load_total_counts('task3/total_count_pre.csv')
post_total_count = _load_total_counts('task3/total_count_post.csv')


In [85]:
def _write_cluster_counts(prefix, cluster_otuid, total_count):
    """Write one count CSV per cluster into ``WGCNA_cluster_count/``.

    Each file is named ``<prefix>_<cluster_number>.csv`` and contains a header
    row followed by one ``otu_<OTUID>,<count>`` line per OTU in the cluster.

    Parameters
    ----------
    prefix : str
        File-name prefix identifying the network ('pre' or 'post').
    cluster_otuid : dict
        Mapping ``{cluster_number: [OTUID, ...]}``; OTU ids must be strings.
    total_count : dict
        Mapping ``{OTUID: count}`` used to look up each OTU's count.

    Raises
    ------
    KeyError
        If an OTU in a cluster has no entry in ``total_count``.
    """
    for cluster_number, otus in cluster_otuid.items():
        out_path = "WGCNA_cluster_count/{}_{}.csv".format(prefix, cluster_number)
        # `with` closes the file even if a count lookup fails part-way through.
        with open(out_path, 'w') as cluster_fo:
            cluster_fo.write("OTU,Total_sample_counts\n")
            for otu in otus:
                cluster_fo.write("otu_" + otu + ',' + str(total_count[otu]) + '\n')


# Time to construct the files which hold the counts of the OTUs in each cluster
_write_cluster_counts('pre', pre_cluster_otuid, pre_total_count)
_write_cluster_counts('post', post_cluster_otuid, post_total_count)

In [86]:
def _read_fasta(fasta_path):
    """Parse a FASTA file into a ``{header: sequence}`` dict.

    The header is everything after the leading '>' on a header line. Sequence
    lines up to the next header are concatenated, so both two-line records and
    wrapped (multi-line) sequences are handled. Blank lines are skipped, as
    are sequence lines that appear before the first header.

    Parameters
    ----------
    fasta_path : str
        Path to the FASTA file.

    Returns
    -------
    dict
        Mapping from header text (str) to sequence (str).
    """
    sequences = {}
    current_id = None
    chunks = []
    # `with` makes sure the input file is closed once parsing is done.
    with open(fasta_path, 'r') as fasta_fo:
        for line in fasta_fo:
            # Strip only line terminators: slicing off the last character
            # would drop a base when the file has no trailing newline.
            line = line.rstrip('\r\n')
            if not line:
                continue
            if line.startswith('>'):
                if current_id is not None:
                    sequences[current_id] = ''.join(chunks)
                current_id = line[1:]
                chunks = []
            elif current_id is not None:
                chunks.append(line)
    # Flush the final record, which has no following header to trigger it.
    if current_id is not None:
        sequences[current_id] = ''.join(chunks)
    return sequences


# Time for DNA
# The DNA sequence is the same for a given OTU regardless of whether it comes
# from the pre or the post network, so one lookup table serves both.
# format: {OTUID : sequence}
dna_info = _read_fasta("task3/unchimeric_rep_set.fasta")

In [87]:
def _write_cluster_fasta(prefix, cluster_otuid, sequences):
    """Write one FASTA file per cluster into ``WGCNA_cluster_dna/``.

    Each file is named ``<prefix>_<cluster_number>.fasta`` and contains, for
    every OTU in the cluster, a ``>otu_<OTUID>`` header line followed by the
    OTU's sequence on the next line.

    Parameters
    ----------
    prefix : str
        File-name prefix identifying the network ('pre' or 'post').
    cluster_otuid : dict
        Mapping ``{cluster_number: [OTUID, ...]}``; OTU ids must be strings.
    sequences : dict
        Mapping ``{OTUID: sequence}`` used to look up each OTU's sequence.

    Raises
    ------
    KeyError
        If an OTU in a cluster has no entry in ``sequences``.
    """
    for cluster_number, otus in cluster_otuid.items():
        out_path = "WGCNA_cluster_dna/{}_{}.fasta".format(prefix, cluster_number)
        # `with` closes the file even if a sequence lookup fails part-way through.
        with open(out_path, 'w') as cluster_fo:
            for otu in otus:
                cluster_fo.write('>otu_' + otu + '\n')
                cluster_fo.write(sequences[otu] + '\n')


# pre-network
_write_cluster_fasta('pre', pre_cluster_otuid, dna_info)

# post-network
_write_cluster_fasta('post', post_cluster_otuid, dna_info)