In [1]:
import networkx as nx
import matplotlib.pyplot as plt
from collections import defaultdict
import community
from networkx.algorithms.community import greedy_modularity_communities
from networkx.algorithms.community import label_propagation_communities
import collections

<center> <h1> Comparison between the pre-network and the post-network </h1></center>

## Networks generated by SparCC:
1. __pre:__ '../data/1125/Rectum_SparCC/rectum_pre_out_edges.txt'
2. __post:__ '../data/1125/Rectum_SparCC/rectum_post_out_edges.txt'
3. __all:__ '../data/1125/Rectum_SparCC/rectum_all_out_edges.txt'

<center><h2>Basic comparison</center></h2>

### Results:

__Pre-Network__ <br>
`Number of nodes: 665` <br>
`Number of edges: 20358` <br>
`Average degree:  61.2271` <br>

__Post-Network__<br>
`Number of nodes: 674` <br>
`Number of edges: 12595` <br>
`Average degree:  37.3739`<br>

__Number of common OTUs between pre-network and post-network__: `632`

In [2]:
# transform txt to edgelist file
pre_txt = open('../data/1125/Rectum_SparCC/rectum_pre_out_edges.txt', 'r')
post_txt = open('../data/1125/Rectum_SparCC/rectum_post_out_edges.txt', 'r')

pre_edgelist = open('../data/1125/Rectum_SparCC/rectum_pre_out_edges.edgelist', 'w')
post_edgelist = open('../data/1125/Rectum_SparCC/rectum_post_out_edges.edgelist', 'w')

#----------------------------------Pre-----------------------------------------

# skip the header
next(pre_txt)

# Ok, here, in the original file, the OTUid of each OTU is in the format: "1002005", and the double quote is included
# This is not very good, therefore, I didn't write the double quote into the edgelist file
for line in pre_txt:
    pre_edgelist.write(line.split('\t')[0][1:-1] + ' ' + line.split('\t')[1][1:-1] + '\n')
    
pre_txt.close()
pre_edgelist.close()


#----------------------------------Post-----------------------------------------
# skip the header
next(post_txt)

for line in post_txt:
    post_edgelist.write(line.split('\t')[0][1:-1] + ' ' + line.split('\t')[1][1:-1] + '\n')
    
post_txt.close()
post_edgelist.close()

In [3]:
# Read the network
pre_fname = '../data/1125/Rectum_SparCC/rectum_pre_out_edges.edgelist'
post_fname = '../data/1125/Rectum_SparCC/rectum_post_out_edges.edgelist'

pre_network = nx.read_edgelist(pre_fname)
post_network = nx.read_edgelist(post_fname)

# Print network information
print("pre-network: ")
print(nx.info(pre_network))

print('-----------------------')

print('post-network: ')
print(nx.info(post_network))

pre-network: 
Name: 
Type: Graph
Number of nodes: 665
Number of edges: 20358
Average degree:  61.2271
-----------------------
post-network: 
Name: 
Type: Graph
Number of nodes: 674
Number of edges: 12595
Average degree:  37.3739


<center><h2>Clauset-Newman-Moore'a Algorithm (Modularity Optimization)</h2></center>

In [4]:
# format: [community : (v1, v2, ..., vn)]
pre_cluster = greedy_modularity_communities(pre_network)
print("Number of clusters in the pre-network:", len(pre_cluster)) # 4

post_cluster = greedy_modularity_communities(post_network)
print("Number of clusters in the post-network:", len(post_cluster)) # 6

Number of clusters in the pre-network: 4
Number of clusters in the post-network: 6


In [5]:
for c, v in enumerate(pre_cluster):
    print(c, len(v), sep = ' : ')
    
print("---------------------")

for c, v in enumerate(post_cluster):
    print(c, len(v), sep = ' : ')

0 : 271
1 : 269
2 : 108
3 : 17
---------------------
0 : 428
1 : 156
2 : 77
3 : 6
4 : 4
5 : 3


In [6]:
#pre_OTUID, post_OTUID, format: {vertex : OTUID}

pre_list = list()  
post_list = list()

subgroup_clauset = []

for c1, v1 in enumerate(pre_cluster):
    for v in v1:
        pre_list.append(v)
    for c2, v2 in enumerate(post_cluster):
        print("pre-network cluster # {} & post-network cluster # {}".format(c1, c2))
        for v in v2:
            post_list.append(v)
        common = list(set(pre_list).intersection(post_list))
        print("number of common OTU: {}".format(len(common)))
        print("-------------------------------------")
        
        if len(common) >= 8:
            subgroup_clauset.append(common)
        
        post_list = list()
    pre_list = list()

pre-network cluster # 0 & post-network cluster # 0
number of common OTU: 192
-------------------------------------
pre-network cluster # 0 & post-network cluster # 1
number of common OTU: 37
-------------------------------------
pre-network cluster # 0 & post-network cluster # 2
number of common OTU: 30
-------------------------------------
pre-network cluster # 0 & post-network cluster # 3
number of common OTU: 2
-------------------------------------
pre-network cluster # 0 & post-network cluster # 4
number of common OTU: 3
-------------------------------------
pre-network cluster # 0 & post-network cluster # 5
number of common OTU: 1
-------------------------------------
pre-network cluster # 1 & post-network cluster # 0
number of common OTU: 121
-------------------------------------
pre-network cluster # 1 & post-network cluster # 1
number of common OTU: 101
-------------------------------------
pre-network cluster # 1 & post-network cluster # 2
number of common OTU: 28
------------

In [9]:
# A total of 10 groups
print(len(subgroup_clauset))

for c in subgroup_clauset:
    print("Number of OTUs in this group: {}: ".format(len(c)))
    print(c)
    print("------------------")

10
Number of OTUs in this group: 192: 
['809486', 'New.0.ReferenceOTU411', '349901', '585419', '1082059', '1104529', '589792', 'New.0.ReferenceOTU384', '191237', 'New.0.CleanUp.ReferenceOTU145599', '689950', '1082607', '593209', 'New.0.ReferenceOTU369', '836783', 'New.0.CleanUp.ReferenceOTU21074', '573035', '874750', '4307652', '996116', '1033018', '1009362', 'New.0.ReferenceOTU660', '581982', 'New.0.CleanUp.ReferenceOTU42791', '539735', '1080820', 'New.3.ReferenceOTU478', '1048194', '27938', '1011954', 'New.0.CleanUp.ReferenceOTU109358', 'New.0.CleanUp.ReferenceOTU157', '812496', '790545', 'New.0.ReferenceOTU709', 'New.3.ReferenceOTU59', '540269', '830290', 'New.0.CleanUp.ReferenceOTU32288', '362965', '559613', 'New.0.ReferenceOTU547', 'New.3.ReferenceOTU105', '579608', '276149', '176591', '1082539', '1084952', '1009894', 'New.0.ReferenceOTU329', '807369', '925494', '805151', '1074210', '543151', '589597', '558885', 'New.0.ReferenceOTU323', '540737', '4395263', '1047041', 'New.0.Refer

In [71]:
# name of the file: shedd.IL.core.0.0001.prob.txt

prob_fo = open('shedd.IL.core.0.0001.prob.txt')

# skip the two header lines
next(prob_fo)
next(prob_fo)

'''
format: line.split('\t')[0] is the OTUid, line.split('\t')[-1] is the taxonomy information separated by '; ' (note that it is a
semicolon followed by a whitespace). Therefore
line.split('\t')[-1].split('; ') is the array of taxonomy information of a specific OUT
Example: ['k__Bacteria', ' p__Bacteroidetes', ' c__Flavobacteriia', ' o__Flavobacteriales', ' f__Flavobacteriaceae', ' g__Mesonia', ' s__\n']
Note that the last element of the array is \n, also, taxonomic information only contains: 
    class, order, family, genus
which are indices [2][3][4][5]
'''

# Here why I am doing this, for many OUTs, the taxonomy inforamtion are missing. For example, some don't have order information
# while others don't have genus info(acutally, a lot of them don't have genus information).
taxonomy_mapping = {'c': 'class', 'o' : 'order', 'f' : 'family', 'g' : 'genus'}
l = ['c', 'o', 'f', 'g']

# format: { OUTID : {class : c_xxx, order : o_xxx, family : f_xxx, genus : g_xxx} }
OTU_info = defaultdict(dict)

for line in prob_fo:
    for tax in line.split('\t')[-1].split('; '):
        if tax[0] in l:
            OTU_info[ line.split('\t')[0] ][taxonomy_mapping[tax[0]] ] = tax


In [73]:
print(OTU_info['113013'])

{'class': 'c__Flavobacteriia', 'order': 'o__Flavobacteriales', 'family': 'f__Flavobacteriaceae', 'genus': 'g__Mesonia'}
