In [31]:
import numpy as np
import pandas as pd 
from functools import reduce
import json
from flask import Flask, render_template

# step0 import data
# `root` is a template: its `{}` placeholder is filled with a file name that
# lives inside the discovery3KGOClusterSub analysis folder.
root = "./analysis/discovery3KGOClusterSub/{}"
GOAncestorPath = "./analysis/GO_Ancestor.csv"
clusterRandIndexPath = root.format("moduleRandIndex.csv")

# GO term -> ancestor table, read with the default integer index.
GOAncestor = pd.read_csv(GOAncestorPath)
# One row per (cutHeight, minClusterSize) pair with its randIndex;
# the first CSV column is used as the row index.
clusterRandIndex = pd.read_csv(clusterRandIndexPath, index_col=0)
clusterRandIndex

Unnamed: 0,cutHeight,minClusterSize,randIndex
1,0.99,10,0.01277
2,0.99,20,0.01277
3,0.99,30,0.01277
4,0.991,10,0.01277
5,0.991,20,0.01277
6,0.991,30,0.01277
7,0.992,10,0.01277
8,0.992,20,0.01277
9,0.992,30,0.01277
10,0.993,10,0.01277


In [33]:
# Preview the GO term -> ancestor table that step0 read from GOAncestorPath.
GOAncestor

Unnamed: 0,GO,Name,Ancestor,AncestorGO
0,GO:0000001,mitochondrion inheritance,cellular localization,GO:0051641
1,GO:0000001,mitochondrion inheritance,cellular component organization or biogenesis,GO:0071840
2,GO:0000002,mitochondrial genome maintenance,cellular component organization or biogenesis,GO:0071840
3,GO:0000011,vacuole inheritance,cellular component organization or biogenesis,GO:0071840
4,GO:0000012,single strand break repair,nitrogen compound metabolic process,GO:0006807
...,...,...,...,...
158445,GO:2001316,kojic acid metabolic process,organic substance metabolic process,GO:0071704
158446,GO:2001317,kojic acid biosynthetic process,biosynthetic process,GO:0009058
158447,GO:2001317,kojic acid biosynthetic process,cellular metabolic process,GO:0044237
158448,GO:2001317,kojic acid biosynthetic process,small molecule metabolic process,GO:0044281


In [34]:
# step1 find the cutHeight and minCluster size
# Keep the single row of the parameter grid with the highest randIndex
# (nlargest keeps the first row when several share the maximum).
selectedParameter = clusterRandIndex.nlargest(1, "randIndex")
# Read each column separately so minClusterSize keeps its integer dtype;
# both values are turned into strings for a file name in step2.
cutHeight = selectedParameter["cutHeight"].iloc[0]
minClusterSize = selectedParameter["minClusterSize"].iloc[0]
print(cutHeight, minClusterSize)

0.999 10


In [35]:
# step2 Cluster with fixed cutHeight and minCluster size
# The cluster file name embeds the parameters picked in step1,
# e.g. "_trueModuleCluster_0.999_10_.csv" for cutHeight=0.999, minClusterSize=10.
ClusterWithFixedParaPath = root.format(
    "_trueModuleCluster_" + str(cutHeight) + "_" + str(minClusterSize) + "_.csv"
)
# The file's "Label" column holds the cluster id; expose it as "Cluster".
cluster = pd.read_csv(ClusterWithFixedParaPath, index_col=0).rename(columns={"Label": "Cluster"})
# Turn every "." in the GO ids into ":" so they match the "GO:xxxxxxx" ids used
# in GOAncestor (presumably the file stores them as "GO.xxxxxxx" - TODO confirm).
# regex=False is explicit on purpose: the default of `regex` differs between
# pandas versions, and as a regular expression "." would match any character.
cluster["GO"] = cluster["GO"].str.replace(".", ":", regex=False)
cluster

  cluster.GO = cluster.GO.str.replace(".", ":")


Unnamed: 0,GO,AncestorGroup,AncestorGroupIndex,Color,Cluster
1,GO:0000002,cellular component organization or biogenesis,1344,239,1
2,GO:0000012,"nitrogen compound metabolic process,cellular m...",3194,596,0
3,GO:0000027,cellular component organization or biogenesis,1344,239,1
4,GO:0000028,cellular component organization or biogenesis,1344,239,0
5,GO:0000038,"cellular metabolic process,primary metabolic p...",1564,314,1
...,...,...,...,...,...
496,GO:0007018,"movement of cell or subcellular component,micr...",2343,450,1
497,GO:0007019,"microtubule-based process,cellular component o...",2149,421,0
498,GO:0007020,"microtubule-based process,cellular component o...",2149,421,1
499,GO:0007026,"microtubule-based process,negative regulation ...",2154,422,1


In [38]:
# step3 select cluster with the threshold at 50
# For every cluster, find the ancestor shared by the largest share of its GO
# terms, then keep the clusters where that share exceeds PERCENT_THRESHOLD.
PERCENT_THRESHOLD = 50  # percent of a cluster's GO terms that must share one ancestor

dataList = []
for clusterIndex in cluster.Cluster.unique():
    subCluster = cluster[cluster.Cluster.eq(clusterIndex)]
    annotated = GOAncestor[GOAncestor.GO.isin(subCluster.GO)]
    if annotated.empty:
        # None of this cluster's GO terms appear in GOAncestor, so there is no
        # ancestor to score; skipping keeps the summary at one row per scored cluster.
        continue
    # Rows are GO terms, columns are ancestors; a cell is non-null exactly when
    # the GO term has that ancestor.
    data_pivot = annotated.pivot(index='GO', columns='Ancestor', values='GO')
    # Share (in %) of the cluster's annotated GO terms that fall under each ancestor.
    percent = 100 - (data_pivot.isnull().sum() * 100 / len(data_pivot))
    # Keep only the best-covered ancestor (it stays as the row label) and tag
    # the row with its cluster id; assign() avoids writing into a sliced copy.
    topAncestor = (
        percent.to_frame("percent")
        .nlargest(1, "percent")
        .assign(Cluster=clusterIndex)
    )
    dataList.append(topAncestor)

data = pd.concat(dataList).sort_values(by=["Cluster"])
# Number of GO terms per cluster, attached by cluster id rather than by row
# position so a skipped cluster cannot shift or break the alignment.
clusterSizes = cluster.groupby("Cluster")["GO"].count()
data["Count"] = data["Cluster"].map(clusterSizes)
data = data.sort_values(by=["percent"])
selectedCluster = data[data.percent.gt(PERCENT_THRESHOLD)]
selectedCluster

Unnamed: 0_level_0,percent,Cluster,Count
Ancestor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
organic substance metabolic process,54.189944,0,180
cellular metabolic process,100.0,2,13


In [40]:
# step4 find the GOs In the above cluster
# Collect, for every selected cluster, one row per GO term name together with
# the cluster id and the cluster's dominant ancestor ("Type").
listGO = []
for selectedId in selectedCluster.Cluster.unique():
    # GO ids that belong to this cluster.
    memberGO = cluster.loc[cluster.Cluster.eq(selectedId), "GO"]
    # selectedCluster is indexed by ancestor name, so the row label is the type.
    ancestorType = str(selectedCluster[selectedCluster.Cluster.eq(selectedId)].index[0])
    listGO.append(
        GOAncestor[GOAncestor.GO.isin(memberGO)]
        # GOAncestor has one row per (GO, ancestor) pair; keep one row per name.
        .drop_duplicates(subset=["Name"])
        # assign() returns a new frame instead of writing into a filtered copy.
        .assign(Cluster=selectedId, Type=ancestorType)
    )
GOs = (
    pd.concat(listGO)
    .sort_values(by=["Cluster"])
    .drop(columns=["Ancestor", "AncestorGO"])
)
GOs

Unnamed: 0,GO,Name,Cluster,Type
4,GO:0000012,single strand break repair,0,organic substance metabolic process
12249,GO:0006370,7-methylguanosine mRNA capping,0,organic substance metabolic process
12258,GO:0006378,mRNA polyadenylation,0,organic substance metabolic process
12266,GO:0006382,adenosine to inosine editing,0,organic substance metabolic process
12290,GO:0006388,"tRNA splicing, via endonucleolytic cleavage an...",0,organic substance metabolic process
...,...,...,...,...
1180,GO:0000727,double-strand break repair via break-induced r...,2,cellular metabolic process
1168,GO:0000724,double-strand break repair via homologous reco...,2,cellular metabolic process
1163,GO:0000723,telomere maintenance,2,cellular metabolic process
11893,GO:0006270,DNA replication initiation,2,cellular metabolic process


In [41]:
# step5 calculate the correlation between the GOs
AgeGapPerGOPath = "discovery3KAgeGapPerGOSub.csv"
# Keep only the columns of the GO terms gathered in step4, in the order of GOs.
AgeGapPerGO = pd.read_csv(AgeGapPerGOPath, index_col=0)[GOs.GO.tolist()]
# Pairwise correlation between the selected GO columns.
heatmap = AgeGapPerGO.corr()
# True on and above the diagonal: those cells are blanked and then set to 0, so
# each pair is kept once. Despite the variable name, the values that survive
# are the ones strictly below the diagonal.
onOrAboveDiagonal = np.triu(np.ones(heatmap.shape, dtype=bool))
upper_triangle_values = heatmap.mask(onOrAboveDiagonal).fillna(0)
upper_triangle_values

Unnamed: 0,GO:0000012,GO:0006370,GO:0006378,GO:0006382,GO:0006388,GO:0006390,GO:0006398,GO:0006402,GO:0006418,GO:0006367,...,GO:0006298,GO:0006284,GO:0006310,GO:0006268,GO:0006260,GO:0000727,GO:0000724,GO:0000723,GO:0006270,GO:0006471
GO:0000012,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
GO:0006370,0.049341,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
GO:0006378,0.070961,0.003836,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
GO:0006382,0.056345,0.019158,0.116190,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
GO:0006388,0.046119,0.002030,0.410208,0.086604,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GO:0000727,0.064204,0.034033,0.099053,0.065347,0.061698,0.038185,0.097048,0.099018,0.018187,0.107491,...,0.100078,0.105788,0.121308,0.447384,0.267345,0.000000,0.000000,0.000000,0.000000,0.0
GO:0000724,0.055913,0.021844,0.200475,0.128937,0.139109,0.151010,0.122891,0.207820,0.061002,0.189756,...,0.482623,0.455134,0.557668,0.473411,0.561821,0.148802,0.000000,0.000000,0.000000,0.0
GO:0000723,0.046963,0.013090,0.188692,0.098206,0.114406,0.127186,0.092199,0.174965,0.084472,0.203067,...,0.468270,0.498767,0.383007,0.370897,0.508402,0.112793,0.583297,0.000000,0.000000,0.0
GO:0006270,0.037398,0.041613,0.115131,0.084389,0.085501,0.067931,0.126112,0.135955,0.043796,0.143153,...,0.124710,0.140626,0.178601,0.491169,0.304174,0.676463,0.197531,0.150923,0.000000,0.0


In [42]:
# step6 prepare data for plot
# Build the {"nodes": [...], "links": [...]} structure that step7 dumps to JSON.
CORRELATION_THRESHOLD = 0.25  # a link is drawn only when |correlation| exceeds this

# One node per GO term; the node group is the cluster the term belongs to.
nodes = GOs[["GO", "Cluster"]].rename(columns={"GO": "id", "Cluster": "group"})
print(GOs.Cluster.unique())
nodes = nodes.to_dict(orient='records')

# Scan the matrix once with numpy instead of nested Python loops.
# argwhere returns (row, column) positions in row-major order, the same order
# in which the links were previously appended.
corrValues = upper_triangle_values.to_numpy()
rowLabels = upper_triangle_values.index
columnLabels = upper_triangle_values.columns
link = [
    {
        "source": rowLabels[i],
        "target": columnLabels[j],
        # plain Python float for the JSON encoder
        "value": float(corrValues[i, j]),
    }
    for i, j in np.argwhere(np.abs(corrValues) > CORRELATION_THRESHOLD)
]

result = {"nodes": nodes, "links": link}
file_name = "./dataSub.json"

[0 2]


In [None]:
# step7 plot in D3
# run the code below and open the website http://127.0.0.1:5000
# Write the nodes/links structure from step6 to file_name first.
with open(file_name, "w") as out_file:
    json.dump(result, out_file)

app = Flask(__name__)


@app.route("/")
def index():
    """Render the index.html template for the root URL."""
    return render_template("./index.html")


# app.run() blocks until the server is stopped.
if __name__ == "__main__":
    app.run()