In [27]:
# Import requirement libraries

import networkx as nx
import collections
import pandas as pd
import math
import csv

In [28]:
# read data
#df_L = pd.read_csv("WLUSC.csv")
df_L=pd.read_excel("PPI_DIP dataset..xlsx")

df_L.head(10)

Unnamed: 0,Q0050,YER142C
0,Q0080,YBL099W
1,Q0080,YJR121W
2,Q0085,YBL099W
3,Q0085,YDR298C
4,Q0085,YDR377W
5,Q0085,YJR121W
6,Q0085,YKL016C
7,Q0085,YML081C-A
8,Q0085,YPL078C
9,Q0130,YBL099W


In [29]:
# Create the directed and weighted graph using Source and Target for connections.

G_L = nx.from_pandas_edgelist(df_L, 'Q0050', 'YER142C', create_using=nx.DiGraph())

# Info of network

print('Lung Cancer Network:')
print(nx.info(G_L))
G_L.is_directed()

Lung Cancer Network:
DiGraph with 5092 nodes and 24742 edges



  print(nx.info(G_L))


True

In [30]:
# Check connected or disconnected network --> directed--> Strongly/Weakly connectivity.

print('Is Lung Cancer Network strongly connected?', nx.is_strongly_connected(G_L))
print('Is Lung Cancer Network weakly connected?', nx.is_weakly_connected(G_L))
print('\n')

#Returns number of strongly connected components in graph.
print('The number strongly connected components in Lung Cancer Network:', nx.number_strongly_connected_components(G_L))

#Returns the number of weakly connected components in graph.
print('The number weakly connected components in Lung Cancer Network:', nx.number_weakly_connected_components(G_L))

Is Lung Cancer Network strongly connected? False
Is Lung Cancer Network weakly connected? False


The number strongly connected components in Lung Cancer Network: 5092
The number weakly connected components in Lung Cancer Network: 21


In [31]:
# Generate connected components and select the largest:

largest_component_L = max(nx.weakly_connected_components(G_L), key=len)


Gconnected_L = G_L.subgraph(largest_component_L)

# UNFrozed the graph

Gconnected_LL = nx.Graph(Gconnected_L)
# remove the self loop node in the graph 

Gconnected_LL.remove_edges_from(nx.selfloop_edges(Gconnected_LL))
print('Lung Cancer Network')
print(nx.info(Gconnected_L))

Lung Cancer Network
DiGraph with 5051 nodes and 24721 edges



  print(nx.info(Gconnected_L))


In [32]:
# k- core calculation for the largest component of the graph

k_cores = {}          #dictionary to hold proteins and the highest k-core they belong to
highest_kcore =0    #keep track of the hightest recored k-core

        #Build a network
protein_cores = nx.core_number(Gconnected_LL)      #get max k-cores from the network for each protein

 #group proteins based on the highest k-core in the network where each protein belong
for protein, k_core in protein_cores.items():

     if highest_kcore < k_core: #keep track of the highest k-core
         highest_kcore = k_core

     #group the proteins in a dictionary based on highest k_core they belong to
     if k_core in k_cores:
         k_cores[k_core].append(protein)
     else:
         k_cores[k_core]=[protein]    

print("The highest k-core is a {0}-core and there are {1} proteins in that {0}-core. \n"
       "The proteins are: {2}".format(highest_kcore,len(k_cores[highest_kcore]),k_cores[highest_kcore]))


The highest k-core is a 17-core and there are 167 proteins in that 17-core. 
The proteins are: ['YBL099W', 'YJR121W', 'YBR217W', 'YDR523C', 'YOR351C', 'YCL018W', 'YDR171W', 'YGL206C', 'YKR026C', 'YBR118W', 'YML064C', 'YNL064C', 'YPL240C', 'YBR036C', 'YBR183W', 'YCL025C', 'YEL063C', 'YGL200C', 'YGR060W', 'YHR140W', 'YJL117W', 'YPL076W', 'YPL264C', 'YJL066C', 'YPL235W', 'YBR055C', 'YER133W', 'YBR020W', 'YBR127C', 'YDL029W', 'YDL059C', 'YFL037W', 'YFL039C', 'YJR045C', 'YJR077C', 'YML085C', 'YMR058W', 'YMR214W', 'YPR110C', 'YGL190C', 'YHR135C', 'YDR099W', 'YFR053C', 'YKL035W', 'YMR205C', 'YBR290W', 'YDL017W', 'YDR331W', 'YAL021C', 'YBR072W', 'YGL048C', 'YGR092W', 'YJL130C', 'YLR180W', 'YLR259C', 'YBR159W', 'YJR117W', 'YHR030C', 'YBR196C', 'YLR044C', 'YMR186W', 'YOR181W', 'YBR017C', 'YNL189W', 'YLR175W', 'YMR106C', 'YDL126C', 'YJL138C', 'YKL152C', 'YPL061W', 'YJL008C', 'YOR212W', 'YGR260W', 'YDL229W', 'YCL040W', 'YAR007C', 'YJL034W', 'YJL173C', 'YML124C', 'YMR059W', 'YGL237C', 'YHL048W', 'Y

In [33]:
#select all nodes in  the k-cores (k=1,2,......,highest_kcore) of sub-graph
C=1 #Numbers of cores(C=1 all core)
nodes_L2=[]
nodes_L3=[]
nodes_L2 += [ k_cores[key] for key in  k_cores if key>=C]
[nodes_L3.extend(i) for i in nodes_L2];       

In [34]:
# Get network nodes
#nodes_L = list(Gconnected_L.nodes)
print('Lung Cancer Network:')
print(len(nodes_L3))

Lung Cancer Network:
5051


In [35]:
#create sub-graph by k-core nodes

Gconnected_L=Gconnected_L.subgraph(nodes_L3)

In [36]:
# Save them

pd.DataFrame(nodes_L3, columns=['Name']).to_csv("./node_L.csv", sep=',',index=False)

In [37]:
# Calculate degree

print("Node  Degree")
for v in Gconnected_L:
    print(f"{v:4} {Gconnected_L.degree(v):6}")

Node  Degree
Q0080      2
YBL099W    109
YJR121W    127
Q0085      7
YDR298C      5
YDR377W      6
YKL016C      7
YML081C-A      6
YPL078C      9
Q0130      2
Q0250      3
YBR024W      4
YBR037C      3
YER154W      6
Q0275      1
R0020C      3
YBL056W     26
YBR217W     91
YDR523C     45
R0030W      2
YOR101W      6
YOR351C     40
YAL001C      1
YBR123C      8
YAL002W      8
YCL018W    156
YDR171W     61
YGL206C     64
YKR026C     83
YLR148W     19
YLR396C      7
YMR231W      7
YPL045W      7
YAL003W      3
YBR118W    105
YKL081W     29
YML064C    164
YAL004W      2
YGL181W     44
YML109W     34
YAL005C     15
YAR002W     13
YCL028W     22
YDR192C     31
YHR069C     12
YLR310C     71
YLR335W     19
YLR347C     60
YNL007C     16
YNL064C    102
YNL077W      5
YOR098C     45
YOR151C     48
YOR160W      9
YPL240C     63
YPR010C     22
YAL007C     25
YAR002C-A      6
YBR036C     40
YBR183W     34
YCL025C     36
YDL054C     16
YDR087C     12
YDR101C     18
YDR127W     25
YEL002C     38
YEL06

In [38]:
# Calculate degrees and frequencies

degree_sequence_L = sorted(dict(nx.degree(Gconnected_L)).values(),reverse=False) # degree sequence
degreeCount_L = collections.Counter(degree_sequence_L)
degrees_L, frequency_L = zip(*degreeCount_L.items())

print(degrees_L)
frequency_L

(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 75, 76, 78, 79, 80, 81, 83, 84, 86, 89, 91, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 105, 106, 109, 110, 111, 113, 116, 120, 124, 125, 126, 127, 129, 139, 152, 156, 159, 164, 168, 174, 176, 216, 229, 280)


(1006,
 754,
 563,
 385,
 318,
 206,
 188,
 176,
 128,
 103,
 91,
 79,
 72,
 62,
 52,
 69,
 49,
 43,
 42,
 39,
 33,
 38,
 37,
 33,
 22,
 26,
 21,
 20,
 23,
 13,
 14,
 15,
 15,
 19,
 14,
 18,
 20,
 7,
 7,
 12,
 8,
 10,
 8,
 14,
 8,
 4,
 4,
 6,
 7,
 4,
 3,
 6,
 1,
 2,
 2,
 4,
 5,
 4,
 5,
 6,
 3,
 2,
 2,
 6,
 2,
 4,
 7,
 1,
 2,
 1,
 4,
 5,
 3,
 2,
 3,
 2,
 1,
 1,
 1,
 4,
 3,
 2,
 1,
 1,
 1,
 4,
 2,
 3,
 1,
 2,
 1,
 1,
 2,
 2,
 1,
 2,
 1,
 1,
 2,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1)

In [39]:
# deg_centrality_L = nx.degree_centrality(Gconnected_L)

# degreeC_L = pd.DataFrame(list(deg_centrality_L.items()),
#                        columns = ['Name','degree centrality']) 

# print('Lung Cancer Network:')
# print('Mean Degree Centrality:', degreeC_L['degree centrality'].mean())
# print('Min Degree Centrality:', degreeC_L['degree centrality'].min())
# print('Max Degree Centrality:', degreeC_L['degree centrality'].max())

# idD_L = degreeC_L['degree centrality'].idxmax()
# nameD_L = degreeC_L.Name[degreeC_L['degree centrality'].idxmax()]
# print('Max Degree Centrality is for:', nameD_L, '   with id:', idD_L)

In [40]:
# close_centrality_L = nx.closeness_centrality(Gconnected_L) 

# clsC_L = pd.DataFrame(list(close_centrality_L.items()),
#                       columns = ['Name','closeness centrality']) 

# print('Lung Cancer Network:')
# print('Mean Closeness Centrality:', clsC_L['closeness centrality'].mean())
# print('Min Closeness Centrality:', clsC_L['closeness centrality'].min())
# print('Max Closeness Centrality:', clsC_L['closeness centrality'].max())

# idC_L = clsC_L['closeness centrality'].idxmax()
# nameC_L = clsC_L.Name[clsC_L['closeness centrality'].idxmax()]                  
# print('Max Closeness Centrality is for:', nameC_L, '   with id:', idC_L)

In [41]:
# bet_centrality_L = nx.betweenness_centrality(Gconnected_L, normalized = True, endpoints = False) 

# btC_L = pd.DataFrame(list(bet_centrality_L.items()),
#                      columns = ['Name','betweenness centrality']) 

# print('Lung Cancer Network:')
# print('Mean Betweenness Centrality:', btC_L['betweenness centrality'].mean())
# print('Min Betweenness Centrality:', btC_L['betweenness centrality'].min())
# print('Max Betweenness Centrality:', btC_L['betweenness centrality'].max())

# idB_L = btC_L['betweenness centrality'].idxmax()
# nameB_L = btC_L.Name[btC_L['betweenness centrality'].idxmax()]                     
# print('Max betweenness Centrality is for:', nameB_L, '   with id:', idB_L)

In [42]:
# dfC_L = pd.merge(degreeC_L, clsC_L, how='inner', on= 'Name')
# pd.merge(dfC_L, btC_L, how='inner', on= 'Name').to_csv("./centralities_L.csv", sep=',',index=False)

In [43]:
#Influential Nodes via Information Entropy(EnRenew algorithm)

def EnRenewRank(G, topk, order):
    # N - 1
    all_degree = nx.number_of_nodes(G) - 1
    # avg degree
    k_ = nx.number_of_edges(G) * 2 / nx.number_of_nodes(G)
    # E<k>
    k_entropy = - k_ * ((k_ / all_degree) * math.log((k_ / all_degree)))

    # node's information pi
    node_information = {}
    for node in nx.nodes(G):
        information = (G.degree(node) / all_degree)
        node_information[node] = - information * math.log(information)

    # node's entropy Ei
    node_entropy = {}
    for node in nx.nodes(G):
        node_entropy[node] = 0
        for nbr in nx.neighbors(G, node):
            node_entropy[node] += node_information[nbr]

    rank = []
    for i in range(topk):
        # choose the max entropy node
        max_entropy_node, entropy = max(node_entropy.items(), key=lambda x: x[1])
        rank.append((max_entropy_node, entropy))

        cur_nbrs = nx.neighbors(G, max_entropy_node)
        for o in range(order):
            for nbr in cur_nbrs:
                if nbr in node_entropy:
                        node_entropy[nbr] -= (node_information[max_entropy_node] / k_entropy) / (2**o)
            next_nbrs = []
            for node in cur_nbrs:
                nbrs = nx.neighbors(G, node)
                next_nbrs.extend(nbrs)
            cur_nbrs = next_nbrs

        #set the information quantity of selected nodes to 0
        node_information[max_entropy_node] = 0
        # set entropy to 0
        node_entropy.pop(max_entropy_node)
    return rank


In [44]:
# run EnRenew algorithm on selected node
k=len(nodes_L3)
Entropy=EnRenewRank(Gconnected_L,k,1)
Entropy = pd.DataFrame(Entropy, index=None)
Entropy

Unnamed: 0,0,1
0,YCL018W,5.114608
1,YBR072W,4.286050
2,YBR127C,4.137219
3,YBR020W,3.947574
4,YBR196C,3.900941
...,...,...
5046,YML064C,-25.905045
5047,YPL204W,-28.157949
5048,YNL189W,-29.669243
5049,YLR259C,-31.018589


In [45]:
Entropy.to_csv('Information Entropy.csv',index=False)

In [93]:
#remove nods with entropy<E 


E=0.68

with open('Information Entropy.csv', 'r') as input_file:
    reader = csv.reader(input_file)
    with open('output.csv', 'w') as output_file:
        writer = csv.writer(output_file)

        for row in reader:
            if abs(float(row[1])) > E:
                writer.writerow([row[0]])


In [94]:
#preprocessing essential_proteins file

true_driver=pd.read_excel('essential_proteins.xlsx', header=None)

#true_driver

def remove_last_char(dataframe):
    for col in dataframe.columns:
        dataframe[col] = dataframe[col].str[:-1]
    return dataframe

remove_last_char(true_driver)

Unnamed: 0,0
0,YAL001C
1,YAL003W
2,YAL012W
3,YAL025C
4,YAL032C
...,...
1280,YPR182W
1281,YPR183W
1282,YPR186C
1283,YPR187W


In [95]:
#compare EnRenew algorithm results with essential_proteins file

df1 = pd.read_csv("output.csv", index_col=0, parse_dates=True)


df2 = pd.read_csv("true_driver.csv", index_col=0, parse_dates=True)

common_elements = df1[df1.isin(df2).all(1)]


In [96]:
# Correctly predicted proteins
print(common_elements)

Empty DataFrame
Columns: []
Index: [YCL018W, YBR072W, YBR127C, YBR020W, YBR196C, YBL099W, YBR118W, YBR160W, YBL007C, YJR045C, YER177W, YDL126C, YBR017C, YBR055C, YBR106W, YBR159W, YBL026W, YAL021C, YAR007C, YBL039C, YDR148C, YBL030C, YBL075C, YCR034W, YBL085W, YDL055C, YER022W, YAL015C, YBR036C, YAR014C, YGR103W, YBR069C, YDR099W, YBR274W, YDR502C, YDL140C, YDR225W, YFL018C, YBL056W, YCL037C, YHR052W, YJL141C, YDL143W, YGL019W, YBR018C, YBL049W, YFL016C, YDR155C, YDR004W, YAR019C, YER178W, YBL045C, YER006W, YBL004W, YKL152C, YDR386W, YBL002W, YAL036C, YDL014W, YBR109C, YBR234C, YBR011C, YER114C, YER091C, YAL007C, YAL016W, YBR193C, YDL147W, YBR155W, YAR002W, YAL041W, YBR142W, YBL016W, YDR365C, YDL212W, YJL199C, YJL012C, YFR034C, YOR187W, YKR093W, YDL078C, YER003C, YOR120W, YPR104C, YPR003C, YGR136W, YGL027C, YDR440W, YPR106W, YIL084C, YML046W, YDR212W, YNL093W, YML055W, YOL090W, YER054C, YGR158C, YGL105W, YMR118C, YKL120W, ...]

[1258 rows x 0 columns]
