In [1]:
import random
import pandas as pd
import networkx as nx
from itertools import combinations
from networkx.algorithms import community
import networkx.algorithms.community as nx_comm
from pyfiles.mod_topic_influence_base import *

In [2]:
# Restore variables previously persisted with IPython's %store magic.
# topic_with_value and group_head_id are consumed further down when group_top_topics1 is built;
# filtered_groups is not referenced in the visible cells — TODO confirm it is still needed.
#%store -r group_publication_titles
%store -r topic_with_value
%store -r filtered_groups
%store -r group_head_id
# %store -r topic_rank
# %store -r norm_distance_kl
# %store -r norm_distance_tau
#%store -r mgp_nodes

In [3]:
# Restore the LDA artefacts persisted with %store.
# Only lda_model is touched in the visible cells (its num_topics is displayed below);
# id2word and prepare_text_for_lda are presumably used outside this view — TODO confirm.
%store -r lda_model
%store -r id2word
%store -r prepare_text_for_lda

In [4]:
lda_model.num_topics

25

In [29]:
def get_top_group_topic(topics, top=5, min_weight=0.10):
    """Return the topics that most often exceed ``min_weight`` across a group's intervals.

    Parameters
    ----------
    topics : iterable of iterables of pairs
        One inner iterable per interval. Only index 0 (the topic id) and
        index 1 (its weight) of each pair are read.
    top : int, default 5
        Maximum number of topic ids to return.
    min_weight : float, default 0.10
        A pair counts only when its weight is strictly greater than this value.

    Returns
    -------
    list
        Up to ``top`` topic ids, ordered from most to least frequent. Topics
        with equal frequency keep the order in which they were first seen.
        The list is empty when no pair passes the weight cutoff.
    """
    # Every (interval, pair) whose weight passes the cutoff adds one occurrence of
    # its topic id. Intervals with no qualifying pair simply contribute nothing,
    # so no separate "is this interval empty" filter is required.
    topic_freq = Counter(
        pair[0]
        for topic_interval in topics
        for pair in topic_interval
        if pair[1] > min_weight
    )

    # most_common sorts by descending count and is stable for ties.
    return [topic for topic, _count in topic_freq.most_common(top)]

In [20]:
#len(topic_with_value[1])

In [23]:
#topic_with_value[-7]

In [27]:
#group_head_id[100]

In [28]:
#get_top_group_topic(topic_with_value[100])

In [30]:
def modified_jaccard_index1(topic_orders):
    """Rank-weighted Jaccard similarity between two ordered topic lists.

    Each topic at 0-based position ``i`` of a list of length ``n`` receives the
    weight ``n - i`` (so earlier topics weigh more; repeated topics accumulate).
    The score is the sum over topics of the smaller weight divided by the sum
    over topics of the larger weight.

    Parameters
    ----------
    topic_orders : sequence
        ``topic_orders[0]`` and ``topic_orders[1]`` are the two ordered topic
        lists. Their elements must be hashable; the lists may differ in length.

    Returns
    -------
    float
        A value in ``[0, 1]``; ``1.0`` for identical orderings, ``0.0`` when the
        lists share no topic. Two empty lists also yield ``0.0`` instead of
        raising ``ZeroDivisionError``.
    """
    def rank_weights(order):
        # Weight of a topic = sum of (n - position) over its occurrences.
        size = len(order)
        weights = Counter()
        for position, topic in enumerate(order):
            weights[topic] += size - position
        return weights

    weights1 = rank_weights(topic_orders[0])
    weights2 = rank_weights(topic_orders[1])

    # Counter "|" keeps the per-key maximum, "&" the per-key minimum.
    total_element_count = sum((weights1 | weights2).values())
    if total_element_count == 0:
        # Both lists are empty: there is nothing to compare, so report no similarity.
        return 0.0

    common_element_count = sum((weights1 & weights2).values())
    return common_element_count / total_element_count

In [31]:
# Co-topic occurrence graph
def construct_co_topic_graph1(group_top_topics, threshold=4):
    """List the group pairs that share more than ``threshold`` topics.

    Parameters
    ----------
    group_top_topics : mapping
        Maps a group id to an iterable of hashable topic ids.
    threshold : int, default 4
        A pair is kept when the number of distinct shared topics is strictly
        greater than this value.

    Returns
    -------
    list of tuple
        Unweighted ``(group1, group2)`` edges, in the order produced by
        ``itertools.combinations`` over the mapping's keys.
    """
    # Build each group's topic set once instead of once per pair.
    topic_sets = {group: set(group_top_topics[group]) for group in group_top_topics}
    return [
        (group1, group2)
        for group1, group2 in combinations(topic_sets, 2)
        if len(topic_sets[group1] & topic_sets[group2]) > threshold
    ]

In [32]:
# Co-topic occurrence graph
def construct_co_topic_graph2(group_top_topics, threshold = 0.3):
    """Build weighted edges between groups whose score falls below ``threshold``.

    Every unordered pair of keys in ``group_top_topics`` is scored with
    ``modified_jaccard_index(topics_of_first, topics_of_second)``; pairs scoring
    strictly less than ``threshold`` become ``(group1, group2, {"weight": score})``.

    NOTE(review): ``modified_jaccard_index`` is not defined in the visible
    notebook (presumably supplied by the star import — verify). Edges are kept
    for *low* scores here, the opposite of construct_co_topic_graph3 — confirm
    whether that helper returns a distance rather than a similarity.
    """
    scored_pairs = (
        (left, right, modified_jaccard_index(group_top_topics[left], group_top_topics[right]))
        for left, right in combinations(group_top_topics, 2)
    )
    return [
        (left, right, {"weight": score})
        for left, right, score in scored_pairs
        if score < threshold
    ]

In [33]:
def construct_co_topic_graph3(group_top_topics, threshold = 0.7):
    """Build weighted edges between groups whose topic orderings are similar.

    Every unordered pair of keys in ``group_top_topics`` is scored with
    ``modified_jaccard_index1((topics_of_first, topics_of_second))``.

    Parameters
    ----------
    group_top_topics : mapping
        Maps a group id to its ordered list of topic ids.
    threshold : float, default 0.7
        A pair is kept when its coefficient is strictly greater than this value.

    Returns
    -------
    list of tuple
        ``(group1, group2, {"weight": coefficient})`` for each retained pair,
        in ``itertools.combinations`` order.
    """
    edges = []
    # Stream the pairs: the number of pairs grows quadratically with the number of
    # groups, so materialising them (and their topic lists) up front costs a lot of
    # memory while only the retained edges are needed.
    for group1, group2 in combinations(group_top_topics, 2):
        coef = modified_jaccard_index1((group_top_topics[group1], group_top_topics[group2]))
        if coef > threshold:
            edges.append((group1, group2, {"weight": coef}))
    return edges

In [37]:
# Map each group head id to its most frequent topics. Ids and per-group topic data are
# paired positionally, so the two restored sequences are presumably aligned — TODO confirm.
group_top_topics1 = dict(zip(group_head_id, map(get_top_group_topic, topic_with_value)))

In [36]:
#group_top_topics

In [38]:
# Persist the mapping with %store so it can be restored later via `%store -r group_top_topics1`.
%store group_top_topics1

Stored 'group_top_topics1' (dict)


In [128]:
#group_top_topics

In [129]:
#modified_jaccard_index1((group_top_topics[258],group_top_topics[33688]))

In [130]:
#modified_jaccard_index1((group_top_topics[258],group_top_topics[13666]))

In [131]:
# %%time
# edges1 = construct_co_topic_graph1(group_top_topics)

In [132]:
#len(edges1)

In [133]:
#edges1[1]

In [134]:
#group_top_topics[14373]

In [135]:
#group_top_topics[258]

In [137]:
# %%time
# edges2 = construct_co_topic_graph2(group_top_topics)

In [138]:
# count=[]

# for i in group_top_topics:
#     if len(group_top_topics[i])!=15:
#         print(i)
#         count.append(len(group_top_topics[i]))

In [142]:
#modified_jaccard_index1((group_top_topics[281], group_top_topics[285]))

In [39]:
%%time
# Score every pair of groups and keep those whose coefficient is strictly above 0.50.
# This is an all-pairs comparison; the recorded run below took about 38 minutes of wall time.
edges3 = construct_co_topic_graph3(group_top_topics1, threshold=0.50)

CPU times: user 37min 8s, sys: 1min 20s, total: 38min 29s
Wall time: 38min 30s


In [40]:
#group_top_topics[281]

In [41]:
# %%time
# edges3_80 = construct_co_topic_graph3(group_top_topics, threshold=0.80)

In [42]:
len(edges3)

772236

In [51]:
# Subset of edges3 whose weight is strictly above 0.80.
# NOTE(review): edges3_80 is only measured with len() in the visible cells; the graph
# below is built from edges3 — confirm which list was meant to be used.
edges3_80 = [
    (group1, group2, attrs)
    for group1, group2, attrs in edges3
    if attrs["weight"] > 0.80
]

In [52]:
len(edges3_80)

13912

In [53]:
# Build the graph from the 0.50-threshold edge list (each edge carries a "weight" attribute).
# NOTE(review): this uses edges3, not edges3_80, yet the files written further down carry
# "_80_" in their names — confirm which edge list was intended.
G = nx.from_edgelist(edges3)

In [54]:
len(G.nodes)

15326

In [55]:
# Add every group id as a node so that groups without any retained edge still appear
# in the graph as isolated nodes. Iterating the dict yields its keys.
# group_top_topics1 is the mapping built in this notebook; the name group_top_topics is
# never assigned here and only resolved through leftover kernel state.
G.add_nodes_from(group_top_topics1)

In [56]:
len(G.nodes)

15433

In [57]:
# Serialise the graph to a pickle file in the working directory.
# NOTE(review): write_gpickle appears to have been removed in NetworkX 3.0 — confirm the
# installed version, or switch to pickle.dump when upgrading.
nx.write_gpickle(G, "topic_co_occurance_graph_80_1.gpickle")

In [58]:
# Also export the same graph as GraphML, using the lxml-based writer.
nx.write_graphml_lxml(G, "topic_co_occurance_graph_80_1.graphml")

In [22]:
# %%time
# lou_community = nx_comm.louvain_communities(G, seed=123)

In [23]:
# communities_generator = community.girvan_newman(G)

In [24]:
# %%time
# top_level_communities = next(communities_generator)
# next_level_communities = next(communities_generator)
# community = sorted(map(sorted, next_level_communities))

In [25]:
# top_community = sorted(map(sorted, top_level_communities))

In [26]:
#list(combinations([1,2,3],2))

In [27]:
#intt = random.randint(0,399)

In [28]:
#lou_community[intt]

In [29]:
%%time
# PageRank score for every node of G with alpha=0.85; the result's .items() are sorted below.
# NOTE(review): execution counts drop from In [58] to In [22] before this cell, so G
# presumably came from a different kernel session — confirm the notebook runs top to bottom.
pr = nx.pagerank(G, alpha=0.85)

CPU times: user 5.3 s, sys: 392 ms, total: 5.69 s
Wall time: 5.78 s


In [30]:
%%time
# Betweenness centrality for every node of G.
# The recorded run below took about 3 h 23 min of wall time.
# NOTE(review): consider caching the result, or the sampling parameter `k` of
# betweenness_centrality, if this has to be re-run — verify the accuracy trade-off first.
bc = nx.betweenness_centrality(G)

CPU times: user 3h 20min 53s, sys: 2min 4s, total: 3h 22min 57s
Wall time: 3h 23min 22s


In [31]:
# (node_id, score) pairs ordered from highest to lowest PageRank score.
page_rank = sorted(
    pr.items(),
    key=lambda node_score: node_score[1],
    reverse=True,
)

In [32]:
# Execute the companion notebook in this kernel.
# Presumably this is where mgpid2citation_count (used in the next cell) gets defined —
# it is not assigned anywhere in the visible cells; verify in that notebook.
%run read_researchers_metrics.ipynb

In [33]:
# Attach a citation count to each (node_id, pagerank) pair, keeping the PageRank order.
# Ids missing from mgpid2citation_count get None.
idd_pr_cite = [
    (node_id, score, mgpid2citation_count.get(node_id, None))
    for node_id, score in page_rank
]

In [34]:
idd_pr_cite[-10:]

[(88152, 1.3539761902616422e-05, 104.0),
 (162621, 1.3118564237214142e-05, 2.0),
 (174759, 1.292548179511198e-05, 1.0),
 (131248, 1.2823572562094066e-05, None),
 (54457, 1.2714974474790326e-05, None),
 (27213, 1.2476754138119969e-05, 14.0),
 (145545, 1.2167522613512705e-05, 17.0),
 (169327, 1.2032488739248145e-05, 89.0),
 (50755, 1.1855794954803403e-05, 1.0),
 (107620, 1.0183275084617625e-05, 21.0)]

In [35]:
idd_pr_cite[:10]

[(122825, 0.00016226253295364464, 36.0),
 (80020, 0.00016226253295364464, 143.0),
 (72099, 0.0001622625329536446, 434.0),
 (22289, 0.0001622625329536446, 1666.0),
 (83545, 0.0001622625329536446, 104.0),
 (38908, 0.0001622625329536446, 889.0),
 (77690, 0.0001622625329536446, 408.0),
 (79076, 0.0001622625329536446, 677.0),
 (149017, 0.00016121305766799847, 44.0),
 (197708, 0.00016121305766799847, 27.0)]

In [36]:
# Split the (id, pagerank, citation) triples into three parallel lists, one per column.
mgp_id = [idd for idd, val1, cite in idd_pr_cite]

In [37]:
pr_val = [val1 for idd, val1, cite in idd_pr_cite]

In [38]:
citation = [cite for idd, val1, cite in idd_pr_cite]

In [39]:
# Rows keep the order of idd_pr_cite, i.e. descending PageRank score.
dataframe = pd.DataFrame({"MGP_id":mgp_id, "pr_val":pr_val, "citation":citation})

In [40]:
dataframe.head()

Unnamed: 0,MGP_id,pr_val,citation
0,122825,0.000162,36.0
1,80020,0.000162,143.0
2,72099,0.000162,434.0
3,22289,0.000162,1666.0
4,83545,0.000162,104.0


In [41]:
# (node_id, centrality) pairs ordered from highest to lowest betweenness centrality.
bc_sorted = sorted(
    bc.items(),
    key=lambda node_score: node_score[1],
    reverse=True,
)

In [42]:
bc_sorted[:10]

[(5761, 0.0030746955650474274),
 (44073, 0.0027302061006327077),
 (85212, 0.0021197052768331583),
 (79775, 0.001983097577255981),
 (1231, 0.0018804775071371807),
 (8231, 0.0016057044983000632),
 (105187, 0.001579124058140984),
 (1313, 0.0014892039974712897),
 (199981, 0.001453376785676139),
 (18577, 0.0013815394740237454)]

In [43]:
# Look up each node's betweenness centrality by id.
# Series.map performs a plain key lookup and yields NaN for an id missing from bc,
# whereas Series.replace would leave the raw id in the column as if it were a score.
dataframe["bc"] = dataframe["MGP_id"].map(bc)

In [44]:
# Pairwise correlation of every column except the first one (MGP_id).
dataframe.iloc[:, 1:].corr()

Unnamed: 0,pr_val,citation,bc
pr_val,1.0,0.051752,-0.025258
citation,0.051752,1.0,0.003979
bc,-0.025258,0.003979,1.0


In [34]:
# import numpy as np
# array= np.random.randint(0, 100, size=(3, 3, 2))

In [4]:
#4==3 > 1