In [7]:
import pandas as pd

from analytics.network_generation import generate_comention_network
from analytics.data_preparation import (mentions_to_occurrence,
                                        is_experiment_related,
                                        dummy_clean_up,
                                        is_not_single_letter)
from analytics.centrality import (compute_degree_centrality,
                                  compute_pagerank_centrality,
                                  compute_betweenness_centrality)

## 1. Read mentions data

In [8]:
# Read the first 10000 rows of the mention data sample and derive the
# paper/section/paragraph identifiers from the colon-separated `paper_id`
# value (e.g. "1:Introduction:2"):
#   * paper     -> first component of the identifier
#   * section   -> first two components joined by ":"
#   * paragraph -> the full identifier (the `paper_id` column, renamed)
# NOTE(review): the identifier is split on every ":", so a section title that
# itself contains ":" (including the " ::: " hierarchy separator) is cut at
# its first colon, e.g. "1:Data Collection ::: Methods:4" yields the section
# "1:Data Collection " (with a trailing space) -- confirm this is intended.
mentions = (
    pd.read_csv("data/mention_data_sample.csv", nrows=10000)
    .assign(
        paper=lambda frame: frame["paper_id"].apply(
            lambda identifier: identifier.split(":")[0]),
        section=lambda frame: (
            frame["paper_id"]
            .apply(lambda identifier: identifier.split(":"))
            .apply(lambda parts: f"{parts[0]}:{parts[1]}")),
    )
    .rename(columns={"paper_id": "paragraph"})
)

In [9]:
# Display the mentions table, including the derived `paper` and `section` columns
mentions

Unnamed: 0,entity,paragraph,paper,section
0,tract,1:Introduction:2,1,1:Introduction
1,nasopharyngeal,1:Data Collection ::: Methods:4,1,1:Data Collection
2,tract,1:Data Collection ::: Methods:4,1,1:Data Collection
3,lung,1:Results:10,1,1:Results
4,heart,1:Results:10,1,1:Results
...,...,...,...,...
9995,A549 cells,19:Caption:44,19,19:Caption
9996,hNTH,19:Caption:44,19,19:Caption
9997,A549 cells,19:Caption:44,19,19:Caption
9998,hNTH,19:Caption:44,19,19:Caption


## 2. Transform mentions into occurrences

In [10]:
# Turn the individual mentions into an occurrence table plus a `counts`
# object; both results are displayed in the cells that follow.
# NOTE(review): the per-argument notes below are inferred from the argument
# names and the printed log -- confirm against analytics.data_preparation.
occurrence_data, counts = mentions_to_occurrence(
    mentions,
    term_column="entity",  # column of `mentions` that holds the term
    factor_columns=["paper", "section", "paragraph"],  # identifier columns derived when loading the data
    term_cleanup=dummy_clean_up,  # presumably applied to each term before aggregation -- TODO confirm
    term_filter=is_not_single_letter,  # presumably discards one-letter terms -- TODO confirm
    # Boolean mask that is True where `is_experiment_related(section)` is falsy;
    # presumably only the rows marked True are kept -- TODO confirm.
    mention_filter=lambda data: ~data["section"].apply(is_experiment_related),
    dump_prefix="data/example_")  # presumably the path prefix used when saving the occurrence data

Cleaning up the entities...
Aggregating occurrences of entities....
Saving the occurrence data....


In [11]:
# Display the occurrence table returned by mentions_to_occurrence
occurrence_data

Unnamed: 0_level_0,paragraph,paper,section
entity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1 nucleotide,{6:The Discontinuous Step In Nidovirus Sg Rna ...,{6},{6:The Discontinuous Step In Nidovirus Sg Rna ...
129,{19:Background:4},{19},{19:Background}
"10/100,000",{8:Acute Respiratory Distress Syndrome:14},{8},{8:Acute Respiratory Distress Syndrome}
104,{9:Pulmonary Vascular Disease ::: Review:23},{9},{9:Pulmonary Vascular Disease }
11c,{14:A10L And L4R Associate With Microtubules I...,{14},{14:A10L And L4R Associate With Microtubules I...
...,...,...,...
–208,{5:Syncytia Formation:17},{5},{5:Syncytia Formation}
–met,{11:In Vitro Transcription And Translation :::...,{11},{11:In Vitro Transcription And Translation }
†in,{7:Caption:27},{7},{7:Caption}
∼250 kda,"{12:Discussion:21, 12:The Mechanism Of Dominan...",{12},"{12:Discussion, 12:The Mechanism Of Dominant-N..."


In [12]:
# Keep only the entities found in more than one paragraph; entities seen in a
# single paragraph usually represent noisy terms.
appears_in_multiple_paragraphs = occurrence_data["paragraph"].apply(
    lambda paragraph_ids: len(paragraph_ids) > 1)
occurrence_data = occurrence_data[appears_in_multiple_paragraphs]
occurrence_data

Unnamed: 0_level_0,paragraph,paper,section
entity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1620,{2:Protein Tyrosine Nitration In The Lung: Doe...,{2},"{2:Injurious Properties Of No•, 2:Protein Tyro..."
2-oxoglutarate,"{20:Background:2, 20:The Viral Alkb Domains Ar...",{20},"{20:Background, 20:The Viral Alkb Domains Are ..."
2c cells,"{25:Caption:26, 25:Rna Isolation And Different...",{25},"{25:Discussion, 25:Caption, 25:Rna Isolation A..."
3-methylcytosine,"{20:Background:2, 20:Abstract:1, 20:The Alkb D...",{20},"{20:Background, 20:Abstract, 20:The Alkb Domai..."
3-nitrotyrosine,{2:Protein Tyrosine Nitration In The Lung: Doe...,{2},"{2:Injurious Properties Of No•, 2:Protein Tyro..."
...,...,...,...
β-galactosidase,{11:Identification Of Antizyme Genes In Nemato...,{11},{11:Translational Frameshifting During Express...
γ-tubulin,{14:Vaccinia Virus Infection Disrupts Centroso...,{14},{14:Vaccinia Virus Infection Disrupts Centroso...
–20,{12:Immunofluorescence Staining ::: Materials ...,"{14, 12}","{12:Immunofluorescence Staining , 14:Antibodie..."
∼250 kda,"{12:Discussion:21, 12:The Mechanism Of Dominan...",{12},"{12:Discussion, 12:The Mechanism Of Dominant-N..."


In [13]:
# Display the `counts` mapping returned by mentions_to_occurrence
# (one integer per factor column; presumably the number of distinct values -- TODO confirm)
counts

{'paper': 41, 'section': 465, 'paragraph': 1045}

## 3. Generate co-occurrence networks

In [14]:
# Build the paper-level co-mention network from all remaining entities,
# limited to 1000 edges (the log below reports when the edge limit is reached).
# counts["paper"] is passed as the second positional argument -- presumably the
# total number of papers; confirm against generate_comention_network.
paper_comention_network_1000_edges = generate_comention_network(
    occurrence_data, counts["paper"],
    factor_column="paper",
#     n_most_frequent=100,  (disabled here: no restriction on the number of entities)
    limit=1000,
    parallelize=False)

Examining 400960 pairs of terms for co-occurrence...


Processed 1 (0%) pairs     [1KProcessed 2 (0%) pairs     [1KProcessed 3 (0%) pairs     [1KProcessed 4 (0%) pairs     [1KProcessed 5 (0%) pairs     [1KProcessed 6 (0%) pairs     [1KProcessed 7 (0%) pairs     [1KProcessed 8 (0%) pairs     [1KProcessed 9 (0%) pairs     [1KProcessed 10 (0%) pairs     [1KProcessed 11 (0%) pairs     [1KProcessed 12 (0%) pairs     [1KProcessed 13 (0%) pairs     [1KProcessed 14 (0%) pairs     [1KProcessed 15 (0%) pairs     [1KProcessed 16 (0%) pairs     [1KProcessed 17 (0%) pairs     [1KProcessed 18 (0%) pairs     [1KProcessed 19 (0%) pairs     [1KProcessed 20 (0%) pairs     [1KProcessed 21 (0%) pairs     [1KProcessed 22 (0%) pairs     [1KProcessed 23 (0%) pairs     [1KProcessed 24 (0%) pairs     [1KProcessed 25 (0%) pairs     [1KProcessed 26 (0%) pairs     [1KProcessed 27 (0%) pairs     [1KProcessed 28 (0%) pairs     [1KProcessed 29 (0%) pairs     [1KProcessed 30 (0%) pairs     [1KProcessed 31 (0%) p

Processed 723 (0%) pairs     [1KProcessed 724 (0%) pairs     [1KProcessed 725 (0%) pairs     [1KProcessed 726 (0%) pairs     [1KProcessed 727 (0%) pairs     [1KProcessed 728 (0%) pairs     [1KProcessed 729 (0%) pairs     [1KProcessed 730 (0%) pairs     [1KProcessed 731 (0%) pairs     [1KProcessed 732 (0%) pairs     [1KProcessed 733 (0%) pairs     [1KProcessed 734 (0%) pairs     [1KProcessed 735 (0%) pairs     [1KProcessed 736 (0%) pairs     [1KProcessed 737 (0%) pairs     [1KProcessed 738 (0%) pairs     [1KProcessed 739 (0%) pairs     [1KProcessed 740 (0%) pairs     [1KProcessed 741 (0%) pairs     [1KProcessed 742 (0%) pairs     [1KProcessed 743 (0%) pairs     [1KProcessed 744 (0%) pairs     [1KProcessed 745 (0%) pairs     [1KProcessed 746 (0%) pairs     [1KProcessed 747 (0%) pairs     [1KProcessed 748 (0%) pairs     [1KProcessed 749 (0%) pairs     [1KProcessed 750 (0%) pairs     [1KProcessed 751 (0%) pairs     [1KProcessed 752 

Processed 1048 (0%) pairs     [1KProcessed 1049 (0%) pairs     [1KProcessed 1050 (0%) pairs     [1KProcessed 1051 (0%) pairs     [1KProcessed 1052 (0%) pairs     [1KProcessed 1053 (0%) pairs     [1KProcessed 1054 (0%) pairs     [1KProcessed 1055 (0%) pairs     [1KProcessed 1056 (0%) pairs     [1KProcessed 1057 (0%) pairs     [1KProcessed 1058 (0%) pairs     [1KProcessed 1059 (0%) pairs     [1KProcessed 1060 (0%) pairs     [1KProcessed 1061 (0%) pairs     [1KProcessed 1062 (0%) pairs     [1KProcessed 1063 (0%) pairs     [1KProcessed 1064 (0%) pairs     [1KProcessed 1065 (0%) pairs     [1KProcessed 1066 (0%) pairs     [1KProcessed 1067 (0%) pairs     [1KProcessed 1068 (0%) pairs     [1KProcessed 1069 (0%) pairs     [1KProcessed 1070 (0%) pairs     [1KProcessed 1071 (0%) pairs     [1KProcessed 1072 (0%) pairs     [1KProcessed 1073 (0%) pairs     [1KProcessed 1074 (0%) pairs     [1KProcessed 1075 (0%) pairs     [1KProcessed 1076 (0%) 

Reached the edge limit (1000)  [1K
Generated 1000 edges                    
Created a co-occurrence graph:
	number of nodes:  896
	number of edges:  1000
Saving the edges...
Creating a graph object...


In [15]:
# Build the paper-level co-mention network from the 100 most frequent
# entities only, with no edge limit.
# counts["paper"] is passed as the second positional argument -- presumably the
# total number of papers; confirm against generate_comention_network.
paper_comention_network_100_most_frequent = generate_comention_network(
    occurrence_data, counts["paper"],
    factor_column="paper",
    n_most_frequent=100,
#     limit=1000,  (disabled here: no cap on the number of edges)
    parallelize=False)

Fitering data.....
Selected 100 most frequent terms
Examining 4950 pairs of terms for co-occurrence...
Generated 3421 edges                    
Created a co-occurrence graph:
	number of nodes:  100
	number of edges:  3421
Saving the edges...
Creating a graph object...


In [16]:
# # Disabled: full network, with neither the entity limit nor the edge limit
# paper_comention_network = generate_comention_network(
#     occurrence_data, counts["paper"],
#     factor_column="paper",
# #     n_most_frequent=100,
# #     limit=1000,
#     parallelize=False)

## 4. Compute centralities

We compute the degree and PageRank centralities using only the raw `frequency` weight.

In [17]:
# Weight names passed to the degree and PageRank centrality computations below
weights = ["frequency"]

In [18]:
# Degree centrality on the 100-most-frequent-entities network for each name in
# `weights`; the last argument (10) matches the number of top nodes printed.
degree_centrality = compute_degree_centrality(paper_comention_network_100_most_frequent, weights, 10)

Top n nodes by frequency:
	human (420)
	virus (353)
	humans (312)
	viral (284)
	lung (283)
	infection (282)
	dna (266)
	ards (256)
	bacterial (256)
	animal (255)



In [19]:
# PageRank centrality on the 100-most-frequent-entities network for each name
# in `weights`; the last argument (10) matches the number of top nodes printed.
pagerank_centrality = compute_pagerank_centrality(paper_comention_network_100_most_frequent, weights, 10)

Top n nodes by frequency:
	human (0.03)
	virus (0.02)
	humans (0.02)
	infection (0.02)
	viral (0.02)
	lung (0.02)
	dna (0.02)
	bacterial (0.02)
	animal (0.02)
	animals (0.02)



We then compute the betweenness centrality based on the PPMI and NPMI distances (`distance_ppmi` and `distance_npmi`).

In [24]:
# Betweenness centrality on the 100-most-frequent-entities network, computed
# once per distance name; the last argument (10) matches the number of top
# nodes printed for each of them.
betweenness_centrality = compute_betweenness_centrality(
    paper_comention_network_100_most_frequent, ["distance_ppmi", "distance_npmi"], 10)

Top n nodes by distance_ppmi:
	cell lines (0.06547447261732976)
	alveolar macrophages (0.047124549675570074)
	amino acid sequence (0.04497354497354497)
	co2 (0.04057926200783344)
	antioxidants (0.033897036448056854)
	aids (0.021713735999450284)
	epithelial cells (0.02170023853697323)
	human disease (0.021438878581735724)
	asthmatic (0.021208930902808454)
	allergic (0.01941180512609084)

Top n nodes by distance_npmi:
	transcription factors (0.03150553150553151)
	mouse (0.0282415996701711)
	amino acids (0.027004741290455575)
	heart (0.025613275613275612)
	amino acid (0.024324881467738608)
	human disease (0.023809523809523808)
	epithelial cells (0.023088023088023088)
	cell lines (0.02198859341716484)
	tumor (0.020476877619734764)
	infected cells (0.02027760599189171)



In [None]:
# print("Detecting communities....")
# detect_communities(graph, weight="frequency", set_attr="community")
# detect_communities(graph, weight="ppmi", set_attr="community_ppmi")
# detect_communities(graph, weight="npmi", set_attr="community_npmi")

# print("Saving the nodes...")
# save_nodes(graph, os.path.join(dump_path, "{}_{}_node_list.pkl".format(factor, N)))

# print("Computing spanning trees...")
# tree_ppmi = nx.minimum_spanning_tree(graph, weight="distance_ppmi")
# tree_npmi = nx.minimum_spanning_tree(graph, weight="distance_npmi")

# print("Saving spanning trees...")
# edges = tree_ppmi.edges()
# save_edges(
#     tree_ppmi, os.path.join(dump_path, "{}_{}_ppmi_spanning_tree.pkl".format(factor, N)))
# save_edges(
#     tree_npmi, os.path.join(dump_path, "{}_{}_npmi_spanning_tree.pkl".format(factor, N)))