In [1]:
import pandas as pd

from analytics.network_generation import generate_comention_network
from analytics.data_preparation import (mentions_to_occurrence,
                                        is_experiment_related,
                                        dummy_clean_up,
                                        is_not_single_letter)
from analytics.metrics import (compute_degree_centrality,
                               compute_pagerank_centrality,
                               compute_betweenness_centrality,
                               detect_communities,
                               compute_all_metrics)
from analytics.export import (save_nodes,
                              save_to_gephi)
from analytics.paths import (minimum_spanning_tree,
                             top_n_paths,
                             top_n_tripaths,
                             single_shortest_path)

## 1. Read mentions data

In [2]:
# Load the first 10000 rows of the mention data sample and derive the paper
# and section identifiers from the colon-separated ID column (values look
# like "<paper>:<section title>:<paragraph index>", e.g. "1:Introduction:2").
mentions = (
    pd.read_csv("data/mention_data_sample.csv", nrows=10000)
    .assign(
        # Paper identifier: everything before the first ":".
        paper=lambda frame: frame["paper_id"].map(
            lambda pid: pid.split(":", 1)[0]),
        # Section identifier: the first two ":"-separated fields re-joined.
        # NOTE(review): a section title that itself contains ":" (including
        # the " ::: " subsection separator) is cut at its first colon.
        section=lambda frame: frame["paper_id"].map(
            lambda pid: "{}:{}".format(*pid.split(":")[:2])),
    )
    # The full identifier pinpoints a paragraph, so name the column after it.
    .rename(columns={"paper_id": "paragraph"})
)

In [3]:
# Preview the mentions together with the derived "paper" and "section" columns.
mentions

Unnamed: 0,entity,paragraph,paper,section
0,tract,1:Introduction:2,1,1:Introduction
1,nasopharyngeal,1:Data Collection ::: Methods:4,1,1:Data Collection
2,tract,1:Data Collection ::: Methods:4,1,1:Data Collection
3,lung,1:Results:10,1,1:Results
4,heart,1:Results:10,1,1:Results
...,...,...,...,...
9995,A549 cells,19:Caption:44,19,19:Caption
9996,hNTH,19:Caption:44,19,19:Caption
9997,A549 cells,19:Caption:44,19,19:Caption
9998,hNTH,19:Caption:44,19,19:Caption


## 2. Transform mentions into occurrences

In [4]:
# Aggregate the mention rows into per-entity occurrence data plus a `counts`
# dict. Judging by the outputs displayed in the following cells,
# `occurrence_data` is indexed by entity and holds, for each factor column,
# the set of paragraphs/papers/sections mentioning that entity, while
# `counts` maps each factor name to an integer.
occurrence_data, counts = mentions_to_occurrence(
    mentions,
    term_column="entity",
    factor_columns=["paper", "section", "paragraph"],
    term_cleanup=dummy_clean_up,
    term_filter=is_not_single_letter,
    # Boolean mask that is True for rows whose section is NOT flagged by
    # `is_experiment_related` (presumably the rows that are kept — TODO confirm).
    mention_filter=lambda data: ~data["section"].apply(is_experiment_related),
    # NOTE(review): presumably the path prefix used when saving the
    # occurrence data — confirm against `mentions_to_occurrence`.
    dump_prefix="data/example_")

Cleaning up the entities...
Aggregating occurrences of entities....
Saving the occurrence data....


In [5]:
# Inspect the aggregated occurrences (one row per entity).
occurrence_data

Unnamed: 0_level_0,paragraph,paper,section
entity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1 nucleotide,{6:The Discontinuous Step In Nidovirus Sg Rna ...,{6},{6:The Discontinuous Step In Nidovirus Sg Rna ...
129,{19:Background:4},{19},{19:Background}
"10/100,000",{8:Acute Respiratory Distress Syndrome:14},{8},{8:Acute Respiratory Distress Syndrome}
104,{9:Pulmonary Vascular Disease ::: Review:23},{9},{9:Pulmonary Vascular Disease }
11c,{14:A10L And L4R Associate With Microtubules I...,{14},{14:A10L And L4R Associate With Microtubules I...
...,...,...,...
–208,{5:Syncytia Formation:17},{5},{5:Syncytia Formation}
–met,{11:In Vitro Transcription And Translation :::...,{11},{11:In Vitro Transcription And Translation }
†in,{7:Caption:27},{7},{7:Caption}
∼250 kda,{12:The Mechanism Of Dominant-Negative Inhibit...,{12},"{12:Discussion, 12:The Mechanism Of Dominant-N..."


In [6]:
# Keep only the entities mentioned in more than one paragraph; terms that
# show up in a single paragraph usually turn out to be noise.
occurrence_data = occurrence_data.loc[occurrence_data["paragraph"].map(len).gt(1)]
occurrence_data

Unnamed: 0_level_0,paragraph,paper,section
entity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1620,{2:Protein Tyrosine Nitration In The Lung: Doe...,{2},"{2:Protein Tyrosine Nitration In The Lung, 2:I..."
2-oxoglutarate,{20:The Viral Alkb Domains Are Most Likely Fun...,{20},"{20:Background, 20:The Viral Alkb Domains Are ..."
2c cells,{25:Rna Isolation And Differential Display Pcr...,{25},"{25:Introduction, 25:Rna Isolation And Differe..."
3-methylcytosine,"{20:Background:2, 20:The Alkb Domain Probably ...",{20},"{20:Background, 20:The Alkb Domain Probably Pr..."
3-nitrotyrosine,{2:Protein Tyrosine Nitration In The Lung: Doe...,{2},"{2:Protein Tyrosine Nitration In The Lung, 2:I..."
...,...,...,...
β-galactosidase,{11:Translational Frameshifting During Express...,{11},{11:Identification Of Antizyme Genes In Nemato...
γ-tubulin,{14:Vaccinia Virus Infection Disrupts Centroso...,{14},{14:Vaccinia Virus Infection Disrupts Centroso...
–20,{12:Immunofluorescence Staining ::: Materials ...,"{14, 12}",{14:Antibodies And Immunofluorescence Microsco...
∼250 kda,{12:The Mechanism Of Dominant-Negative Inhibit...,{12},"{12:Discussion, 12:The Mechanism Of Dominant-N..."


In [7]:
# Per-factor counts returned by `mentions_to_occurrence` alongside the occurrence data.
counts

{'paper': 41, 'section': 465, 'paragraph': 1045}

## 3. Generate co-occurrence networks

In [8]:
# Build the paper-level co-mention network, capped at 1000 edges (the log
# below reports "Reached the edge limit (1000)").
paper_comention_network_1000_edges = generate_comention_network(
    occurrence_data, counts["paper"],
    factor_column="paper",
    # `n_most_frequent` is not passed in this variant: only the number of
    # edges is capped, the set of entities is not pre-selected.
    limit=1000,
    parallelize=False)

Examining 400960 pairs of terms for co-occurrence...
Reached the edge limit (1000)  [1K
Generated 1000 edges                    
Created a co-occurrence graph:
	number of nodes:  896
	number of edges:  1000
Saving the edges...
Creating a graph object...


In [9]:
# Build the paper-level co-mention network from the 100 most frequent
# entities only (the log below reports 4950 examined pairs).
paper_comention_network_100_most_frequent = generate_comention_network(
    occurrence_data, counts["paper"],
    factor_column="paper",
    n_most_frequent=100,
    # `limit` is not passed in this variant: the number of edges is not capped.
    parallelize=False)

Fitering data.....
Selected 100 most frequent terms
Examining 4950 pairs of terms for co-occurrence...
Generated 3421 edges                    
Created a co-occurrence graph:
	number of nodes:  100
	number of edges:  3421
Saving the edges...
Creating a graph object...


## 4. Compute centralities

We compute the degree and PageRank centralities using only the raw co-occurrence frequency (`frequency`) as the edge weight.

In [10]:
# Edge attribute(s) passed as weights to the degree and PageRank computations below.
weights = ["frequency"]

In [11]:
# Degree centrality on the 100-most-frequent network for each attribute in
# `weights`; the last argument (10) matches the number of top nodes printed below.
degree_centrality = compute_degree_centrality(paper_comention_network_100_most_frequent, weights, 10)

Top n nodes by frequency:
	human (420)
	virus (353)
	humans (312)
	viral (284)
	lung (283)
	infection (282)
	dna (266)
	ards (256)
	bacterial (256)
	animal (255)



In [12]:
# PageRank centrality on the same network and weights; the last argument (10)
# matches the number of top nodes printed below.
pagerank_centrality = compute_pagerank_centrality(paper_comention_network_100_most_frequent, weights, 10)

Top n nodes by frequency:
	human (0.03)
	virus (0.02)
	humans (0.02)
	infection (0.02)
	viral (0.02)
	lung (0.02)
	dna (0.02)
	bacterial (0.02)
	animal (0.02)
	animals (0.02)



We then compute the betweenness centrality using the PPMI- and NPMI-based distances (`distance_ppmi` and `distance_npmi`) as edge weights.

In [13]:
# Betweenness centrality computed once per distance attribute; the last
# argument (20) matches the number of top nodes printed per attribute below.
betweenness_centrality = compute_betweenness_centrality(
    paper_comention_network_100_most_frequent, ["distance_ppmi", "distance_npmi"], 20)

Top n nodes by distance_ppmi:
	cell lines (0.06547447261732976)
	alveolar macrophages (0.047124549675570074)
	amino acid sequence (0.04497354497354497)
	co2 (0.04057926200783344)
	antioxidants (0.033897036448056854)
	aids (0.021713735999450284)
	epithelial cells (0.02170023853697323)
	human disease (0.021438878581735724)
	asthmatic (0.021208930902808454)
	allergic (0.01941180512609084)
	heart (0.018793375936233075)
	bacterium (0.01742596028310314)
	amino acids (0.015838658695801554)
	transcription factors (0.01504844361987219)
	amino acid (0.014979729265443548)
	cystic fibrosis (0.01392324606610321)
	rats (0.013399299113584827)
	formaldehyde (0.013090084518655946)
	enzyme (0.012677798392084105)
	infected cells (0.012358276643990931)

Top n nodes by distance_npmi:
	transcription factors (0.03150553150553151)
	mouse (0.0282415996701711)
	amino acids (0.027004741290455575)
	heart (0.025613275613275612)
	amino acid (0.024324881467738608)
	human disease (0.023809523809523808)
	epithelial ce

## 5. Detect communities

In [14]:
# Run community detection once per edge weight. The returned value is not
# needed here, hence the throwaway `_`; `set_attr` presumably names the node
# attribute that receives the result — confirm against `detect_communities`.
for edge_weight, node_attr in (
        ("frequency", "community"),
        ("ppmi", "community_ppmi"),
        ("npmi", "community_npmi")):
    _ = detect_communities(
        paper_comention_network_100_most_frequent,
        weight=edge_weight, set_attr=node_attr)

Detecting communities...
Best network partition:
	 Number of communities: 2
	 Modularity: 0.1693805447207431
Detecting communities...
Best network partition:
	 Number of communities: 2
	 Modularity: 0.22869830233990954
Detecting communities...
Best network partition:
	 Number of communities: 3
	 Modularity: 0.24186849865223786


## 6. Compute all the metrics in one go

In [15]:
# Rebuild the 100-most-frequent paper-level network (same arguments as the
# earlier `paper_comention_network_100_most_frequent` call) and additionally
# pass `dump_path` for the edge list.
paper_comention_network = generate_comention_network(
    occurrence_data, counts["paper"],
    factor_column="paper",
    n_most_frequent=100,
    # `limit` is not passed: the number of edges is not capped.
    parallelize=False,
    # NOTE(review): presumably where the edge list is pickled — confirm.
    dump_path="data/paper_comention_edge_list.pkl")

Fitering data.....
Selected 100 most frequent terms
Examining 4950 pairs of terms for co-occurrence...
Generated 3421 edges                    
Created a co-occurrence graph:
	number of nodes:  100
	number of edges:  3421
Saving the edges...
Creating a graph object...


In [16]:
# Compute all metrics in a single call. Each metric needs to be told which
# edge attribute(s) to use. Judging by the node attributes inspected later in
# this notebook, results end up on the nodes under "<metric>_<attribute>"
# names such as "degree_frequency" or "community_npmi".
compute_all_metrics(
    paper_comention_network,
    degree_weights=["frequency"],
    pagerank_weights=["frequency"],
    betweenness_weights=["distance_ppmi", "distance_npmi"],
    community_weights=["frequency", "npmi"],
    print_summary=True)

Computing degree centrality statistics....
Top n nodes by frequency:
	human (420)
	virus (353)
	humans (312)
	viral (284)
	lung (283)
	infection (282)
	dna (266)
	ards (256)
	bacterial (256)
	animal (255)

Computing PageRank centrality statistics....
Top n nodes by frequency:
	human (0.03)
	virus (0.02)
	humans (0.02)
	infection (0.02)
	viral (0.02)
	lung (0.02)
	dna (0.02)
	bacterial (0.02)
	animal (0.02)
	animals (0.02)

Computing betweenness centrality statistics....
Top n nodes by distance_ppmi:
	cell lines (0.06547447261732976)
	alveolar macrophages (0.047124549675570074)
	amino acid sequence (0.04497354497354497)
	co2 (0.04057926200783344)
	antioxidants (0.033897036448056854)
	aids (0.021713735999450284)
	epithelial cells (0.02170023853697323)
	human disease (0.021438878581735724)
	asthmatic (0.021208930902808454)
	allergic (0.01941180512609084)

Top n nodes by distance_npmi:
	transcription factors (0.03150553150553151)
	mouse (0.0282415996701711)
	amino acids (0.0270047412904555

## 7. Export network and the computed metrics

In [17]:
# Save the graph nodes as a pickled pandas.DataFrame
# (presumably together with their node attributes — confirm against `save_nodes`).
save_nodes(paper_comention_network, "data/paper_comention_node_list.pkl")

In [18]:
# Peek at the attributes stored on the first node of the network.
first_node = list(paper_comention_network.nodes())[0]
paper_comention_network.nodes[first_node]

{'degree_frequency': 244,
 'pagerank_frequency': 0.015405249811721203,
 'betweenness_distance_ppmi': 0.0,
 'betweenness_distance_npmi': 0.002886002886002886,
 'community_frequency': 0,
 'community_npmi': 0}

In [19]:
# Export the full network for import into Gephi. Each mapping pairs an
# attribute name used in this notebook (left) with the name passed for the
# export (right).
save_to_gephi(
    paper_comention_network,
    "data/gephi_paper",
    node_attr_mapping={
        "degree_frequency": "Degree",
        "pagerank_frequency": "PageRank",
        "betweenness_distance_npmi": "Betweenness",
        "community_npmi": "Community",
    },
    edge_attr_mapping={"npmi": "Weight"},
)

## 8. Find spanning trees

In [20]:
# Minimum spanning tree of the network, using the "distance_npmi" edge attribute as the weight.
tree = minimum_spanning_tree(paper_comention_network, weight="distance_npmi")

In [21]:
# Export the spanning tree for Gephi with the same attribute-to-name
# mappings as used for the full network export.
save_to_gephi(
    tree,
    "data/gephi_paper_spanning_tree",
    node_attr_mapping={
        "degree_frequency": "Degree",
        "pagerank_frequency": "PageRank",
        "betweenness_distance_npmi": "Betweenness",
        "community_npmi": "Community",
    },
    edge_attr_mapping={"npmi": "Weight"},
)

## 9. Simple path search

In [22]:
# Demonstrates a failed search: "sars-cov-2" cannot be reached from "virus"
# in this network (see the messages printed below), so the error is caught
# and printed instead of aborting the notebook.
# NOTE(review): `except Exception` is broad — narrow it once the exact
# exception type raised by `top_n_paths` is confirmed.
try:
    paths, _ = top_n_paths(
        paper_comention_network, "virus", "sars-cov-2", n=10, weight="npmi",
        strategy="naive", pretty_print=True)
except Exception as e:
    print(e)

Target sars-cov-2 cannot be reachedfrom given sources
No undirect paths from 'virus' to 'sars-cov-2' found


In [23]:
# Top 10 paths between "virus" and "transcription factors" with the "naive"
# strategy and the "npmi" edge attribute; the paths printed below all have a
# single intermediate node.
paths, _ = top_n_paths(
    paper_comention_network, "virus", "transcription factors", n=10, weight="npmi",
    strategy="naive", pretty_print=True)

virus <->                  <-> transcription factors
          viral
          infected cells
          cell lines
          cystic fibrosis
          lung injury
          antioxidants
          asthmatic
          co2
          airways
          epithelial cells


In [24]:
# Same endpoints with the "yen" strategy and the "distance_npmi" edge
# attribute; the output below also contains paths with two intermediate nodes.
paths, _ = top_n_paths(
    paper_comention_network, "virus", "transcription factors", n=10,
    distance="distance_npmi",
    strategy="yen", pretty_print=True)

virus <->                                 <-> transcription factors
          viral
          cell lines
          viral <-> infected cells
          infected cells
          viral <-> cell lines
          viral <-> co2
          neutrophils
          cell lines <-> infected cells
          mosaic virus <-> infected cells
          bacterial <-> enzyme


In [25]:
# Path between the same two entities, searched in the spanning tree rather
# than in the full network; the result is only printed, not kept.
_ = single_shortest_path(tree, "virus", "transcription factors", pretty_print=True)

virus -> viral -> host -> bacteria -> bacterium -> fungi -> aids -> human disease -> airways -> asthmatic -> cystic fibrosis -> transcription factors


## 10. Conditional path search

In [26]:
# Paths from "virus" to "transcription factors" that go through "lung injury".
# Two lists come back, one per leg (virus -> lung injury and
# lung injury -> transcription factors); 10 paths are requested.
path_a_b, path_b_c = top_n_tripaths(
    paper_comention_network,
    "virus", "lung injury", "transcription factors",
    10,
    strategy="yen",
    distance="distance_npmi",
    intersecting=False,
    pretty_print=True,
)

virus ->                  -> lung injury ->                -> transcription factors
         lungs                                
                                              cystic fibrosis
         bacterial                            asthmatic
         neutrophils                          asthma
         animals                              enzyme
         viral                                bronchiectasis
         mice                                 epithelial cells
         airways                              antioxidants
         influenza                            co2
         animals -> airways                   rat


In [27]:
# Paths of the first leg ("virus" -> "lung injury"), each a list of node names.
path_a_b

[['virus', 'lungs', 'lung injury'],
 ['virus', 'lung injury'],
 ['virus', 'bacterial', 'lung injury'],
 ['virus', 'neutrophils', 'lung injury'],
 ['virus', 'animals', 'lung injury'],
 ['virus', 'viral', 'lung injury'],
 ['virus', 'mice', 'lung injury'],
 ['virus', 'airways', 'lung injury'],
 ['virus', 'influenza', 'lung injury'],
 ['virus', 'animals', 'airways', 'lung injury']]

In [28]:
# Paths of the second leg ("lung injury" -> "transcription factors"), each a list of node names.
path_b_c

[['lung injury', 'transcription factors'],
 ['lung injury', 'cystic fibrosis', 'transcription factors'],
 ['lung injury', 'asthmatic', 'transcription factors'],
 ['lung injury', 'asthma', 'transcription factors'],
 ['lung injury', 'enzyme', 'transcription factors'],
 ['lung injury', 'bronchiectasis', 'transcription factors'],
 ['lung injury', 'epithelial cells', 'transcription factors'],
 ['lung injury', 'antioxidants', 'transcription factors'],
 ['lung injury', 'co2', 'transcription factors'],
 ['lung injury', 'rat', 'transcription factors']]