In [1]:
import pandas as pd

from analytics.network_generation import generate_comention_network
from analytics.data_preparation import (mentions_to_occurrence,
                                        is_experiment_related,
                                        dummy_clean_up,
                                        is_not_single_letter)
from analytics.metrics import (compute_degree_centrality,
                               compute_pagerank_centrality,
                               compute_betweenness_centrality,
                               detect_communities,
                               compute_all_metrics)
from analytics.export import (save_nodes,
                              save_to_gephi)
from analytics.paths import (minimum_spanning_tree,
                             top_n_paths,
                             top_n_tripaths,
                             single_shortest_path)

## 1. Read mentions data

In [2]:
# Load the first 10000 rows of the mention data sample
mentions = pd.read_csv("data/mention_data_sample.csv", nrows=10000)

# `paper_id` values look like "<paper>:<section title>:<paragraph index>".
# Split every identifier once with the vectorised string accessor instead of
# re-splitting it inside several per-row lambdas; `n=2` stops after the two
# leading pieces that are actually needed.
id_parts = mentions["paper_id"].str.split(":", n=2, expand=True)

# Extract unique paper/section identifiers
mentions["paper"] = id_parts[0]
# NOTE: the section is cut at the first ":" inside the title, so a title such
# as "Data Collection ::: Methods" is reduced to "Data Collection " (trailing
# space included). This matches the previous behaviour of this cell.
mentions["section"] = id_parts[0] + ":" + id_parts[1]

# The full identifier is kept as the paragraph-level key
mentions = mentions.rename(columns={"paper_id": "paragraph"})

In [3]:
# Display the parsed mentions table (entity plus paragraph/paper/section keys)
mentions

Unnamed: 0,entity,paragraph,paper,section
0,tract,1:Introduction:2,1,1:Introduction
1,nasopharyngeal,1:Data Collection ::: Methods:4,1,1:Data Collection
2,tract,1:Data Collection ::: Methods:4,1,1:Data Collection
3,lung,1:Results:10,1,1:Results
4,heart,1:Results:10,1,1:Results
...,...,...,...,...
9995,A549 cells,19:Caption:44,19,19:Caption
9996,hNTH,19:Caption:44,19,19:Caption
9997,A549 cells,19:Caption:44,19,19:Caption
9998,hNTH,19:Caption:44,19,19:Caption


## 2. Transform mentions into occurrences

In [4]:
def _outside_experiment_sections(data):
    """Return a boolean mask that is True for rows whose section is not experiment-related."""
    return ~data["section"].apply(is_experiment_related)


# Turn individual mentions into per-entity occurrence data at the paper,
# section and paragraph levels. `counts` comes back alongside it (a dict keyed
# by factor name, see the cell that displays it).
# `dump_prefix` is presumably the path prefix for the saved occurrence data
# ("Saving the occurrence data...." is printed) -- TODO confirm.
occurrence_data, counts = mentions_to_occurrence(
    mentions,
    term_column="entity",
    factor_columns=["paper", "section", "paragraph"],
    term_cleanup=dummy_clean_up,
    term_filter=is_not_single_letter,
    mention_filter=_outside_experiment_sections,
    dump_prefix="data/example_")

Cleaning up the entities...
Aggregating occurrences of entities....
Saving the occurrence data....


In [5]:
# Display the occurrence table: one row per entity, with the sets of
# paragraphs, papers and sections in which it occurs
occurrence_data

Unnamed: 0_level_0,paragraph,paper,section
entity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1 nucleotide,{6:The Discontinuous Step In Nidovirus Sg Rna ...,{6},{6:The Discontinuous Step In Nidovirus Sg Rna ...
129,{19:Background:4},{19},{19:Background}
"10/100,000",{8:Acute Respiratory Distress Syndrome:14},{8},{8:Acute Respiratory Distress Syndrome}
104,{9:Pulmonary Vascular Disease ::: Review:23},{9},{9:Pulmonary Vascular Disease }
11c,{14:A10L And L4R Associate With Microtubules I...,{14},{14:A10L And L4R Associate With Microtubules I...
...,...,...,...
–208,{5:Syncytia Formation:17},{5},{5:Syncytia Formation}
–met,{11:In Vitro Transcription And Translation :::...,{11},{11:In Vitro Transcription And Translation }
†in,{7:Caption:27},{7},{7:Caption}
∼250 kda,"{12:Discussion:21, 12:The Mechanism Of Dominan...",{12},"{12:Discussion, 12:The Mechanism Of Dominant-N..."


In [6]:
# Drop entities that occur in a single paragraph only; such one-off terms
# usually represent noise.
paragraphs_per_entity = occurrence_data["paragraph"].map(len)
occurrence_data = occurrence_data[paragraphs_per_entity > 1]
occurrence_data

Unnamed: 0_level_0,paragraph,paper,section
entity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1620,{2:Protein Tyrosine Nitration In The Lung: Doe...,{2},"{2:Injurious Properties Of No•, 2:Protein Tyro..."
2-oxoglutarate,"{20:Background:2, 20:The Viral Alkb Domains Ar...",{20},"{20:Background, 20:The Viral Alkb Domains Are ..."
2c cells,"{25:Caption:26, 25:Rna Isolation And Different...",{25},"{25:Discussion, 25:Caption, 25:Rna Isolation A..."
3-methylcytosine,{20:The Alkb Domain Probably Protects Virus Rn...,{20},"{20:Background, 20:Abstract, 20:The Alkb Domai..."
3-nitrotyrosine,{2:Protein Tyrosine Nitration In The Lung: Doe...,{2},"{2:Injurious Properties Of No•, 2:Protein Tyro..."
...,...,...,...
β-galactosidase,{11:Translational Frameshifting During Express...,{11},{11:Identification Of Antizyme Genes In Nemato...
γ-tubulin,{14:Vaccinia Virus Infection Disrupts Centroso...,{14},{14:Vaccinia Virus Infection Disrupts Centroso...
–20,{12:Immunofluorescence Staining ::: Materials ...,"{12, 14}",{14:Antibodies And Immunofluorescence Microsco...
∼250 kda,"{12:Discussion:21, 12:The Mechanism Of Dominan...",{12},"{12:Discussion, 12:The Mechanism Of Dominant-N..."


In [7]:
# Display the per-factor totals returned by mentions_to_occurrence
# (a dict keyed by "paper", "section" and "paragraph")
counts

{'paper': 41, 'section': 465, 'paragraph': 1045}

## 3. Generate co-occurrence networks

In [8]:
# Limit to 1000 edges
# NOTE(review): the recorded run of this cell reported 400960 term pairs to
# examine and ended in a KeyboardInterrupt, so expect it to be slow with
# parallelize=False. `limit` apparently does not reduce the number of pairs
# that are examined -- TODO confirm.
paper_comention_network_1000_edges = generate_comention_network(
    occurrence_data, "paper", counts["paper"],
    # `n_most_frequent` is not passed here; this variant restricts the network by `limit` only.
    limit=1000,
    parallelize=False)

Examining 400960 pairs of terms for co-occurrence...


KeyboardInterrupt: 

In [None]:
# Use only 100 most frequent entities
# This network is the one used by the centrality and community cells below.
paper_comention_network_100_most_frequent = generate_comention_network(
    occurrence_data, "paper", counts["paper"],
    n_most_frequent=100,
    # `limit` is not passed here, unlike the 1000-edge variant built earlier in the notebook.
    parallelize=False)

## 4. Compute centralities

We compute the degree and PageRank centralities using only the raw co-occurrence frequency as the edge weight.

In [None]:
# Edge attribute(s) passed to the degree and PageRank centrality computations below
weights = ["frequency"]

In [None]:
# Degree centrality on the 100-most-frequent network, weighted by "frequency".
# The trailing 10 is presumably the number of top nodes to report -- TODO confirm.
degree_centrality = compute_degree_centrality(paper_comention_network_100_most_frequent, weights, 10)

In [None]:
# PageRank centrality on the same network, weighted by "frequency".
# The trailing 10 is presumably the number of top nodes to report -- TODO confirm.
pagerank_centrality = compute_pagerank_centrality(paper_comention_network_100_most_frequent, weights, 10)

We then compute the betweenness centrality using the PPMI- and NPMI-based distances as edge weights.

In [None]:
# Betweenness centrality, computed for the "distance_ppmi" and "distance_npmi"
# edge attributes rather than the raw frequency.
# The trailing 20 is presumably the number of top nodes to report -- TODO confirm.
betweenness_centrality = compute_betweenness_centrality(
    paper_comention_network_100_most_frequent, ["distance_ppmi", "distance_npmi"], 20)

## 5. Detect communities

In [None]:
# Run community detection once per edge weight. Each run is given its own
# `set_attr` name (presumably the node attribute that receives the community
# label -- TODO confirm); the return value is discarded.
community_runs = [
    ("frequency", "community"),
    ("ppmi", "community_ppmi"),
    ("npmi", "community_npmi"),
]
for edge_weight, node_attribute in community_runs:
    _ = detect_communities(
        paper_comention_network_100_most_frequent,
        weight=edge_weight,
        set_attr=node_attribute)

## 6. Compute all the metrics in one go

In [None]:
# Build the paper-level co-mention network without `limit` or
# `n_most_frequent`, this time with parallelisation enabled (4 cores).
# `dump_path` is presumably where the resulting edge list is pickled -- TODO confirm.
paper_comention_network = generate_comention_network(
    occurrence_data,
    "paper",
    counts["paper"],
    parallelize=True,
    cores=4,
    dump_path="data/paper_comention_edge_list.pkl")

In [None]:
# We need to specify edge attributes used for computing different metrics.
# The attribute choices mirror the step-by-step cells above: frequency for
# degree/PageRank, PPMI/NPMI distances for betweenness. Communities are
# detected here for "frequency" and "npmi" only.
compute_all_metrics(
    paper_comention_network,
    degree_weights=["frequency"],
    pagerank_weights=["frequency"],
    betweenness_weights=["distance_ppmi", "distance_npmi"],
    community_weights=["frequency", "npmi"],
    print_summary=True)

## 7. Export network and the computed metrics

In [None]:
# Save graph nodes as a pickled pandas.DataFrame
# (run after compute_all_metrics so that the computed metrics are presumably
# included as node attributes -- TODO confirm)
save_nodes(paper_comention_network, "data/paper_comention_node_list.pkl")

In [None]:
# Inspect the attributes stored on one node (the first one in iteration order).
# Take the first node lazily instead of materialising the full node list just
# to read element 0.
first_node = next(iter(paper_comention_network.nodes()))
paper_comention_network.nodes[first_node]

In [None]:
# Save the graph for Gephi import.
# Map the node/edge attribute names used in this notebook to the column
# names handed to the Gephi export.
gephi_node_columns = {
    "degree_frequency": "Degree",
    "pagerank_frequency": "PageRank",
    "betweenness_distance_npmi": "Betweenness",
    "community_npmi": "Community",
}
gephi_edge_columns = {"npmi": "Weight"}

save_to_gephi(
    paper_comention_network,
    "data/gephi_paper",
    node_attr_mapping=gephi_node_columns,
    edge_attr_mapping=gephi_edge_columns)

## 8. Find spanning trees

In [None]:
# Minimum spanning tree of the full network, using the "distance_npmi" edge attribute as weight
tree = minimum_spanning_tree(paper_comention_network, weight="distance_npmi")

In [None]:
# Export the spanning tree for Gephi with the same attribute-to-column
# mapping that is used for the full network export.
tree_node_columns = {
    "degree_frequency": "Degree",
    "pagerank_frequency": "PageRank",
    "betweenness_distance_npmi": "Betweenness",
    "community_npmi": "Community",
}
tree_edge_columns = {"npmi": "Weight"}

save_to_gephi(
    tree,
    "data/gephi_paper_spanning_tree",
    node_attr_mapping=tree_node_columns,
    edge_attr_mapping=tree_edge_columns)

## 9. Simple path search

In [None]:
# Guarded path search: any exception raised by the search is printed instead
# of aborting the notebook run.
# NOTE(review): presumably "sars-cov-2" may be missing from the network built
# from this data sample -- confirm that this is the failure being demonstrated.
try:
    paths, _ = top_n_paths(
        paper_comention_network,
        "virus",
        "sars-cov-2",
        n=10,
        weight="npmi",
        strategy="naive",
        pretty_print=True)
except Exception as error:
    print(error)

In [None]:
# Top 10 paths from "virus" to "transcription factors" with the "naive"
# strategy, which is given the "npmi" edge attribute via `weight`.
# The second element of the returned pair is discarded.
paths, _ = top_n_paths(
    paper_comention_network, "virus", "transcription factors", n=10, weight="npmi",
    strategy="naive", pretty_print=True)

In [None]:
# Same search with the "yen" strategy, which is given the "distance_npmi"
# edge attribute via `distance` instead of a `weight`.
# This rebinds `paths` from the previous cell.
paths, _ = top_n_paths(
    paper_comention_network, "virus", "transcription factors", n=10,
    distance="distance_npmi",
    strategy="yen", pretty_print=True)

In [None]:
# Shortest path between the same two terms, searched in the spanning tree
# rather than the full network; the return value is discarded.
_ = single_shortest_path(tree, "virus", "transcription factors", pretty_print=True)

## 10. Conditional path search

In [None]:
# Search for paths from "virus" to "transcription factors" that go through
# "lung injury". The result is unpacked into two parts, named here for the
# a -> b and b -> c legs.
# The positional 10 is presumably the number of paths to return -- TODO confirm.
path_a_b, path_b_c = top_n_tripaths(
    paper_comention_network,
    "virus",
    "lung injury",
    "transcription factors",
    10,
    strategy="yen",
    distance="distance_npmi",
    intersecting=False,
    pretty_print=True)

In [None]:
# Display the first element returned by top_n_tripaths
path_a_b

In [None]:
# Display the second element returned by top_n_tripaths
path_b_c