In [1]:
import pandas as pd

from kganalytics.network_generation import generate_comention_network
from kganalytics.data_preparation import (mentions_to_occurrence,
                                        is_experiment_related,
                                        dummy_clean_up,
                                        is_not_single_letter)
from kganalytics.metrics import (compute_degree_centrality,
                               compute_pagerank_centrality,
                               compute_betweenness_centrality,
                               detect_communities,
                               compute_all_metrics)
from kganalytics.export import (save_nodes,
                              save_to_gephi)
from kganalytics.paths import (minimum_spanning_tree,
                             top_n_paths,
                             top_n_tripaths,
                             single_shortest_path,
                             top_n_nested_paths,
                             graph_from_paths)
from kganalytics.utils import subgraph_by_types

## 1. Read mentions data

In [2]:
# Read the first 10000 rows of the mention data sample
mentions = pd.read_csv("data/mention_data_sample.csv", nrows=10000)

# Derive paper and section identifiers from the colon-separated `paper_id`
# value: the first field names the paper, the first two fields joined by ":"
# name the section. Any further colons (e.g. inside a section title) are
# not part of the section identifier.
id_fields = mentions["paper_id"].apply(lambda x: x.split(":"))
mentions["paper"] = id_fields.apply(lambda fields: fields[0])
mentions["section"] = id_fields.apply(
    lambda fields: ":".join([fields[0], fields[1]]))

# The complete `paper_id` value is used as the paragraph identifier
mentions = mentions.rename(columns={"paper_id": "paragraph"})

In [3]:
# Display the mentions table, including the derived `paper` and `section` columns
mentions

Unnamed: 0,entity,paragraph,paper,section
0,tract,1:Introduction:2,1,1:Introduction
1,nasopharyngeal,1:Data Collection ::: Methods:4,1,1:Data Collection
2,tract,1:Data Collection ::: Methods:4,1,1:Data Collection
3,lung,1:Results:10,1,1:Results
4,heart,1:Results:10,1,1:Results
...,...,...,...,...
9995,A549 cells,19:Caption:44,19,19:Caption
9996,hNTH,19:Caption:44,19,19:Caption
9997,A549 cells,19:Caption:44,19,19:Caption
9998,hNTH,19:Caption:44,19,19:Caption


## 2. Transform mentions into occurrences

In [4]:
def keep_non_experimental(data):
    """Return a boolean mask that is False for mentions whose section is experiment-related."""
    return ~data["section"].apply(is_experiment_related)


# Aggregate the raw mentions into per-entity occurrence sets for every factor
# (paper, section, paragraph), cleaning and filtering the terms on the way.
occurrence_data, counts = mentions_to_occurrence(
    mentions,
    term_column="entity",
    factor_columns=["paper", "section", "paragraph"],
    term_cleanup=dummy_clean_up,
    term_filter=is_not_single_letter,
    mention_filter=keep_non_experimental,
    dump_prefix="data/example_")

Cleaning up the entities...
Aggregating occurrences of entities....
Saving the occurrence data....


In [5]:
# Display the occurrence table: one row per entity, one column of identifier sets per factor
occurrence_data

Unnamed: 0_level_0,paragraph,paper,section
entity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1 nucleotide,{6:The Discontinuous Step In Nidovirus Sg Rna ...,{6},{6:The Discontinuous Step In Nidovirus Sg Rna ...
129,{19:Background:4},{19},{19:Background}
"10/100,000",{8:Acute Respiratory Distress Syndrome:14},{8},{8:Acute Respiratory Distress Syndrome}
104,{9:Pulmonary Vascular Disease ::: Review:23},{9},{9:Pulmonary Vascular Disease }
11c,{14:A10L And L4R Associate With Microtubules I...,{14},{14:A10L And L4R Associate With Microtubules I...
...,...,...,...
–208,{5:Syncytia Formation:17},{5},{5:Syncytia Formation}
–met,{11:In Vitro Transcription And Translation :::...,{11},{11:In Vitro Transcription And Translation }
†in,{7:Caption:27},{7},{7:Caption}
∼250 kda,"{12:Discussion:21, 12:The Mechanism Of Dominan...",{12},{12:The Mechanism Of Dominant-Negative Inhibit...


In [6]:
# Drop entities that are mentioned in a single paragraph only; such terms are usually noise
in_several_paragraphs = occurrence_data["paragraph"].apply(
    lambda paragraphs: len(paragraphs) > 1)
occurrence_data = occurrence_data[in_several_paragraphs]
occurrence_data

Unnamed: 0_level_0,paragraph,paper,section
entity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1620,{2:Protein Tyrosine Nitration In The Lung: Doe...,{2},"{2:Protein Tyrosine Nitration In The Lung, 2:I..."
2-oxoglutarate,"{20:Background:2, 20:The Viral Alkb Domains Ar...",{20},{20:The Viral Alkb Domains Are Most Likely Fun...
2c cells,"{25:Caption:26, 25:Introduction:4, 25:Rna Isol...",{25},"{25:Discussion, 25:Caption, 25:Introduction, 2..."
3-methylcytosine,"{20:Background:2, 20:Abstract:1, 20:The Alkb D...",{20},"{20:Background, 20:Abstract, 20:The Alkb Domai..."
3-nitrotyrosine,{2:Protein Tyrosine Nitration In The Lung: Doe...,{2},"{2:Protein Tyrosine Nitration In The Lung, 2:I..."
...,...,...,...
β-galactosidase,{11:Identification Of Antizyme Genes In Nemato...,{11},{11:Translational Frameshifting During Express...
γ-tubulin,{14:Vaccinia Virus Infection Disrupts Centroso...,{14},{14:Vaccinia Virus Infection Disrupts Centroso...
–20,{14:Antibodies And Immunofluorescence Microsco...,"{14, 12}","{12:Immunofluorescence Staining , 14:Antibodie..."
∼250 kda,"{12:Discussion:21, 12:The Mechanism Of Dominan...",{12},{12:The Mechanism Of Dominant-Negative Inhibit...


In [7]:
# Show the per-factor counts returned by `mentions_to_occurrence`
# (presumably the number of distinct papers, sections and paragraphs — confirm)
counts

{'paper': 41, 'section': 465, 'paragraph': 1045}

## 3. Generate co-occurrence networks

In [8]:
# Build the paper-level co-mention network, capping the number of edges at 1000
paper_comention_network_1000_edges = generate_comention_network(
    occurrence_data,
    "paper",
    counts["paper"],
    limit=1000,
    parallelize=False)

Examining 400960 pairs of terms for co-occurrence...
Reached the edge limit (1000)
Generated 1000 edges                    
Created a co-occurrence graph:
	number of nodes:  896
	number of edges:  1000
Saving the edges...
Creating a graph object...


In [9]:
# Build the paper-level co-mention network from the 100 most frequent entities only
paper_comention_network_100_most_frequent = generate_comention_network(
    occurrence_data,
    "paper",
    counts["paper"],
    n_most_frequent=100,
    parallelize=False)

Fitering data.....
Selected 100 most frequent terms
Examining 4950 pairs of terms for co-occurrence...
Generated 3421 edges                    
Created a co-occurrence graph:
	number of nodes:  100
	number of edges:  3421
Saving the edges...
Creating a graph object...


## 4. Compute centralities

We compute the degree and PageRank centralities using only the raw `frequency` edge attribute as the weight.

In [10]:
# Weight names passed to the degree and PageRank centrality computations below
weights = ["frequency"]

In [11]:
# Degree centrality on the 100-term network for each weight in `weights`;
# the last argument (10) appears to be the number of top nodes printed — confirm
degree_centrality = compute_degree_centrality(paper_comention_network_100_most_frequent, weights, 10)

Top n nodes by frequency:
	human (420)
	virus (353)
	humans (312)
	viral (284)
	lung (283)
	infection (282)
	dna (266)
	ards (256)
	bacterial (256)
	animal (255)



In [12]:
# PageRank centrality on the 100-term network for each weight in `weights`;
# the last argument (10) appears to be the number of top nodes printed — confirm
pagerank_centrality = compute_pagerank_centrality(paper_comention_network_100_most_frequent, weights, 10)

Top n nodes by frequency:
	human (0.03)
	virus (0.02)
	humans (0.02)
	infection (0.02)
	viral (0.02)
	lung (0.02)
	dna (0.02)
	bacterial (0.02)
	animal (0.02)
	animals (0.02)



We then compute the betweenness centrality using the PPMI- and NPMI-based distances (the `distance_ppmi` and `distance_npmi` edge attributes).

In [13]:
# Betweenness centrality computed once per distance attribute
# (`distance_ppmi`, `distance_npmi`); the last argument (20) appears to be
# the number of top nodes printed per attribute — confirm
betweenness_centrality = compute_betweenness_centrality(
    paper_comention_network_100_most_frequent, ["distance_ppmi", "distance_npmi"], 20)

Top n nodes by distance_ppmi:
	cell lines (0.06547447261732976)
	alveolar macrophages (0.047124549675570074)
	amino acid sequence (0.04497354497354497)
	co2 (0.04057926200783344)
	antioxidants (0.033897036448056854)
	aids (0.021713735999450284)
	epithelial cells (0.02170023853697323)
	human disease (0.021438878581735724)
	asthmatic (0.021208930902808454)
	allergic (0.01941180512609084)
	heart (0.018793375936233075)
	bacterium (0.01742596028310314)
	amino acids (0.015838658695801554)
	transcription factors (0.01504844361987219)
	amino acid (0.014979729265443548)
	cystic fibrosis (0.01392324606610321)
	rats (0.013399299113584827)
	formaldehyde (0.013090084518655946)
	enzyme (0.012677798392084105)
	infected cells (0.012358276643990931)

Top n nodes by distance_npmi:
	transcription factors (0.03150553150553151)
	mouse (0.0282415996701711)
	amino acids (0.027004741290455575)
	heart (0.025613275613275612)
	amino acid (0.024324881467738608)
	human disease (0.023809523809523808)
	epithelial ce

## 5. Detect communities

In [14]:
# Run community detection once per edge weight. `set_attr` presumably names the
# node attribute that receives the detected community label — confirm.
community_runs = [
    ("frequency", "community"),
    ("ppmi", "community_ppmi"),
    ("npmi", "community_npmi"),
]
for edge_weight, node_attr in community_runs:
    # The return value is discarded, as the result is kept on the graph itself
    _ = detect_communities(
        paper_comention_network_100_most_frequent,
        weight=edge_weight,
        set_attr=node_attr)

Detecting communities...
Best network partition:
	 Number of communities: 2
	 Modularity: 0.17062247895364224
Detecting communities...
Best network partition:
	 Number of communities: 3
	 Modularity: 0.22420884837314267
Detecting communities...
Best network partition:
	 Number of communities: 2
	 Modularity: 0.24539789862976222


## 6. Compute all the metrics in one go

In [15]:
# Build the full paper-level co-mention network (no term or edge limit),
# with parallel processing enabled and 4 cores requested.
# NOTE(review): `dump_path` is presumably where the generated edge list is
# pickled (the output reports "Saving the edges...") — confirm.
paper_comention_network = generate_comention_network(
    occurrence_data,
    "paper",
    counts["paper"],
    parallelize=True,
    cores=4,
    dump_path="data/paper_comention_edge_list.pkl")

Examining 400960 pairs of terms for co-occurrence...
Generated 60451 edges                    
Created a co-occurrence graph:
	number of nodes:  896
	number of edges:  60451
Saving the edges...
Creating a graph object...


In [16]:
# We need to specify the edge attributes used for computing the different metrics:
# raw frequency for degree and PageRank, the PPMI/NPMI distances for
# betweenness, and frequency/NPMI for community detection.
# `print_summary=True` prints the top nodes for each metric.
compute_all_metrics(
    paper_comention_network,
    degree_weights=["frequency"],
    pagerank_weights=["frequency"],
    betweenness_weights=["distance_ppmi", "distance_npmi"],
    community_weights=["frequency", "npmi"],
    print_summary=True)

Computing degree centrality statistics....
Top n nodes by frequency:
	human (1283)
	virus (1020)
	humans (988)
	lung (863)
	infection (815)
	viral (807)
	bacterial (799)
	animals (788)
	animal (778)
	mice (770)

Computing PageRank centrality statistics....
Top n nodes by frequency:
	human (0.01)
	virus (0.01)
	humans (0.01)
	infection (0.01)
	viral (0.01)
	lung (0.01)
	bacterial (0.01)
	animals (0.00)
	viruses (0.00)
	dna (0.00)

Computing betweenness centrality statistics....
Top n nodes by distance_ppmi:
	n-terminal domain (0.018638948311182076)
	cytomegalovirus (0.01862655016809527)
	xenopus (0.016615424797303664)
	saccharomyces cerevisiae (0.011687894959089255)
	protein (0.011593403477280212)
	cellular proteins (0.011071378106470653)
	adenovirus (0.010887969607320755)
	skin (0.010791125729895072)
	amino acid sequence (0.010761769860010468)
	fungal (0.010713479691799345)

Top n nodes by distance_npmi:
	cytomegalovirus (0.01815926073143525)
	n-terminal domain (0.01720396653977774)
	x

## 7. Export network and the computed metrics

In [17]:
# Save the graph nodes as a pickled pandas.DataFrame
# (presumably together with their attributes, i.e. the metrics computed above — confirm)
save_nodes(paper_comention_network, "data/paper_comention_node_list.pkl")

In [18]:
# Inspect the attributes stored on the first node of the graph
first_node = list(paper_comention_network.nodes())[0]
paper_comention_network.nodes[first_node]

{'degree_frequency': 51,
 'pagerank_frequency': 0.000527468579706385,
 'betweenness_distance_ppmi': 0.00011991201557337862,
 'betweenness_distance_npmi': 2.8822997043512598e-05,
 'community_frequency': 0,
 'community_npmi': 0}

In [19]:
# Save the graph for Gephi import. The mappings presumably translate node and
# edge attribute names into the names used in the exported files — confirm.
gephi_node_columns = {
    "degree_frequency": "Degree",
    "pagerank_frequency": "PageRank",
    "betweenness_distance_npmi": "Betweenness",
    "community_npmi": "Community",
}
gephi_edge_columns = {"npmi": "Weight"}
save_to_gephi(
    paper_comention_network,
    "data/gephi_paper",
    node_attr_mapping=gephi_node_columns,
    edge_attr_mapping=gephi_edge_columns)

## 8. Find spanning trees

In [20]:
# Minimum spanning tree of the full network, using the NPMI distance as the edge weight
tree = minimum_spanning_tree(paper_comention_network, weight="distance_npmi")

In [21]:
# Save the spanning tree for Gephi import. The mappings presumably translate
# node and edge attribute names into the names used in the exported files — confirm.
tree_node_columns = {
    "degree_frequency": "Degree",
    "pagerank_frequency": "PageRank",
    "betweenness_distance_npmi": "Betweenness",
    "community_npmi": "Community",
}
tree_edge_columns = {"npmi": "Weight"}
save_to_gephi(
    tree,
    "data/gephi_paper_spanning_tree",
    node_attr_mapping=tree_node_columns,
    edge_attr_mapping=tree_edge_columns)

## 9. Simple path search

In [22]:
# Demonstrate the failure mode for a target that cannot be reached: the error
# is caught and printed so that the notebook keeps running.
# NOTE(review): `except Exception` is broad; narrow it to the specific
# exception raised by `top_n_paths` once that type is confirmed.
try:
    paths, _ = top_n_paths(
        paper_comention_network, "virus", "sars-cov-2", n=10, distance="distance_npmi",
        strategy="naive", pretty_print=True)
except Exception as e:
    print(e)

Target sars-cov-2 cannot be reachedfrom given sources
No undirect paths from 'virus' to 'sars-cov-2' found


In [23]:
# Search for 10 paths between "virus" and "transcription factors" using the
# NPMI distance and the "naive" strategy; the second returned value is discarded.
paths, _ = top_n_paths(
    paper_comention_network, "virus", "transcription factors", n=10, distance="distance_npmi",
    strategy="naive", pretty_print=True)

virus <->                    <-> transcription factors
          viral
          cell lines
          cellular proteins
          leukemia virus
          infected cells
          16,20
          3-nitrotyrosine
          [3h]uridine
          anti-flag antibody
          c-terminal domain


In [24]:
# Same search with the "yen" strategy; the printed result also contains
# paths that pass through two intermediate entities.
paths, _ = top_n_paths(
    paper_comention_network, "virus", "transcription factors", n=10,
    distance="distance_npmi", strategy="yen", pretty_print=True)

virus <->                               <-> transcription factors
          viral
          cell lines
          cellular proteins
          leukemia virus
          viral <-> infected cells
          cell lines <-> leukemia virus
          infected cells
          16,20
          3-nitrotyrosine
          [3h]uridine


In [25]:
# Shortest path between the two entities inside the spanning tree; the
# return value is assigned to `_` so that only the pretty-printed path is shown
_ = single_shortest_path(tree, "virus", "transcription factors", pretty_print=True)

virus -> viral -> host -> bacteria -> bacterium -> compounds -> e coli -> cysteine -> 50 kda -> cysteine residues -> endotoxin -> airways -> neutrophils -> epithelial cells -> antioxidants -> cgmp -> 16,20 -> cellular proteins -> transcription factors


## 10. Conditional path search

In [26]:
# Two-leg path search: "virus" -> "lung injury" -> "transcription factors".
# The result is a pair of path lists, one per leg.
path_a_b, path_b_c = top_n_tripaths(
    paper_comention_network,
    "virus",
    "lung injury",
    "transcription factors",
    10,
    strategy="yen",
    distance="distance_npmi",
    intersecting=False,
    pretty_print=True)

virus ->                 -> lung injury ->                          -> transcription factors
         lungs                               
                                             cystic fibrosis
         bacterial                           bronchopulmonary dysplasia
         neutrophils                         cgmp
         animals                             endotoxemia
         viral                               h2o2
         88,89                               human diseases
         acute lung injury                   lung diseases
         burns                               metabolites
         calcium                             metals


In [27]:
# Paths of the first leg ("virus" -> "lung injury"), each given as a list of node names
path_a_b

[['virus', 'lungs', 'lung injury'],
 ['virus', 'lung injury'],
 ['virus', 'bacterial', 'lung injury'],
 ['virus', 'neutrophils', 'lung injury'],
 ['virus', 'animals', 'lung injury'],
 ['virus', 'viral', 'lung injury'],
 ['virus', '88,89', 'lung injury'],
 ['virus', 'acute lung injury', 'lung injury'],
 ['virus', 'burns', 'lung injury'],
 ['virus', 'calcium', 'lung injury']]

In [28]:
# Paths of the second leg ("lung injury" -> "transcription factors"), each given as a list of node names
path_b_c

[['lung injury', 'transcription factors'],
 ['lung injury', 'cystic fibrosis', 'transcription factors'],
 ['lung injury', 'bronchopulmonary dysplasia', 'transcription factors'],
 ['lung injury', 'cgmp', 'transcription factors'],
 ['lung injury', 'endotoxemia', 'transcription factors'],
 ['lung injury', 'h2o2', 'transcription factors'],
 ['lung injury', 'human diseases', 'transcription factors'],
 ['lung injury', 'lung diseases', 'transcription factors'],
 ['lung injury', 'metabolites', 'transcription factors'],
 ['lung injury', 'metals', 'transcription factors']]

## 11. Constructing path summary graphs

In [29]:
# Nested path search between "virus" and "transcription factors" with the
# NPMI distance and the "naive" strategy.
# NOTE(review): the exact meaning of `nested_n` and `depth` is defined in
# `kganalytics.paths.top_n_nested_paths` — confirm there.
# Note that this rebinds `paths`, replacing the result of the earlier path search.
paths = top_n_nested_paths(
    paper_comention_network, "virus", "transcription factors",
    n=10, nested_n=1, depth=2, distance="distance_npmi",
    strategy="naive")

In [30]:
# Print every collected path on its own line, with its nodes joined by " <-> "
separator = " <-> "
for node_sequence in paths:
    print(separator.join(node_sequence))

virus <-> viral
viral <-> transcription factors
virus <-> cell lines
cell lines <-> transcription factors
virus <-> cellular proteins
cellular proteins <-> transcription factors
virus <-> leukemia virus
leukemia virus <-> transcription factors
virus <-> infected cells
infected cells <-> transcription factors
virus <-> 16,20
16,20 <-> transcription factors
virus <-> 3-nitrotyrosine
3-nitrotyrosine <-> transcription factors
virus <-> [3h]uridine
[3h]uridine <-> transcription factors
virus <-> anti-flag antibody
anti-flag antibody <-> transcription factors
virus <-> c-terminal domain
c-terminal domain <-> transcription factors


In [31]:
# Build a summary graph from the collected paths.
# NOTE(review): `source_graph` is presumably used to carry node/edge
# attributes over from the full network — confirm.
summary_graph = graph_from_paths(paths, source_graph=paper_comention_network)

In [32]:
# Report the size of the path summary graph
node_count = summary_graph.number_of_nodes()
edge_count = summary_graph.number_of_edges()
print("Number of nodes: ", node_count)
print("Number of edges: ", edge_count)

Number of nodes:  12
Number of edges:  20


## 12. Finding subnetworks by entity types

In [33]:
import random

In [34]:
# Pool of entity type labels from which random types are drawn for each node
# (the types are synthetic and serve only to illustrate the type-based filtering)
types = ["A", "B", "C"]

In [35]:
# Assign every node a random, non-empty set of entity types.
#
# A dedicated, seeded generator is used instead of the module-level `random`
# functions so that re-running the notebook yields the same assignment (and
# hence the same subgraph below) without touching the global random state.
# The assignment is reproducible as long as the node iteration order of the
# graph is the same between runs.
TYPES_SEED = 42
type_rng = random.Random(TYPES_SEED)

types_data = {}
for node in paper_comention_network.nodes():
    # Draw 1 to 3 labels with replacement; duplicates collapse in the set, so
    # a node can end up with fewer distinct types than the number of draws.
    n_draws = type_rng.choice([1, 2, 3])
    types_data[node] = {type_rng.choice(types) for _ in range(n_draws)}

In [36]:
# Extract a subgraph based on the node types: types "A" and "B" are requested
# for inclusion, type "C" for exclusion, and "transcription factors" is listed
# in `include_nodes`.
# NOTE(review): how inclusion and exclusion interact for nodes carrying several
# types is defined in `kganalytics.utils.subgraph_by_types` — confirm there.
a_b_subgraph = subgraph_by_types(
    paper_comention_network, types_data,
    types_to_include=["A", "B"],
    types_to_exclude=["C"],
    include_nodes=["transcription factors"])

In [37]:
# Number of nodes in the type-filtered subgraph (depends on the random type assignment)
a_b_subgraph.number_of_nodes()

411