In [1]:
import os
import torch
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import LogLocator
import numpy as np
from torch_geometric.utils import degree

In [7]:
import sys
sys.path.insert(0, "./Neo4jToPyGGraph/")

In [8]:
import os
from impl.GraphRetriever import GraphRetriever


In [14]:
from tqdm.notebook import tqdm

In [9]:
os.getcwd()

'/Users/ammar.ateeq'

In [None]:
# So here you need to calculate the degree a bit different 
# Count occurrencces (either there is some numpy utility or you can use simple for loops) but you can directly update the node properties by adding new properties

In [10]:
URI = "bolt://localhost:7687"
AUTH = ("neo4j", "password")

graphRetriever = GraphRetriever(URI, AUTH)

In [11]:
import numpy as np
import pandas as pd

class DegreeCalculator:
    def __init__(self):
        self.node_dict = dict()
        
    def set_degree_dict(self, unique_idx, count_idx, dict_type, tuple_names):
        source_type, edge_type, target_type = tuple_names 
        for i, node_id in enumerate(unique_idx):
            node_id_dict = self.node_dict.get(node_id, dict())
            self.node_dict[node_id] = node_id_dict
            count_node_id = count_idx[i]
            node_degree_dict = node_id_dict.get(dict_type, dict())
            self.node_dict[node_id][dict_type] = node_degree_dict
            node_degree_dict[f"{source_type}_{edge_type}_{target_type}"] = count_node_id
            
    def add_edge_index_to_node_dict(self, tuple_names, edge_index):
        source_index = edge_index[0]
        target_index = edge_index[1]
        unique_source_idx, count_source_index = np.unique(source_index, return_counts = True)
        unique_target_idx, count_target_index = np.unique(target_index, return_counts = True)
    
        self.set_degree_dict(unique_source_idx, count_source_index, "node_out_degree_dict", tuple_names)
        self.set_degree_dict(unique_target_idx, count_target_index, "node_in_degree_dict", tuple_names)

    def write_degree_summaries(self):
        for node_id in self.node_dict.keys():
            node_out_degree_dict = self.node_dict[node_id].get("node_out_degree_dict", dict())
            node_in_degree_dict = self.node_dict[node_id].get("node_in_degree_dict", dict())
            self.node_dict[node_id]["out_degree"] = sum(node_out_degree_dict.values())
            self.node_dict[node_id]["in_degree"] = sum(node_in_degree_dict.values())
            self.node_dict[node_id]["degree"] = self.node_dict[node_id]["in_degree"] + self.node_dict[node_id]["out_degree"]

    def calculate_node_dict(self, folder_path):
        for file_name in os.listdir(folder_path):
            if file_name.endswith('.csv'):
                full_file_path = os.path.join(folder_path, file_name)
                edge_index = pd.read_csv(full_file_path).values.transpose()
                tuple_names = file_name[:-4].split("__")
                self.add_edge_index_to_node_dict(tuple_names, edge_index)
                
        self.write_degree_summaries()
        

    def get_node_dict(self):
        return self.node_dict

In [None]:
"""USAGE:

##Basic usage
file_names = ["DRUG_ACTS_ON_PROTEIN.csv"]
degree_calculator = DegreeCalculator()
degree_calculator.calculate_node_dict(file_names)
node_dict = degree_calculator.get_node_dict()
node_dict

## Write to db
for key in node_dict:
    property_dict = node_dict[key]
    query = f'''MATCH (n)
        WHERE id(n) = {key}
        SET n.degree_dict = "{property_dict}"'''
    session.run(query)
"""

In [12]:
folder_path = 'data_with_orginal_id_all/'  

degree_calculator = DegreeCalculator()
degree_calculator.calculate_node_dict(folder_path)
node_dict = degree_calculator.get_node_dict()
node_dict

{415186: {'node_out_degree_dict': {'Protein_ASSOCIATED_WITH_Disease': 281,
   'Protein_HAS_SEQUENCE_Amino_acid_sequence': 1,
   'Protein_DETECTED_IN_PATHOLOGY_SAMPLE_Disease': 16,
   'Protein_ACTS_ON_Protein': 16,
   'Protein_ASSOCIATED_WITH_Molecular_function': 9,
   'Protein_HAS_MODIFIED_SITE_Modified_protein': 5,
   'Protein_ASSOCIATED_WITH_Cellular_component': 149,
   'Protein_ASSOCIATED_WITH_Biological_process': 17,
   'Protein_ASSOCIATED_WITH_Tissue': 275,
   'Protein_MENTIONED_IN_PUBLICATION_Publication': 223,
   'Protein_COMPILED_INTERACTS_WITH_Protein': 48},
  'node_in_degree_dict': {'Known_variant_VARIANT_FOUND_IN_PROTEIN_Protein': 465,
   'Protein_ACTS_ON_Protein': 12,
   'Drug_COMPILED_TARGETS_Protein': 21,
   'Drug_ACTS_ON_Protein': 34,
   'Protein_CURATED_INTERACTS_WITH_Protein': 1,
   'Transcript_TRANSLATED_INTO_Protein': 1,
   'Gene_TRANSLATED_INTO_Protein': 1,
   'Protein_COMPILED_INTERACTS_WITH_Protein': 48},
  'out_degree': 1040,
  'in_degree': 583,
  'degree': 1623}

In [17]:
node_dict[9]

{'node_in_degree_dict': {'Protein_ASSOCIATED_WITH_Disease': 8},
 'node_out_degree_dict': {'Disease_HAS_PARENT_Disease': 1},
 'out_degree': 1,
 'in_degree': 8,
 'degree': 9}

In [18]:
os.listdir(folder_path)

['Protein__ASSOCIATED_WITH__Disease.csv',
 'Clinical_variable__HAS_PARENT__Clinical_variable.csv',
 'Protein__HAS_SEQUENCE__Amino_acid_sequence.csv',
 'Known_variant__VARIANT_FOUND_IN_CHROMOSOME__Chromosome.csv',
 'Protein__DETECTED_IN_PATHOLOGY_SAMPLE__Disease.csv',
 'Transcript__LOCATED_IN__Chromosome.csv',
 'Metabolite__ASSOCIATED_WITH__Disease.csv',
 'Peptide__BELONGS_TO_PROTEIN__Protein.csv',
 'Drug__CURATED_TARGETS__Gene.csv',
 'Protein__ANNOTATED_IN_PATHWAY__Pathway.csv',
 'User__IS_RESPONSIBLE__Project.csv',
 'Known_variant__VARIANT_FOUND_IN_PROTEIN__Protein.csv',
 'Protein__ACTS_ON__Protein.csv',
 'Modification__HAS_PARENT__Modification.csv',
 '.DS_Store',
 'Biological_sample__SPLITTED_INTO__Analytical_sample.csv',
 'Clinically_relevant_variant__ASSOCIATED_WITH__Disease.csv',
 'Experimental_factor__MAPS_TO__Disease.csv',
 'Biological_sample__BELONGS_TO_SUBJECT__Subject.csv',
 'Project__STUDIES_TISSUE__Tissue.csv',
 'User__PARTICIPATES_IN__Project.csv',
 'Drug__COMPILED_TARGETS

In [16]:
## Write to db
with graphRetriever.driver.session() as session:
    for key in tqdm(node_dict):
        property_dict = node_dict[key]
        query = f'''MATCH (n)
            WHERE id(n) = {key} AND NOT exists(n.degree_dict)
            SET n.degree_dict = "{property_dict}"'''
        session.run(query)


  0%|          | 0/16445695 [00:00<?, ?it/s]

KeyboardInterrupt: 