In [1]:
import os
import requests
import pandas as pd
import networkx as nx
from scipy.stats import entropy

In [2]:
def load_graph(path: str) -> "tuple[nx.Graph, pd.DataFrame, pd.DataFrame]":
    """
    Load a GraphML network and tabulate its nodes and edges.

    Args:
        path: Location of the GraphML file.

    Returns:
        A tuple ``(G, df_nodes, df_edges)``:
          * ``G`` is the graph object returned by ``nx.read_graphml``.
          * ``df_nodes`` has one row per node: an ``id`` column plus one
            column per node attribute. The columns ``ATTRIBUTE_Bacteria`` and
            ``ATTRIBUTE_Strain`` are renamed to ``ATTRIBUTE_Genus`` and
            ``ATTRIBUTE_Species`` when present.
          * ``df_edges`` has one row per edge: ``source`` and ``target``
            columns plus one column per edge attribute.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"Graph file not found: {path}")

    G = nx.read_graphml(path)

    # Extract nodes with attributes. Attributes are unpacked after "id", so an
    # attribute that is itself named "id" overrides the node identifier.
    node_records = [{"id": node, **attrs} for node, attrs in G.nodes(data=True)]
    # An empty record list would give a frame with no columns at all; keep the
    # key column so downstream column lookups still work on an empty graph.
    df_nodes = pd.DataFrame(node_records) if node_records else pd.DataFrame(columns=["id"])

    # Extract edges with attributes (same override caveat for "source"/"target").
    edge_records = [{"source": u, "target": v, **attrs}
                    for u, v, attrs in G.edges(data=True)]
    df_edges = (pd.DataFrame(edge_records) if edge_records
                else pd.DataFrame(columns=["source", "target"]))

    # Change column names; rename() ignores keys that are not present.
    df_nodes = df_nodes.rename(columns={'ATTRIBUTE_Bacteria': 'ATTRIBUTE_Genus', 'ATTRIBUTE_Strain': 'ATTRIBUTE_Species'})

    return G, df_nodes, df_edges

def load_metadata(path: str, sep: str = ";", encoding: str = "utf-8") -> pd.DataFrame:
    """
    Load species metadata CSV.

    Args:
        path: Location of the delimited text file.
        sep: Field delimiter passed to ``pd.read_csv``. Defaults to ``";"``.
        encoding: Text encoding passed to ``pd.read_csv``. Defaults to
            ``"utf-8"``.

    Returns:
        The parsed table, with the columns ``ATTRIBUTE_Bacteria`` and
        ``ATTRIBUTE_Strain`` renamed to ``ATTRIBUTE_Genus`` and
        ``ATTRIBUTE_Species`` when present.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"Metadata file not found: {path}")

    df = pd.read_csv(path, sep=sep, encoding=encoding)

    # Change column names; rename() ignores keys that are not present, so a
    # file that already uses the Genus/Species names passes through unchanged.
    return df.rename(columns={'ATTRIBUTE_Bacteria': 'ATTRIBUTE_Genus', 'ATTRIBUTE_Strain': 'ATTRIBUTE_Species'})


In [3]:
### Weighing
# Three equal weights, one per score component.
# NOTE(review): the names suggest annotation / cosine / MQScore terms - confirm
# against the cell that consumes them. The three values sum to 0.99, not 1.0.
Weigh_Ans = Weigh_Hcos = Weigh_Hmqs = 0.33

In [4]:
### Get node and edge data from the graph
# load_graph hands back the graph together with its node and edge tables;
# both file names are resolved relative to the working directory.
(G, df_nodes, df_edges) = load_graph(path="Actino_network.graphml")
metadata = load_metadata(path="Metadata_Actinomyces.csv")

In [6]:
### Calculate total nodes per species
# Attach to every node row the number of nodes sharing its ATTRIBUTE_Species
# value. This adds a 'total_nodes' column to df_nodes in place.
df_nodes['total_nodes'] = df_nodes.groupby('ATTRIBUTE_Species')['id'].transform('count')
# Count total nodes per Species as a two-column table. The column names are
# pinned explicitly because a bare value_counts().reset_index() labels them
# differently on pandas < 2.0 ('index', 'ATTRIBUTE_Species') than on
# pandas >= 2.0 ('ATTRIBUTE_Species', 'count').
species_counts = (
    df_nodes['ATTRIBUTE_Species']
    .value_counts()
    .rename_axis('ATTRIBUTE_Species')
    .reset_index(name='count')
)

In [7]:
# Display the per-species node counts built in the previous cell.
species_counts

Unnamed: 0,ATTRIBUTE_Species,count
0,"Control,1,2,171",679
1,12,349
2,171,282
3,2,228
4,Control,147
5,1,71
6,"Control,171",43
7,"Control,1,2",41
8,"Control,1",28
9,12171,21


In [8]:
# List the node-table columns (node attributes from the GraphML plus 'total_nodes').
df_nodes.columns

Index(['id', 'charge', 'number of spectra', 'DefaultGroups', 'G4',
       'MS2 Verification Comment', 'GNPSGROUP:Upala', 'AllGroups',
       'GNPSGROUP:1', 'G2', 'G1', 'neutral M mass', 'RTMean', 'componentindex',
       'cluster index', 'GNPSGROUP:Control', 'G6', 'UniqueFileSources',
       'parent mass', 'G3', 'Best Ion', 'ATTRIBUTE_Species', 'GNPSGROUP:2',
       'NODE_TYPE', 'GNPSLinkout_Cluster', 'Correlated Features Group ID',
       'sum(precursor intensity)', 'GNPSLinkout_Network', 'RTConsensus',
       'GNPSGROUP:Rhodococcus', 'GNPSGROUP:171', 'G5', 'ATTRIBUTE_Genus',
       'Annotated Adduct Features ID', 'precursor mass', 'Analog:Smiles',
       'Analog:GNPSLibraryURL', 'MassDiff', 'Analog:tags',
       'Analog:Library_Class', 'IonMode', 'Analog:MassDiff', 'MZErrorPPM',
       'IIN Best Ion=Library Adduct', 'Analog:Compound_Source',
       'Library_Class', 'INCHI', 'Analog:SharedPeaks',
       'Analog:IIN Best Ion=Library Adduct', 'Compound_Name', 'Ion_Source',
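       'Analog:MZErrorPPM', 'SharedPeaks', 'Compound_Source', 'tags',
       'Analog:Data_Collector', 'GNPSLibraryURL', 'Analog:INCHI',
       'Analog:Instrument', 'Analog:IonMode', 'Smiles', 'Analog:Adduct', 'PI',
       'MQScore', 'Analog:Ion_Source', 'Instrument', 'Analog:Compound_Name',
       'Analog:SpectrumID', 'Data_Collector', 'Analog:PI', 'SpectrumID',
       'Adduct', 'Analog:MQScore', 'total_nodes'],
      dtype='object')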

In [9]:
# List the edge-table columns ('source', 'target' plus the edge attributes).
df_edges.columns


Index(['source', 'target', 'property1', 'component', 'node1', 'node2',
       'explained_intensity', 'EdgeScore', 'cosine_score', 'EdgeAnnotation',
       'mass_difference', 'EdgeType', 'id'],
      dtype='object')

In [11]:
# List the metadata columns after load_metadata's renames.
metadata.columns

Index(['filename', 'ATTRIBUTE_Genus', 'ATTRIBUTE_Species'], dtype='object')

metadata.columns --> 'filename', 'ATTRIBUTE_Genus', 'ATTRIBUTE_Species'
df_edges.columns --> 'source', 'target', 'property1', 'component', 'node1', 'node2',
       'explained_intensity', 'EdgeScore', 'cosine_score', 'EdgeAnnotation',
       'mass_difference', 'EdgeType', 'id'
df_nodes.columns --> 'id', 'charge', 'number of spectra', 'DefaultGroups', 'G4',
       'MS2 Verification Comment', 'GNPSGROUP:Upala', 'AllGroups',
       'GNPSGROUP:1', 'G2', 'G1', 'neutral M mass', 'RTMean', 'componentindex',
       'cluster index', 'GNPSGROUP:Control', 'G6', 'UniqueFileSources',
       'parent mass', 'G3', 'Best Ion', 'ATTRIBUTE_Species', 'GNPSGROUP:2',
       'NODE_TYPE', 'GNPSLinkout_Cluster', 'Correlated Features Group ID',
       'sum(precursor intensity)', 'GNPSLinkout_Network', 'RTConsensus',
       'GNPSGROUP:Rhodococcus', 'GNPSGROUP:171', 'G5', 'ATTRIBUTE_Genus',
       'Annotated Adduct Features ID', 'precursor mass', 'Analog:Smiles',
       'Analog:GNPSLibraryURL', 'MassDiff', 'Analog:tags',
       'Analog:Library_Class', 'IonMode', 'Analog:MassDiff', 'MZErrorPPM',
       'IIN Best Ion=Library Adduct', 'Analog:Compound_Source',
       'Library_Class', 'INCHI', 'Analog:SharedPeaks',
       'Analog:IIN Best Ion=Library Adduct', 'Compound_Name', 'Ion_Source',
       'Analog:MZErrorPPM', 'SharedPeaks', 'Compound_Source', 'tags',
       'Analog:Data_Collector', 'GNPSLibraryURL', 'Analog:INCHI',
       'Analog:Instrument', 'Analog:IonMode', 'Smiles', 'Analog:Adduct', 'PI',
       'MQScore', 'Analog:Ion_Source', 'Instrument', 'Analog:Compound_Name',
       'Analog:SpectrumID', 'Data_Collector', 'Analog:PI', 'SpectrumID',
       'Adduct', 'Analog:MQScore', 'total_nodes'