In [1]:
import pandas as pd
import networkx as nx
import os

In [2]:
BASE_DIR = r"C:\Users\LENOVO\Desktop\Intern_data\Project_Result\Matrix"

In [3]:
def path(file):
    return os.path.join(BASE_DIR, file)


In [4]:
gene_matrix = pd.read_csv(path("abricate_resfinder_gene_matrix.csv"), index_col=0)
resistance_matrix = pd.read_csv(path("abricate_resfinder_resistance_matrix.csv"), index_col=0)

In [5]:
edges = pd.read_csv(path("strain_gene_network_edges.csv"))

In [6]:
metadata = pd.DataFrame(index=gene_matrix.index)
metadata.index.name = "strain"


In [7]:
metadata["species"] = metadata.index.str.extract(r"(S\.\s*\w+)", expand=False)


In [8]:
metadata["total_genes_detected"] = gene_matrix.sum(axis=1)

In [9]:
metadata["total_resistance_classes"] = resistance_matrix.sum(axis=1)

In [10]:
metadata["MDR_status"] = (metadata["total_resistance_classes"] >= 3).astype(int)

In [12]:
edges = pd.read_csv(path("strain_gene_network_edges.csv"))
print(edges.columns)

Index(['Source', 'Target', 'Species Name', 'Short'], dtype='object')


In [13]:
edges.rename(columns={"Source": "strain", "Target": "gene"}, inplace=True)


In [14]:
G = nx.from_pandas_edgelist(edges, source="strain", target="gene")


In [20]:
print("Graph created successfully!")
print("Nodes:", G.number_of_nodes())
print("Edges:", G.number_of_edges())

Graph created successfully!
Nodes: 103
Edges: 327


In [15]:
degree_dict = dict(G.degree(metadata.index))
metadata["network_degree"] = metadata.index.map(degree_dict)

In [16]:
betweenness = nx.betweenness_centrality(G, normalized=True)
metadata["network_betweenness"] = metadata.index.map(betweenness)

In [25]:
print(gene_matrix.columns.tolist())

["aac(6')-aph(2'')_1", 'ant(6)-Ia_1', "aph(3')-III_1", 'cat(pC194)_1', 'erm(B)_1', 'erm(B)_10', 'erm(B)_12', 'erm(B)_23', 'lnu(C)_1', 'lsa(C)_1', 'mef(A)_2', 'mef(A)_4', 'msr(D)_2', 'tet(32)_2', 'tet(M)_11', 'tet(M)_12', 'tet(M)_13', 'tet(M)_2', 'tet(M)_4', 'tet(M)_5', 'tet(M)_7', 'tet(M)_8', 'tet(O)_1', 'tetA(46)_1', 'tetB(46)_1']


In [26]:
def find_gene_matches(gene_list, gene_matrix_columns):
    matched = []
    for g in gene_list:
        # Find matching genes containing the pattern (ignoring case)
        hits = [col for col in gene_matrix_columns if g.lower().replace("(","").replace(")","") in col.lower().replace("(","").replace(")","")]
        matched.extend(hits)
    return list(set(matched))

In [27]:
macrolide_core = ["ermB", "mefA", "mefE", "mel", "msrD"]
tetracycline_core = ["tetM", "tetO", "tet32", "tetA", "tetB"]
fluoroquinolone_core = ["patA", "patB", "pmrA"]
aminoglycoside_core = ["aad", "SAT", "AAC", "APH"]

cols = gene_matrix.columns

macrolide_genes = find_gene_matches(macrolide_core, cols)
tetracycline_genes = find_gene_matches(tetracycline_core, cols)
fluoroquinolone_genes = find_gene_matches(fluoroquinolone_core, cols)
aminoglycoside_genes = find_gene_matches(aminoglycoside_core, cols)

print("Macrolide genes matched:", macrolide_genes)
print("Tetracycline genes matched:", tetracycline_genes)
print("Fluoroquinolone genes matched:", fluoroquinolone_genes)
print("Aminoglycoside genes matched:", aminoglycoside_genes)


Macrolide genes matched: ['erm(B)_1', 'erm(B)_23', 'erm(B)_10', 'msr(D)_2', 'mef(A)_4', 'mef(A)_2', 'erm(B)_12']
Tetracycline genes matched: ['tet(M)_8', 'tet(M)_11', 'tet(M)_4', 'tet(M)_12', 'tet(32)_2', 'tet(M)_5', 'tet(M)_2', 'tet(O)_1', 'tetB(46)_1', 'tet(M)_13', 'tet(M)_7', 'tetA(46)_1']
Fluoroquinolone genes matched: []
Aminoglycoside genes matched: ["aph(3')-III_1", "aac(6')-aph(2'')_1"]


In [28]:
def count_genes(strain, genes):
    genes_present = gene_matrix.loc[strain, genes]
    return genes_present.sum()

metadata["macrolide_gene_count"] = [count_genes(s, macrolide_genes) for s in metadata.index]
metadata["tetracycline_gene_count"] = [count_genes(s, tetracycline_genes) for s in metadata.index]
metadata["fluoroquinolone_gene_count"] = [count_genes(s, fluoroquinolone_genes) for s in metadata.index]
metadata["aminoglycoside_gene_count"] = [count_genes(s, aminoglycoside_genes) for s in metadata.index]


In [29]:
print(gene_matrix.columns.tolist())


["aac(6')-aph(2'')_1", 'ant(6)-Ia_1', "aph(3')-III_1", 'cat(pC194)_1', 'erm(B)_1', 'erm(B)_10', 'erm(B)_12', 'erm(B)_23', 'lnu(C)_1', 'lsa(C)_1', 'mef(A)_2', 'mef(A)_4', 'msr(D)_2', 'tet(32)_2', 'tet(M)_11', 'tet(M)_12', 'tet(M)_13', 'tet(M)_2', 'tet(M)_4', 'tet(M)_5', 'tet(M)_7', 'tet(M)_8', 'tet(O)_1', 'tetA(46)_1', 'tetB(46)_1']


In [30]:
def count_genes(strain, gene_list):
    genes_present = gene_matrix.loc[strain, gene_list]
    return genes_present.sum()

In [31]:
metadata["macrolide_gene_count"] = [count_genes(s, macrolide_genes) for s in metadata.index]
metadata["tetracycline_gene_count"] = [count_genes(s, tetracycline_genes) for s in metadata.index]
metadata["fluoroquinolone_gene_count"] = [count_genes(s, fluoroquinolone_genes) for s in metadata.index]
metadata["aminoglycoside_gene_count"] = [count_genes(s, aminoglycoside_genes) for s in metadata.index]


In [32]:
metadata.to_csv(path("strain_metadata.csv"))
print("\nstrain_metadata.csv Created Successfully!")
print(metadata.head())


strain_metadata.csv Created Successfully!
                                                   species  \
strain                                                       
Streptococcus_australis_GCA_900476055.1_53750_F01      NaN   
Streptococcus_australis_GCA_900636505.1_42650_H02      NaN   
Streptococcus_cristatus_GCA_000385925.1_ASM38592v1     NaN   
Streptococcus_cristatus_GCA_900475445.1_42727_F01      NaN   
Streptococcus_cristatus_GCA_900478185.1_51342_H01      NaN   

                                                    total_genes_detected  \
strain                                                                     
Streptococcus_australis_GCA_900476055.1_53750_F01                      2   
Streptococcus_australis_GCA_900636505.1_42650_H02                      2   
Streptococcus_cristatus_GCA_000385925.1_ASM38592v1                     2   
Streptococcus_cristatus_GCA_900475445.1_42727_F01                      1   
Streptococcus_cristatus_GCA_900478185.1_51342_H01                 