In [1]:
from scipy.stats import hypergeom
from collections import Counter
from statsmodels.stats.multitest import multipletests


def kegg_brite_enrichment(pvalue,df, refined=True,filtered=[]):
    col = "pvalue_refined" if refined else "pvalue"
    df_cut = df[df[col] < pvalue]
    kegg_brite = df_cut["KEGG_brite"].str.split(", ").tolist()
    KEGG_brite_terms = [item for sublist in kegg_brite for item in sublist if not any(f in item for f in filtered)]
    KEGG_brite_background = df["KEGG_brite"].str.split(", ").tolist()
    KEGG_brite_background = [item.strip() for sublist in KEGG_brite_background for item in sublist]
    KEGG_BRITE_counts = Counter(KEGG_brite_terms)
    KEGG_BRITE_background_counts = Counter(KEGG_brite_background)
    M = len(dati)  # Total number of genes
    n = len(dati[dati[col]<pvalue])  # Number of genes with the motif at p-value < pvalue

    p_values = {}
    enrichments = {}
    genes = {}
    # Ensure KEGG_BRITE column is of string type
    df['KEGG_brite'] = df['KEGG_brite'].astype(str)
    print(n,M)
    # Continue with your code...
    for term in KEGG_BRITE_counts:
        N = KEGG_BRITE_background_counts[term]
        x = KEGG_BRITE_counts[term]
        print(f"Term: {term}, x: {x}, n: {n}, N: {N}, M: {M}")
        enrichments[term] = (x / n) / (N / M)
        p_values[term] = hypergeom.sf(x - 1, M, N, n)
        mask = df_cut['KEGG_brite'].str.contains(term)
        filtered_df = df_cut[mask][['protein_id', 'product']]
        filtered_df['combined'] = filtered_df.apply(lambda row: f"{str(row['protein_id'])}({str(row['product'])})", axis=1)
        genes[term] = filtered_df['combined'].str.cat(sep=', ')

    # Correct p-values for multiple testing
    terms = list(p_values.keys())
    pvals = list(p_values.values())
    corrected_pvals_bonferroni = multipletests(pvals, method='bonferroni')[
        1]  # Change 'bonferroni' to the method you prefer
    corrected_pvals_fdr = multipletests(pvals, method='fdr_bh')[1]  # Change 'fdr_bh' to the method you prefer

    data = {'KEGG BRITE term': terms, 'p-value': pvals, 'corrected p-value Bonferroni': corrected_pvals_bonferroni,
            'corrected p-value FDR Benjamini/Hochberg': corrected_pvals_fdr}
    dfKEGG_BRITE = pd.DataFrame(data)
    dfKEGG_BRITE['enrichment'] = dfKEGG_BRITE['KEGG BRITE term'].map(enrichments)
    dfKEGG_BRITE["genes"] = dfKEGG_BRITE["KEGG BRITE term"].map(genes)
    dfKEGG_BRITE = dfKEGG_BRITE.sort_values(by='p-value', ascending=True)
    return dfKEGG_BRITE


In [15]:
import pandas as pd
from scipy.stats import hypergeom
from collections import Counter
from statsmodels.stats.multitest import multipletests

def kegg_brite_enrichment2(pvalue,df, refined=True, filtered=[]):
    col = "pvalue_refined" if refined else "pvalue"
    df_cut = df[df[col] < pvalue]

    # Process KEGG BRITE terms
    kegg_brite = df_cut["KEGG_brite"].str.split(", ").explode()
    kegg_brite = kegg_brite[~kegg_brite.isin(filtered)].str.replace("\n", "")
    KEGG_brite_terms = kegg_brite.tolist()

    KEGG_brite_background = df["KEGG_brite"].str.split(", ").explode()
    KEGG_brite_background = KEGG_brite_background[~KEGG_brite_background.isin(filtered)].str.replace("\n", "")
    KEGG_brite_background = KEGG_brite_background.tolist()

    # Count occurrences
    KEGG_brite_counts = Counter(KEGG_brite_terms)
    KEGG_brite_background_counts = Counter(KEGG_brite_background)

    M = len(dati)
    n = len(df_cut)

    p_values = {}
    enrichments = {}
    genes = {}

    # Ensure KEGG_brite column is of string type
    df['KEGG_brite'] = df['KEGG_brite'].astype(str)

    for term, x in KEGG_brite_counts.items():
        N = KEGG_brite_background_counts[term]
        print(f"Term: {term}, x: {x}, n: {n}, N: {N}, M: {M}")
        enrichments[term] = (x / n) / (N / M)
        p_values[term] = hypergeom.sf(x - 1, M, N, n)

        mask = df_cut['KEGG_brite'].str.contains(term)
        filtered_df = df_cut[mask][['protein_id', 'product']]
        filtered_df['combined'] = filtered_df.apply(lambda row: f"{row['protein_id']}({row['product']})", axis=1)
        genes[term] = ', '.join(filtered_df['combined'].astype(str))

    # Correct p-values for multiple testing
    terms = list(p_values.keys())
    pvals = list(p_values.values())
    corrected_pvals_bonferroni = multipletests(pvals, method='bonferroni')[1]
    corrected_pvals_fdr = multipletests(pvals, method='fdr_bh')[1]

    data = {
        'KEGG BRITE term': terms,
        'p-value': pvals,
        'corrected p-value Bonferroni': corrected_pvals_bonferroni,
        'corrected p-value FDR Benjamini/Hochberg': corrected_pvals_fdr
    }

    dfKEGG_brite = pd.DataFrame(data)
    dfKEGG_brite['enrichment'] = dfKEGG_brite['KEGG BRITE term'].map(enrichments)
    dfKEGG_brite['genes'] = dfKEGG_brite['KEGG BRITE term'].map(genes)
    dfKEGG_brite = dfKEGG_brite.sort_values(by='p-value', ascending=True)

    return dfKEGG_brite

In [2]:
import pandas as pd
dati=pd.read_csv("/home/davide/Desktop/operoniscorenew.csv")
dfKEGG=dati.dropna(subset=["KEGG_brite"]).copy()
dfpathways=dati.dropna(subset=["pathways"]).copy()

In [31]:

from scipy.stats import hypergeom
from collections import Counter
from statsmodels.stats.multitest import multipletests
from scipy.stats import false_discovery_control

def kegg_pathways_enrichment(pvalue,df,refined=True,filtered=[]):

    col="pvalue_refined" if refined else "pvalue"
    df_cut=df[df[col]<pvalue]
    kegg_pathways = df_cut["pathways"].str.split(", ").tolist()
    kegg_pathways = [item for sublist in kegg_pathways for item in sublist if not any(f in item for f in filtered)]
    kegg_pathways=[p.replace("\n","") for p in kegg_pathways]
    KEGG_pathways_terms = kegg_pathways
    KEGG_pathways_background = df["pathways"].str.split(", ").tolist()
    KEGG_pathways_background = [item for sublist in KEGG_pathways_background for item in sublist if not any(f in item for f in filtered) and item!="None"]
    KEGG_pathways_background=[p.replace("\n","") for p in KEGG_pathways_background]
    
    KEGG_pathways_counts = Counter(KEGG_pathways_terms)
    KEGG_pathways_background_counts = Counter(KEGG_pathways_background)
    M=len(dati)
    n=len(dati[dati[col]<pvalue])
    p_values = {}
    enrichments = {}
    genes = {}
    # Ensure KEGG_pathways column is of string type
    df['pathways'] = df['pathways'].astype(str)
    print(M,n)
    for term in KEGG_pathways_counts:
        x=KEGG_pathways_counts[term]
        N=KEGG_pathways_background_counts[term]
        enrichments[term] = (x/n)/(N/M)
        p_values[term] = hypergeom.sf(x-1, M, N, n)
        mask = df_cut['pathways'].str.contains(term)
        filtered_df = df_cut[mask][['protein_id', 'product']]
        filtered_df['combined'] = filtered_df.apply(lambda row: f"{str(row['protein_id'])}({str(row['product'])})", axis=1)
        filtered_df['combined'] = filtered_df['combined'].astype(str)
        genes[term] = filtered_df['combined'].str.cat(sep=', ')
    
    # Correct p-values for multiple testing
    terms = list(p_values.keys())
    pvals = list(p_values.values())
    corrected_pvals_bonferroni = multipletests(pvals, method='bonferroni')[1]
    corrected_pvals_fdr = multipletests(pvals, method='fdr_bh')[1]
    
    data = {'KEGG Pathway term': terms, 'p-value': pvals, 'corrected p-value Bonferroni': corrected_pvals_bonferroni, 'corrected p-value FDR Benjamini/Hochberg': corrected_pvals_fdr}
    dfKEGG_pathways = pd.DataFrame(data)
    dfKEGG_pathways['enrichment'] = dfKEGG_pathways['KEGG Pathway term'].map(enrichments)
    dfKEGG_pathways["genes"] = dfKEGG_pathways["KEGG Pathway term"].map(genes)
    dfKEGG_pathways = dfKEGG_pathways.sort_values(by='p-value', ascending=True)
    return (dfKEGG_pathways,KEGG_pathways_background)

In [32]:
with open("/home/davide/Desktop/filteredpathways.txt", "r") as f:
    filtered = f.readlines()
filtered= [x.strip() for x in filtered]
dfKEGG_pathways,back=kegg_pathways_enrichment(0.01,dfpathways,refined=True,filtered=filtered)
dfKEGG_pathways

4635 88


Unnamed: 0,KEGG Pathway term,p-value,corrected p-value Bonferroni,corrected p-value FDR Benjamini/Hochberg,enrichment,genes
0,Homologous recombination,0.000466,0.013047,0.013047,10.534091,"WP_250121493.1(recombinase RecA), WP_250122517..."
3,DNA replication,0.001285,0.035993,0.017996,13.167614,WP_250122517.1(single-stranded DNA-binding pro...
6,Thiamine metabolism,0.020788,0.582058,0.194019,8.778409,"WP_250123667.1(thiamine-phosphate kinase), WP_..."
4,Mismatch repair,0.044781,1.0,0.196928,5.852273,WP_250122517.1(single-stranded DNA-binding pro...
10,Carbon fixation pathways in prokaryotes,0.064407,1.0,0.196928,4.788223,"WP_250121210.1(acetate kinase), WP_250125001.1..."
12,Taurine and hypotaurine metabolism,0.073832,1.0,0.196928,13.167614,WP_250121210.1(acetate kinase)
20,Cysteine and methionine metabolism,0.075117,1.0,0.196928,4.389205,"WP_250125000.1(adenosylhomocysteinase), WP_250..."
25,Coumarin biosynthesis,0.091431,1.0,0.196928,10.534091,WP_250125002.1(methionine adenosyltransferase)
24,Gramine biosynthesis,0.091431,1.0,0.196928,10.534091,
23,Benzoxazinoid biosynthesis,0.091431,1.0,0.196928,10.534091,WP_250125002.1(methionine adenosyltransferase)


In [33]:
#open /home/davide/Desktop/filteredpathways.txt and make a list of the string for every line
with open("/home/davide/Desktop/filteredpathways.txt", "r") as f:
    filtered = f.readlines()
filtered= [x.strip() for x in filtered]
dfKEGG_pathways,back=kegg_pathways_enrichment(0.01,dfpathways,refined=False,filtered=filtered)
dfKEGG_pathways.round(6).to_excel("/home/davide/Desktop/KEGG_pathways_enrichment01new.xlsx",index=False)
dfKEGG_pathways,back=kegg_pathways_enrichment(0.05,dfpathways,refined=False,filtered=filtered)
dfKEGG_pathways.round(6).to_excel("/home/davide/Desktop/KEGG_pathways_enrichment05new.xlsx",index=False)
dfKEGG_pathways,back=kegg_pathways_enrichment(0.01,dfpathways,refined=True,filtered=filtered)
dfKEGG_pathways.round(6).to_excel("/home/davide/Desktop/KEGG_pathways_enrichment01refinednew.xlsx",index=False)
dfKEGG_pathways,back=kegg_pathways_enrichment(0.05,dfpathways,refined=True,filtered=filtered)
dfKEGG_pathways.round(6).to_excel("/home/davide/Desktop/KEGG_pathways_enrichment05refinednew.xlsx",index=False)


4635 86
4635 281
4635 88
4635 235


In [7]:
KEGGbrite05=kegg_brite_enrichment(0.05,dfKEGG,refined=True,filtered=["Mitochondrial"])
KEGGbrite01=kegg_brite_enrichment(0.01,dfKEGG,refined=True,filtered=["Mitochondrial"])
#transform all values to at max 6 decimal places
KEGGbrite05=KEGGbrite05.round(6)
KEGGbrite01=KEGGbrite01.round(6)
KEGGbrite05.to_csv("/home/davide/Desktop/KEGGbrite_enrichment05new.csv",index=False)
KEGGbrite01.to_csv("/home/davide/Desktop/KEGGbrite_enrichment01new.csv",index=False)

235 4635
Term: ko03400 DNA repair and recombination proteins, x: 9, n: 235, N: 57, M: 4635
Term: ko01002 Peptidases and inhibitors, x: 6, n: 235, N: 59, M: 4635
Term: ko03009 Ribosome biogenesis, x: 2, n: 235, N: 37, M: 4635
Term: ko03032 DNA replication proteins, x: 5, n: 235, N: 19, M: 4635
Term: ko01504 Antimicrobial resistance genes, x: 1, n: 235, N: 6, M: 4635
Term: ko03016 Transfer RNA biogenesis, x: 2, n: 235, N: 66, M: 4635
Term: ko02048 Prokaryotic defense system, x: 3, n: 235, N: 38, M: 4635
Term: ko03036 Chromosome and associated proteins, x: 3, n: 235, N: 39, M: 4635
Term: ko01009 Protein phosphatases and associated proteins, x: 1, n: 235, N: 3, M: 4635
Term: ko04147 Exosome, x: 2, n: 235, N: 43, M: 4635
Term: ko02000 Transporters, x: 10, n: 235, N: 305, M: 4635
Term: ko01005 Lipopolysaccharide biosynthesis proteins, x: 1, n: 235, N: 16, M: 4635
Term: ko03011 Ribosome, x: 2, n: 235, N: 54, M: 4635
Term: ko03019 Messenger RNA biogenesis, x: 1, n: 235, N: 17, M: 4635
Term: ko

In [6]:
KEGGbrite05

Unnamed: 0,KEGG BRITE term,p-value,corrected p-value Bonferroni,corrected p-value FDR Benjamini/Hochberg,enrichment,genes
0,ko03400 DNA repair and recombination proteins,0.00202,0.04444,0.022864,3.114222,"WP_250121493.1(recombinase RecA), WP_250121220..."
3,ko03032 DNA replication proteins,0.002079,0.045727,0.022864,5.19037,WP_250122517.1(single-stranded DNA-binding pro...
1,ko01002 Peptidases and inhibitors,0.0765,1.0,0.560999,2.00577,WP_250121220.1(transcriptional repressor LexA)...
8,ko01009 Protein phosphatases and associated pr...,0.144552,1.0,0.795034,6.574468,WP_250125000.1(adenosylhomocysteinase)
4,ko01504 Antimicrobial resistance genes,0.268284,1.0,0.943687,3.287234,WP_250121209.1(phosphotransferase)
6,ko02048 Prokaryotic defense system,0.302748,1.0,0.943687,1.557111,WP_250125134.1(cysteine desulfurase family pro...
7,ko03036 Chromosome and associated proteins,0.316881,1.0,0.943687,1.517185,"WP_250124608.1(Dps family protein), WP_2501216..."
20,ko03012 Translation factors,0.517847,1.0,0.943687,1.408815,WP_250123877.1(ribosome recycling factor)
11,ko01005 Lipopolysaccharide biosynthesis proteins,0.565645,1.0,0.943687,1.232713,WP_250123003.1(glycosyltransferase family 2 pr...
2,ko03009 Ribosome biogenesis,0.566905,1.0,0.943687,1.06613,WP_250121246.1(16S rRNA (adenine(1518)-N(6)/ad...
