In [32]:
import pandas as pd
import gseapy as gp
import matplotlib.pyplot as plt
import os

In [33]:
def map_mirnas_to_genes(mirnas, mir_tarbase):
    target_genes = set()
    for mirna in mirnas:
        targets = mir_tarbase[mir_tarbase['miRNA'] == mirna]['Target Gene']
        target_genes.update(targets)
    return list(target_genes)

In [34]:
def create_pathways_from_mir_tarbase(mir_tarbase):
    pathways = {}
    for index, row in mir_tarbase.iterrows():
        gene = row['Target Gene']
        mirna = row['miRNA'].upper()
        if gene not in pathways:
            pathways[gene] = []
        pathways[gene].append(mirna)
    return pathways


In [35]:
def perform_gsea(data, pathways, output_dir):
    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    ranked_data = data.sort_values(by='Importance', ascending=False)
    ranked_data['Feature'] = ranked_data['Feature'].str.upper()
    ranked_data.to_csv('feature_importance.rnk', sep='\t', index=False, header=False)

    with open('mirna_pathways.gmt', 'w') as f:
        for gene, mirnas in pathways.items():
            f.write(f"{gene}\tna\t" + "\t".join(mirnas) + "\n")

    pre_res = gp.prerank(rnk='feature_importance.rnk',
                         gene_sets='mirna_pathways.gmt',
                         outdir=output_dir,
                         permutation_num=100,
                         min_size=5,
                         max_size=2000,
                         seed=42)

    for term in pre_res.res2d.index[:10]:
        gp.plot.gseaplot(rank_metric=pre_res.ranking, term=term, **pre_res.results[term], ofname=f'{output_dir}/gseaplot_{term}.png')

    return pre_res

In [36]:
def perform_enrichr(data, mir_tarbase, output_dir):
    mirnas = data['Feature'].str.upper().tolist()
    gene_list = map_mirnas_to_genes(mirnas, mir_tarbase)
    print("Gene list for Enrichr (first 10 genes):")
    print(gene_list[:10])
    
    try:
        enrichr_res = gp.enrichr(
            gene_list=gene_list,
            gene_sets=[
                'KEGG_2021_Human',
                'GO_Biological_Process_2021',
                'GO_Molecular_Function_2021',
                'GO_Cellular_Component_2021',
                'Reactome_2022',
                'WikiPathways_2021_Human'
            ],
            outdir=output_dir,
            organism='human',
            cutoff=0.05
        )
        
        print("Enrichr results (first 10 rows):")
        print(enrichr_res.res2d.head(10))

    except Exception as e:
        print(f"Error performing Enrichr: {e}")
        if 'Warning: No enrich terms when cutoff' in str(e):
            print("Try lowering the cutoff value or verify the gene list and gene sets.")
        
    return enrichr_res

In [38]:

# Define file paths
s1_s2_path = '../GSEA/miRNA/50_s1_s2.csv'
s2_s3_path = '../GSEA/miRNA/50_s2_s3.csv'
s3_s4_path = '../GSEA/miRNA/50_s3_s4.csv'
mti_data_path = './miRTarBase MTI.xlsx'  # Update with the correct path

# Load miRTarBase data
mir_tarbase = pd.read_excel(mti_data_path)  # Ensure this path is correct

# Filter to include only human miRNAs and target genes
mir_tarbase = mir_tarbase[mir_tarbase['Species (miRNA)'] == 'Homo sapiens']

# Create pathways
pathways = create_pathways_from_mir_tarbase(mir_tarbase)

# Print a sample of the pathways
print("Sample pathways (first 5 genes):")
for gene, mirnas in list(pathways.items())[:5]:
    print(f"{gene}: {mirnas}")

# Load the CSV files
datasets = {
    's1_s2': pd.read_csv(s1_s2_path),
    's2_s3': pd.read_csv(s2_s3_path),
    's3_s4': pd.read_csv(s3_s4_path)
}

Sample pathways (first 5 genes):
HIF1A: ['HSA-MIR-20A-5P', 'HSA-MIR-20A-5P', 'HSA-MIR-20A-5P', 'HSA-MIR-519C-3P', 'HSA-MIR-20B-5P', 'HSA-MIR-20B-5P', 'HSA-MIR-20B-5P', 'HSA-MIR-20B-5P', 'HSA-MIR-107', 'HSA-MIR-18A-5P', 'HSA-MIR-18A-5P', 'HSA-MIR-18A-5P', 'HSA-MIR-18A-5P', 'HSA-MIR-18A-5P', 'HSA-MIR-17-5P', 'HSA-MIR-17-5P', 'HSA-MIR-17-5P', 'HSA-MIR-424-5P', 'HSA-MIR-199A-5P', 'HSA-MIR-199A-5P', 'HSA-MIR-199A-5P', 'HSA-MIR-199A-5P', 'HSA-MIR-199A-5P', 'HSA-MIR-199A-5P', 'HSA-MIR-138-5P', 'HSA-MIR-138-5P', 'HSA-MIR-199B-5P', 'HSA-MIR-33A-5P', 'HSA-MIR-33A-5P', 'HSA-MIR-106B-3P', 'HSA-LET-7B-5P', 'HSA-MIR-210-3P', 'HSA-MIR-3121-3P', 'HSA-MIR-3668', 'HSA-MIR-6815-3P', 'HSA-MIR-589-3P', 'HSA-MIR-889-3P', 'HSA-MIR-935', 'HSA-MIR-1910-3P', 'HSA-MIR-3074-3P', 'HSA-MIR-5094', 'HSA-MIR-6511A-5P', 'HSA-MIR-6755-5P', 'HSA-MIR-6806-5P', 'HSA-MIR-93-5P', 'HSA-MIR-93-5P', 'HSA-MIR-93-5P', 'HSA-MIR-106A-5P', 'HSA-MIR-106B-5P', 'HSA-MIR-106B-5P', 'HSA-MIR-519D-3P', 'HSA-MIR-4282', 'HSA-MIR-8055', 'HSA-

In [None]:
for name, data in datasets.items():
    print(f"Performing GSEA for {name}...")
    perform_gsea(data, pathways, f'GSEA_results/{name}')

    print(f"Performing Enrichr for {name}...")
    perform_enrichr(data, mir_tarbase, f'Enrichr_results/{name}')

Performing GSEA for s1_s2...
Error plotting term 0: 0
Error plotting term 1: 1
Performing Enrichr for s1_s2...
Gene list for Enrichr (first 10 genes):
[]




Error performing Enrichr: Error sending gene list, try again later


UnboundLocalError: local variable 'enrichr_res' referenced before assignment