In [38]:

import pandas as pd
import mygene

# Load the ranked gene list with Ensembl IDs
df = pd.read_csv("ranked_gene_list1.rnk", sep="\t", header=None, names=["ensembl", "score"])

# Remove version numbers from Ensembl IDs (e.g., ENSG00000147256.12 -> ENSG00000147256)
df["ensembl"] = df["ensembl"].str.split("\\.").str[0]

# Query MyGene.info to map Ensembl → gene symbol
mg = mygene.MyGeneInfo()
query = mg.querymany(df["ensembl"].tolist(), scopes="ensembl.gene", fields="symbol", species="human")


Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
10 input query terms found dup hits:	[('ENSG00000226519', 2), ('ENSG00000276241', 4), ('ENSG00000261600', 2), ('ENSG00000249738', 2), ('E
408 input query terms found no hit:	['ENSG00000273123', 'ENSG00000260923', 'ENSG00000286911', 'ENSG00000287400', 'ENSG00000260163', 'ENS


Genes not found: Empty DataFrame
Columns: [query, _id, _score, symbol, notfound]
Index: []
Empty DataFrame
Columns: [symbol, score]
Index: []


In [41]:

# Convert to DataFrame and merge
mapped = pd.DataFrame(query)
print(mapped.head(10))

# Check for missing values or notfound genes
notfound_genes = mapped[mapped['notfound'] == False]
print(f"Genes not found: {notfound_genes}")

# Filter out notfound genes and merge with the original data
#mapped = mapped[mapped['notfound'] == True][['query', 'symbol']].dropna()
merged = df.merge(mapped, left_on="ensembl", right_on="query").drop(columns=["query"])

# Final ranked list: gene symbol + score
final_df = merged[["symbol", "score"]]

# Sort by score in descending order
final_df = final_df.sort_values("score", ascending=False)

# Save the final ranked list
final_df.to_csv("ranked_gene_list_symbols.rnk", sep="\t", index=False, header=False)

# Optionally, print the final dataframe
print(final_df.head())




             query              _id     _score      symbol notfound
0  ENSG00000147256           158763  32.865475    ARHGAP36      NaN
1  ENSG00000235689  ENSG00000235689  32.865475         NaN      NaN
2  ENSG00000235984        100873969  32.865475    GPC5-AS1      NaN
3  ENSG00000132437             1644  32.865475         DDC      NaN
4  ENSG00000204544           394263  32.865475       MUC21      NaN
5  ENSG00000282639        102723170  32.865475   IGHV3-64D      NaN
6  ENSG00000105388             1048  32.865475     CEACAM5      NaN
7  ENSG00000282651  ENSG00000282651  32.865475  IGHV5-10-1      NaN
8  ENSG00000134258            79679  32.865475       VTCN1      NaN
9  ENSG00000187714             6572  32.865475     SLC18A3      NaN
Genes not found: Empty DataFrame
Columns: [query, _id, _score, symbol, notfound]
Index: []
     symbol      score
0  ARHGAP36  13.883519
1       NaN  11.368943
2  GPC5-AS1  10.930820
3       DDC  10.412914
4     MUC21  10.379204


In [42]:
# Inspect the first few rows to understand the structure
print(final_df)


         symbol      score
0      ARHGAP36  13.883519
1           NaN  11.368943
2      GPC5-AS1  10.930820
3           DDC  10.412914
4         MUC21  10.379204
...         ...        ...
24268       NaN -11.047726
24269    TUBBP5 -12.241712
24270    PSPHP1 -13.143629
24271       NaN -21.147364
24272  SIGLEC14 -23.068357

[24273 rows x 2 columns]


In [43]:
import gseapy as gp

pre_res = gp.prerank(
    rnk="ranked_gene_list_symbols.rnk",
    gene_sets="KEGG_2019_Human",
    outdir="gsea_results",
    permutation_num=100,
    seed=2,
    min_size=5,  # Lower the minimum size of gene sets
    max_size=500,  # Increase the maximum size of gene sets
    verbose=True
)




2025-04-22 02:17:20,719 [INFO] Input gene rankings contains duplicated IDs
The order of those genes will be arbitrary, which may produce unexpected results.
2025-04-22 02:17:20,727 [INFO] Parsing data files for GSEA.............................
2025-04-22 02:17:20,756 [INFO] Enrichr library gene sets already downloaded in: /uufs/chpc.utah.edu/common/home/u0962361/.cache/gseapy, use local file
2025-04-22 02:17:20,772 [INFO] 0003 gene_sets have been filtered out when max_size=500 and min_size=5
2025-04-22 02:17:20,772 [INFO] 0305 gene_sets used for further statistical testing.....
2025-04-22 02:17:20,773 [INFO] Start to run GSEA...Might take a while..................
2025-04-22 02:17:26,360 [INFO] Congratulations. GSEApy runs successfully................

