In [1]:
import pandas as pd

In [2]:
ranked_features = pd.read_csv('./miRNA_disease_ranked.csv')
gene_targets = pd.read_csv('./s2_s3_genes.csv')

In [3]:
ranked_features['miRNA_rank'] = ranked_features['Importance'].rank()
miRNA_ranks = ranked_features['miRNA_rank'].to_dict()

In [4]:
total_miRNA = len(ranked_features)

*Formula*:
score_gene= 1/t∑_{i=1}^{t}(1−rank_{miRNA}^{i}/total_{miRNA}) 

*Explanation*
- t: the number of miRNAs that target the gene
- rank_{miRNA}^{i}: the rank of miRNA i that targets the gene
- total_{miRNA}: the total number of miRNAs that target the gene

***High Gene Score***:
- Indicates that the gene interacts with miRNAs that are highly ranked (i.e., have a low padj value).
- A high score suggests that the gene is likely to be regulated by miRNAs that are significant in the context of differential expression analysis.
- Thus, genes with high scores are potentially more important or relevant to the biological conditions being studied.

***Low Gene Score***:
- Indicates that the gene interacts with miRNAs that are lower ranked (i.e., have a higher padj value).
- A low score suggests that the gene is less likely to be regulated by significant miRNAs.
- Such genes may be of lower priority for further investigation.

In [5]:
def calculate_gene_score(row, miRNA_ranks, total_miRNA):
    total_interactions = 0
    score = 0
    for miRNA in row.index[4:]:  # Assuming the miRNAs start from the 5th column
        if pd.notna(row[miRNA]) and row[miRNA] in miRNA_ranks:
            rank = miRNA_ranks[row[miRNA]]
            score += (1 - (rank / total_miRNA))
            total_interactions += 1
    if total_interactions > 0:
        return score / total_interactions
    else:
        return 0

In [6]:
gene_targets['gene_score'] = gene_targets.apply(calculate_gene_score, axis=1, miRNA_ranks=miRNA_ranks, total_miRNA=total_miRNA)

# Sort genes based on their scores in descending order
ranked_genes = gene_targets.sort_values(by='gene_score', ascending=False)

# Reorder columns so that 'gene_score' is the second column
columns = ['Gene Symbol', 'gene_score'] + [col for col in ranked_genes.columns if col not in ['Gene Symbol', 'gene_score']]
ranked_genes = ranked_genes[columns]


# Save ranked genes to a CSV file
ranked_genes.to_csv('ranked_genes.csv', index=False)

print(ranked_genes.shape)
# Display top ranked genes
print(ranked_genes.head(10))

(877, 13)
    Gene Symbol  gene_score   p-value    FDR  Odd ratio  \
121       CELF1    0.125000  0.005260  0.592      0.320   
363       MED28    0.125000  0.004590  0.592      0.312   
0        OTUD7B    0.107143  0.000970  0.539      0.201   
2          PIGO    0.107143  0.000653  0.539      0.187   
418       NUDT3    0.107143  0.012400  0.592      0.334   
9        KDELR1    0.107143  0.001660  0.585      0.223   
570         SKI    0.107143  0.021600  0.592      0.377   
504       RAD51    0.107143  0.003950  0.592      0.264   
560      SETD1B    0.089286  0.015600  0.592      0.304   
225       FOXK1    0.089286  0.140000  0.592      0.558   

     Number of interactions        microRNA 1       microRNA 2  \
121                       7   hsa-miR-1229-3p   hsa-miR-412-3p   
363                       7   hsa-miR-6829-3p  hsa-miR-4722-3p   
0                         6   hsa-miR-1229-3p  hsa-miR-371b-5p   
2                         6   hsa-miR-371b-5p  hsa-miR-6829-3p   
418       

In [7]:
# get top 50 genes
top_genes = ranked_genes.iloc[:,1].tolist()
# print the row and the score as a string on a new line
for i in range(len(ranked_genes)):
    if str(top_genes[i]) != 'nan':
        # print(ranked_genes.iloc[i,0], ranked_genes.iloc[i,1], sep = '\t')
        print(ranked_genes.iloc[i,0])

CELF1
MED28
OTUD7B
PIGO
NUDT3
KDELR1
SKI
RAD51
SETD1B
FOXK1
HNRNPA1
KMT2D
OCIAD2
PEX26
BMPR1A
WIPF2
TIMM8A
ZNF460
KLHL36
SAMD8
FOSL2
HMGN2
HMGA1
GIGYF1
GATA6
ZNF264
DCAF7
FEM1A
IKZF3
CYP20A1
COX6B1
CALR
C12orf49
YOD1
KIF3A
IPP
RNF40
KLHL11
LRIG2
LRRC3C
MAFK
TOR4A
MIDN
BCL2L1
TMEM154
ORC6
PARD6B
PARP15
PKD1
SFT2D2
MMAB
PAICS
MAPKAPK2
SCOC
ZNF445
PAK4
SRARP
MAPK8
TNPO2
WDR12
ZNF195
ZMAT3
ZFP69B
HNRNPUL1
ZBTB3
YTHDF1
AKAP11
IMP4
INAFM1
TMOD3
MAP3K2
ITGA2
KCNK5
ACVR2B
KCNMB1
KIAA1456
GTF2H5
KIF5B
KLF2
UTP6
KNSTRN
LAX1
LLGL1
TRAPPC2
TRAF7
TRAF3IP1
VHL
HSPE1-MOB4
GSR
ZNRF3
CNBP
ETNK1
EXTL3
FAM151B
FAM153B
SLC4A4
FAM229B
FAM241A
FBXL18
TUBB2A
FBXO47
FGF19
FHL2
ZSWIM4
ALDH6A1
MBOAT2
FOXJ3
ZNF860
FSCN1
ZNF8
GAPVD1
ZNF641
GJD3
GK5
GLG1
ZNF556
GPC4
ZNF417
ZNF347
GRB2
TMEM44
TMEM151A
KCNJ12
EIF5AL1
PARP2
PCBD2
PCLAF
PDE12
PDLIM5
PDP2
SHMT1
PHAX
PHB2
SETBP1
PLAGL2
SELENON
POC1A
SEC22C
POLR2E
SCAMP4
RRP36
PRKCA
RPL7L1
RPL41
PTMA
PTPA
RPL37
PXMP4
RABGAP1L
RNF19B
RNF149
RNF11
RILPL1
ABCG8
SLC1A5
P2RY2

In [8]:
# Save genes and gene score columns to csv
ranked_genes[['Gene Symbol', 'gene_score']].to_csv('final_ranked_genes.csv', index=False)