In [12]:
import pandas as pd

In [13]:
ranked_features = pd.read_csv('./miRNA_disease_ranked.csv')
gene_targets = pd.read_csv('./b_c_genes.csv')

In [14]:
ranked_features['miRNA_rank'] = ranked_features['Importance'].rank()
miRNA_ranks = ranked_features['miRNA_rank'].to_dict()

In [15]:
total_miRNA = len(ranked_features)

*Formula*:
score_gene= 1/t∑_{i=1}^{t}(1−rank_{miRNA}^{i}/total_{miRNA}) 

*Explanation*
- t: the number of miRNAs that target the gene
- rank_{miRNA}^{i}: the rank of miRNA i that targets the gene
- total_{miRNA}: the total number of miRNAs that target the gene

***High Gene Score***:
- Indicates that the gene interacts with miRNAs that are highly ranked (i.e., have a low padj value).
- A high score suggests that the gene is likely to be regulated by miRNAs that are significant in the context of differential expression analysis.
- Thus, genes with high scores are potentially more important or relevant to the biological conditions being studied.

***Low Gene Score***:
- Indicates that the gene interacts with miRNAs that are lower ranked (i.e., have a higher padj value).
- A low score suggests that the gene is less likely to be regulated by significant miRNAs.
- Such genes may be of lower priority for further investigation.

In [16]:
def calculate_gene_score(row, miRNA_ranks, total_miRNA):
    total_interactions = 0
    score = 0
    for miRNA in row.index[4:]:  # Assuming the miRNAs start from the 5th column
        if pd.notna(row[miRNA]) and row[miRNA] in miRNA_ranks:
            rank = miRNA_ranks[row[miRNA]]
            score += (1 - (rank / total_miRNA))
            total_interactions += 1
    if total_interactions > 0:
        return score / total_interactions
    else:
        return 0

In [17]:
gene_targets['gene_score'] = gene_targets.apply(calculate_gene_score, axis=1, miRNA_ranks=miRNA_ranks, total_miRNA=total_miRNA)

# Sort genes based on their scores in descending order
ranked_genes = gene_targets.sort_values(by='gene_score', ascending=False)

# Reorder columns so that 'gene_score' is the second column
columns = ['Gene Symbol', 'gene_score'] + [col for col in ranked_genes.columns if col not in ['Gene Symbol', 'gene_score']]
ranked_genes = ranked_genes[columns]


# Save ranked genes to a CSV file
ranked_genes.to_csv('ranked_genes.csv', index=False)

print(ranked_genes.shape)
# Display top ranked genes
print(ranked_genes.head(10))

(3818, 15)
    Gene Symbol  gene_score   p-value    FDR  Odd ratio  \
322       NACC1    0.100000  0.039800  0.470      0.523   
300        NFIC    0.100000  0.035100  0.445      0.512   
1          ASB6    0.100000  0.000036  0.121      0.192   
50          GK5    0.088889  0.002740  0.231      0.312   
74       ZBTB7A    0.088889  0.004170  0.231      0.333   
79          PKM    0.088889  0.005270  0.252      0.346   
129       RAB5B    0.088889  0.011300  0.333      0.394   
325        GDE1    0.088889  0.041500  0.471      0.500   
12       ZNF281    0.077778  0.000480  0.157      0.208   
82       FBXL18    0.077778  0.005890  0.258      0.319   

     Number of interactions         microRNA 1       microRNA 2  \
322                       9       hsa-miR-4472  hsa-miR-6743-3p   
300                       9       hsa-miR-4483  hsa-miR-6877-5p   
1                         9    hsa-miR-6819-5p   hsa-miR-342-5p   
50                        8       hsa-miR-5572   hsa-miR-185-3p   
74  

In [18]:
# get top 50 genes
top_genes = ranked_genes.iloc[:,1].tolist()
# print the row and the score as a string on a new line
for i in range(len(ranked_genes)):
    if str(top_genes[i]) != 'nan':
        # print(ranked_genes.iloc[i,0], ranked_genes.iloc[i,1], sep = '\t')
        print(ranked_genes.iloc[i,0])

NACC1
NFIC
ASB6
GK5
ZBTB7A
PKM
RAB5B
GDE1
ZNF281
FBXL18
SETD1B
KMT2D
FOXK1
LMNB2
PRRC2B
SSBP2
RAB15
SELENON
SLC1A5
MAPK1
TMED10
MLLT1
YIPF4
CHAC1
PABPN1
PHLDA3
HS3ST1
GLUL
SKI
APOL6
CDKN1A
MPLKIP
SETD5
NAA50
FADS6
HDGF
CCND1
SLC7A5
XKR4
DSN1
HSPA4L
SCAMP4
KLHL15
MDK
SLC10A3
MPRIP
TCF23
PPIA
ISPD
ESF1
ZNF394
JPH2
RPH3AL
QSOX1
MAX
TNRC6A
TMEM43
NUDT3
UBN2
TMEM189-UBE2V1
RRP36
PFN1
DDA1
RAB10
ZNF703
PPP1R9B
TIAL1
SBNO1
TFDP2
SFT2D2
AGAP9
CNBP
FBRS
REL
SYAP1
MRPS23
DBT
SHOC2
G3BP1
NPR1
SLC35C2
DCAF7
CASTOR2
MEAF6
IL2RA
C3
NUFIP2
MDM4
CAPN15
FYTTD1
TXLNA
ARSK
CRK
AGO2
HACD4
KHSRP
SZRD1
IBA57
ABI2
PLEC
KDM6B
MKLN1
MIDN
MOGAT1
NCBP3
KMT2A
TIMM29
TMEM184B
INMT
RNF157
LRRC3C
BROX
ZNF317
TAOK1
DNAJC8
DDX39B
TLN1
SLC35E2B
CYP20A1
PGPEP1
BRPF1
YWHAZ
TXNIP
TPM3
CRKL
CSTF2
ZSCAN29
XIAP
ONECUT3
PTPN2
ULK1
ALDOA
TFPI
POMGNT1
LCOR
BCL2L1
YME1L1
DDX3X
MINOS1
ELN
UGGT1
C16orf58
C8orf58
DYRK2
YOD1
MPP2
PTGR2
SOCS7
CALCOCO2
OPTN
DCTN5
ERC1
FOLR1
KAT7
PNPLA6
MEX3A
ATP1B4
RPS6KA4
EMC3
AK3
RNF11
TRAPPC2
FEM1A

In [19]:
# Save genes and gene score columns to csv
ranked_genes[['Gene Symbol', 'gene_score']].to_csv('final_ranked_genes.csv', index=False)