In [2]:
import pandas as pd

In [3]:
ranked_features = pd.read_csv('./miRNA_disease_ranked.csv')
gene_targets = pd.read_csv('./ctl_c_genes.csv')

In [4]:
ranked_features['miRNA_rank'] = ranked_features['Importance'].rank()
miRNA_ranks = ranked_features['miRNA_rank'].to_dict()

In [5]:
total_miRNA = len(ranked_features)

*Formula*:
score_gene= 1/t∑_{i=1}^{t}(1−rank_{miRNA}^{i}/total_{miRNA}) 

*Explanation*
- t: the number of miRNAs that target the gene
- rank_{miRNA}^{i}: the rank of miRNA i that targets the gene
- total_{miRNA}: the total number of miRNAs that target the gene

***High Gene Score***:
- Indicates that the gene interacts with miRNAs that are highly ranked (i.e., have a low padj value).
- A high score suggests that the gene is likely to be regulated by miRNAs that are significant in the context of differential expression analysis.
- Thus, genes with high scores are potentially more important or relevant to the biological conditions being studied.

***Low Gene Score***:
- Indicates that the gene interacts with miRNAs that are lower ranked (i.e., have a higher padj value).
- A low score suggests that the gene is less likely to be regulated by significant miRNAs.
- Such genes may be of lower priority for further investigation.

In [6]:
def calculate_gene_score(row, miRNA_ranks, total_miRNA):
    total_interactions = 0
    score = 0
    for miRNA in row.index[4:]:  # Assuming the miRNAs start from the 5th column
        if pd.notna(row[miRNA]) and row[miRNA] in miRNA_ranks:
            rank = miRNA_ranks[row[miRNA]]
            score += (1 - (rank / total_miRNA))
            total_interactions += 1
    if total_interactions > 0:
        return score / total_interactions
    else:
        return 0

In [7]:
gene_targets['gene_score'] = gene_targets.apply(calculate_gene_score, axis=1, miRNA_ranks=miRNA_ranks, total_miRNA=total_miRNA)

# Sort genes based on their scores in descending order
ranked_genes = gene_targets.sort_values(by='gene_score', ascending=False)

# Reorder columns so that 'gene_score' is the second column
columns = ['Gene Symbol', 'gene_score'] + [col for col in ranked_genes.columns if col not in ['Gene Symbol', 'gene_score']]
ranked_genes = ranked_genes[columns]


# Save ranked genes to a CSV file
ranked_genes.to_csv('ranked_genes.csv', index=False)

print(ranked_genes.shape)
# Display top ranked genes
print(ranked_genes.head(10))

(3486, 17)
    Gene Symbol  gene_score   p-value    FDR  Odd ratio  \
47        NACC1    0.127907  0.004400  0.326      0.419   
117        NFIC    0.116279  0.011300  0.410      0.450   
320      CDKN1A    0.116279  0.045400  0.493      0.560   
25        ORAI2    0.116279  0.002370  0.307      0.362   
3          NPR1    0.104651  0.000112  0.126      0.222   
241      SLC7A5    0.104651  0.032100  0.438      0.504   
9        SETD1B    0.104651  0.000438  0.159      0.265   
20       SCAMP4    0.093023  0.001350  0.230      0.279   
54     CRISPLD2    0.093023  0.005720  0.356      0.351   
11       ZNF556    0.093023  0.000476  0.159      0.239   

     Number of interactions       microRNA 1        microRNA 2  \
47                       11  hsa-miR-6756-5p   hsa-miR-6787-5p   
117                      10  hsa-miR-4690-5p   hsa-miR-6756-5p   
320                      10    hsa-miR-1260b   hsa-miR-6802-5p   
25                       10     hsa-miR-320a  hsa-miR-6769a-5p   
3        

In [10]:
# get top 50 genes
top_genes = ranked_genes.iloc[:,1].tolist()
# print the row and the score as a string on a new line
for i in range(len(ranked_genes)):
    if str(top_genes[i]) != 'nan':
        # print(ranked_genes.iloc[i,0], ranked_genes.iloc[i,1], sep = '\t')
        print(ranked_genes.iloc[i,0])

NACC1
NFIC
CDKN1A
ORAI2
NPR1
SLC7A5
SETD1B
SCAMP4
CRISPLD2
ZNF556
ALDOA
SPRY4
KHSRP
CALR
GIGYF1
ASB6
CSK
PEX26
LRRC58
SNRPD1
MLLT1
NFIX
MAFK
YIPF4
ABI2
MEF2D
AGO2
ZNF207
TPM3
PDE4C
PKM
SLC47A1
SBF1
SFN
NEUROD2
ARHGAP31
DSN1
PPP6R1
SRCIN1
ARHGDIA
HMGB1
ATL2
PMPCA
HDGF
CNNM4
MNT
PCGF3
ZSWIM1
COX6B1
MIDN
MRPS16
LARP1
ABHD12
PTPN9
METTL14
NF2
STMN3
ZNF787
BCL2L1
LDLR
ZNF385A
GDI1
TRAF6
B3GALNT2
C3
ABL1
ZNF451
DIAPH1
RHOB
CCNT1
CDC14B
CRCP
UGGT1
TP53
C3orf36
PTGR2
PRKAR2A
PHLDA3
UBE2Q1
H2AFX
GATAD2A
FSCN1
SAR1A
TMCO1
GTPBP10
PEA15
NUP62
NQO2
GPRC5A
GTF3C6
SOCS7
ONECUT3
F2
LIX1L
SLC27A1
YWHAZ
TOB2
ZNF573
MPP2
ZNF652
MDK
TRIM72
NUDT3
DHTKD1
REXO1
CNKSR3
GRB2
PTGES2
NAA50
RPH3AL
TRPM7
ANP32B
MRI1
ZBTB3
SIGLEC9
HAUS3
FEM1A
PTPRF
TNRC6B
WDR92
ANKRD52
MED28
CTDNEP1
POLR2E
TSPAN14
DDA1
ARL8A
KCNN3
ADGRL1
ZNF516
MCFD2
TERF2
REPIN1
TIMM29
GXYLT2
RNF40
SENP2
SLC43A2
EXOSC2
TOMM40
VPS8
IP6K1
GPAT4
DNAJC10
KDM6B
PLEKHG2
LMNB2
SELENON
RAB11B
UBXN2A
TMEM170A
TRUB2
KMT2A
TCF3
OTUB1
AFG1L
HCFC1
HRNR
HIST1H

In [9]:
# Save genes and gene score columns to csv
ranked_genes[['Gene Symbol', 'gene_score']].to_csv('final_ranked_genes.csv', index=False)