In [16]:
import pandas as pd

In [17]:
ranked_features = pd.read_csv('./miRNA_disease_ranked.csv')
gene_targets = pd.read_csv('./ctl_s1_genes.csv')

In [18]:
ranked_features['miRNA_rank'] = ranked_features['Importance'].rank()
miRNA_ranks = ranked_features['miRNA_rank'].to_dict()

In [19]:
total_miRNA = len(ranked_features)

*Formula*:
score_gene= 1/t∑_{i=1}^{t}(1−rank_{miRNA}^{i}/total_{miRNA}) 

*Explanation*
- t: the number of miRNAs that target the gene
- rank_{miRNA}^{i}: the rank of miRNA i that targets the gene
- total_{miRNA}: the total number of miRNAs that target the gene

***High Gene Score***:
- Indicates that the gene interacts with miRNAs that are highly ranked (i.e., have a low padj value).
- A high score suggests that the gene is likely to be regulated by miRNAs that are significant in the context of differential expression analysis.
- Thus, genes with high scores are potentially more important or relevant to the biological conditions being studied.

***Low Gene Score***:
- Indicates that the gene interacts with miRNAs that are lower ranked (i.e., have a higher padj value).
- A low score suggests that the gene is less likely to be regulated by significant miRNAs.
- Such genes may be of lower priority for further investigation.

In [20]:
def calculate_gene_score(row, miRNA_ranks, total_miRNA):
    total_interactions = 0
    score = 0
    for miRNA in row.index[4:]:  # Assuming the miRNAs start from the 5th column
        if pd.notna(row[miRNA]) and row[miRNA] in miRNA_ranks:
            rank = miRNA_ranks[row[miRNA]]
            score += (1 - (rank / total_miRNA))
            total_interactions += 1
    if total_interactions > 0:
        return score / total_interactions
    else:
        return 0

In [21]:
gene_targets['gene_score'] = gene_targets.apply(calculate_gene_score, axis=1, miRNA_ranks=miRNA_ranks, total_miRNA=total_miRNA)

# Sort genes based on their scores in descending order
ranked_genes = gene_targets.sort_values(by='gene_score', ascending=False)

# Reorder columns so that 'gene_score' is the second column
columns = ['Gene Symbol', 'gene_score'] + [col for col in ranked_genes.columns if col not in ['Gene Symbol', 'gene_score']]
ranked_genes = ranked_genes[columns]


# Save ranked genes to a CSV file
ranked_genes.to_csv('ranked_genes.csv', index=False)

print(ranked_genes.shape)
# Display top ranked genes
print(ranked_genes.head(10))

(3434, 20)
    Gene Symbol  gene_score   p-value     FDR  Odd ratio  \
6         NACC1    0.152174  0.000148  0.0742      0.344   
35         NFIC    0.130435  0.001650  0.1520      0.392   
26        SPRY4    0.097826  0.000501  0.1150      0.269   
25       SETD1B    0.097826  0.000621  0.1150      0.277   
617      CDKN1A    0.097826  0.122000  0.6430      0.651   
12       SCAMP4    0.097826  0.000379  0.1080      0.260   
78         TPM3    0.086957  0.007820  0.3400      0.369   
20         NPR1    0.086957  0.000871  0.1150      0.261   
130       KHSRP    0.086957  0.015100  0.4000      0.414   
38         NFIX    0.086957  0.001730  0.1520      0.290   

     Number of interactions        microRNA 1        microRNA 2  \
6                        14   hsa-miR-6756-5p   hsa-miR-6787-5p   
35                       12   hsa-miR-4690-5p   hsa-miR-6756-5p   
26                        9   hsa-miR-6787-5p      hsa-miR-663a   
25                        9   hsa-miR-3940-5p   hsa-miR-4758

In [22]:
# get top 50 genes
top_genes = ranked_genes.iloc[:,1].tolist()
# print the row and the score as a string on a new line
for i in range(len(ranked_genes)):
    if str(top_genes[i]) != 'nan':
        # print(ranked_genes.iloc[i,0], ranked_genes.iloc[i,1], sep = '\t')
        print(ranked_genes.iloc[i,0])

NACC1
NFIC
SPRY4
SETD1B
CDKN1A
SCAMP4
TPM3
NPR1
KHSRP
NFIX
MLLT1
ALDOA
MNT
CALR
PKM
HDGF
SBF1
SLC7A5
MEF2D
PEX26
ORAI2
UBE2Q1
CTDNEP1
AGO2
SFN
CBX6
GATAD2A
ZNF556
ASB6
KDM6B
LARP1
GIGYF1
YIPF4
MED28
PDE4C
RHOA
ANKRD52
COX6B1
NUP62
ABL1
NEUROD2
SLC47A1
CRISPLD2
C3orf36
H2AFX
ZNF787
KMT2A
ZNF207
CSK
YWHAZ
NPLOC4
SKI
NF2
BCL2L1
KCNN3
FXN
HSP90AB1
SNRPD1
ZSWIM1
GDE1
METTL14
MSN
SRCIN1
TOB2
THBS1
PCGF3
MAFK
ZNF451
RNF40
ARL8A
UBXN2A
ARHGAP31
RHOB
ZNF385A
ATG9A
GNAI2
ADGRL1
BARHL1
ABHD12
PTPN9
TP53
MIDN
HOXA7
LRRC58
ATL2
SUMO1
CLCN7
AKT1S1
KMT2D
PHLDA3
DHTKD1
NAA50
CNKSR3
DIAPH1
NUFIP2
PEA15
EIF5AL1
MDK
RBM23
AP2S1
TMCO1
TRIM65
AP2M1
VAV3
VPS8
LMNB2
SF3B3
ZBTB46
FEM1B
SLC1A5
SYNGR1
NAV1
SENP2
SAMD4B
TXNDC16
PARP2
TMEM170A
SOCS7
REPIN1
PTGR2
B3GALNT2
C16orf58
MPP2
AFG1L
ZNF652
FBRS
DDI2
CASP16P
EFHD2
CCNT1
ABI2
UNK
TLN1
SELENON
TNRC6A
PPP6R1
MCFD2
ZNF641
CRCP
SLC35C2
FBXW8
TRAF1
NUCB1
DDA1
ITPRIPL2
TNRC6B
RABGAP1L
TEX261
TERF2
TSPAN14
FSCN1
CDCA4
MAPK1
TOR4A
CDC14B
CAPN15
FEM1A
BAZ2A
SBK1
TXN

In [23]:
# Save genes and gene score columns to csv
ranked_genes[['Gene Symbol', 'gene_score']].to_csv('final_ranked_genes.csv', index=False)