In [2]:
import pandas as pd

In [3]:
ranked_features = pd.read_csv('./miRNA_disease_ranked.csv')
gene_targets = pd.read_csv('./ctl_b_genes.csv')

In [4]:
ranked_features['miRNA_rank'] = ranked_features['Importance'].rank()
miRNA_ranks = ranked_features['miRNA_rank'].to_dict()

In [5]:
total_miRNA = len(ranked_features)

*Formula*:
score_gene= 1/t∑_{i=1}^{t}(1−rank_{miRNA}^{i}/total_{miRNA}) 

*Explanation*
- t: the number of miRNAs that target the gene
- rank_{miRNA}^{i}: the rank of miRNA i that targets the gene
- total_{miRNA}: the total number of miRNAs that target the gene

***High Gene Score***:
- Indicates that the gene interacts with miRNAs that are highly ranked (i.e., have a low padj value).
- A high score suggests that the gene is likely to be regulated by miRNAs that are significant in the context of differential expression analysis.
- Thus, genes with high scores are potentially more important or relevant to the biological conditions being studied.

***Low Gene Score***:
- Indicates that the gene interacts with miRNAs that are lower ranked (i.e., have a higher padj value).
- A low score suggests that the gene is less likely to be regulated by significant miRNAs.
- Such genes may be of lower priority for further investigation.

In [6]:
def calculate_gene_score(row, miRNA_ranks, total_miRNA):
    total_interactions = 0
    score = 0
    for miRNA in row.index[4:]:  # Assuming the miRNAs start from the 5th column
        if pd.notna(row[miRNA]) and row[miRNA] in miRNA_ranks:
            rank = miRNA_ranks[row[miRNA]]
            score += (1 - (rank / total_miRNA))
            total_interactions += 1
    if total_interactions > 0:
        return score / total_interactions
    else:
        return 0

In [7]:
gene_targets['gene_score'] = gene_targets.apply(calculate_gene_score, axis=1, miRNA_ranks=miRNA_ranks, total_miRNA=total_miRNA)

# Sort genes based on their scores in descending order
ranked_genes = gene_targets.sort_values(by='gene_score', ascending=False)

# Reorder columns so that 'gene_score' is the second column
columns = ['Gene Symbol', 'gene_score'] + [col for col in ranked_genes.columns if col not in ['Gene Symbol', 'gene_score']]
ranked_genes = ranked_genes[columns]


# Save ranked genes to a CSV file
ranked_genes.to_csv('ranked_genes.csv', index=False)

print(ranked_genes.shape)
# Display top ranked genes
print(ranked_genes.head(10))

(3090, 17)
    Gene Symbol  gene_score   p-value     FDR  Odd ratio  \
146      CDKN1A    0.122222  0.022100  0.4640      0.521   
52         NFIC    0.122222  0.004460  0.2510      0.419   
57        NACC1    0.122222  0.005300  0.2730      0.428   
3       GATAD2A    0.100000  0.000046  0.0243      0.198   
50        ALDOA    0.100000  0.004340  0.2510      0.366   
41        PEX26    0.100000  0.002790  0.1950      0.342   
43       SETD1B    0.088889  0.002400  0.1950      0.305   
66        MLLT1    0.088889  0.006830  0.3010      0.361   
377      SLC7A5    0.088889  0.085300  0.6970      0.580   
106       KHSRP    0.088889  0.013200  0.3790      0.405   

     Number of interactions       microRNA 1        microRNA 2  \
146                      11  hsa-miR-7110-5p   hsa-miR-6802-5p   
52                       11     hsa-miR-6088   hsa-miR-4687-5p   
57                       11  hsa-miR-6787-5p      hsa-miR-663a   
3                         9     hsa-miR-4648    hsa-miR-92b-5p  

In [8]:
# get top 50 genes
top_genes = ranked_genes.iloc[:,1].tolist()
# print the row and the score as a string on a new line
for i in range(len(ranked_genes)):
    if str(top_genes[i]) != 'nan':
        # print(ranked_genes.iloc[i,0], ranked_genes.iloc[i,1], sep = '\t')
        print(ranked_genes.iloc[i,0])

CDKN1A
NFIC
NACC1
GATAD2A
ALDOA
PEX26
SETD1B
MLLT1
SLC7A5
KHSRP
PDE4C
ZNF556
CSK
SCAMP4
ARHGDIA
ORAI2
GDE1
CNKSR3
YIPF4
SBF1
NEUROD2
CBX8
CALR
KDM6B
C3orf36
NFIX
PKM
CLCN7
SKI
ABL1
ARL8A
RHOB
UNK
TPM3
FSCN1
SPRY4
OTUB1
SLC47A1
STMN3
ZNF516
LMNB2
ZNF385A
ZNF207
CRISPLD2
ZNF787
PHLDA3
NPR1
CTDNEP1
ASB16
LRRC58
SRCIN1
PTPRF
MAPK1
RHOA
TOR4A
SBK1
SFN
GIGYF1
FXN
ASB6
RAB11B
NRGN
CASTOR2
LARP1
PBX2
MEF2D
NPLOC4
PCGF3
POLR2E
SUMO1
MAFK
H2AFX
MIDN
COX6B1
GNAI2
HDGF
AGO2
SP110
NAA50
SYNGR1
TP53
ATL2
AKT1S1
BARHL1
DHTKD1
MNT
POU3F1
REPIN1
ARHGAP31
FOXK1
GXYLT2
TMPPE
MTSS1L
TSPYL1
THBS2
ZC3H7B
TRIM72
CDCA4
SNRPD1
KCNN3
TEX261
VAV3
MRPS16
PARP2
TOMM40
NPTXR
CASP16P
HOXB6
REXO1
RPS6KA4
UGGT1
TXNDC16
TRIM65
NUDT3
GTF3C6
SLC10A7
PTGR2
C12orf49
SENP2
SLC43A2
AFG1L
EIF5AL1
GTPBP10
GLUL
IKZF3
PRRC2B
TVP23C
BCL2L11
TRIM28
UBE2Q1
TOB2
SRD5A1
ANKRD52
MICAL2
GRB2
BCL2L1
STX4
TERF2
KCNK3
BTF3L4
DOT1L
VPS18
TRAF1
CNBP
CRCP
CACNG8
MTA1
MSI1
GPR20
PRKAR2A
HPCAL1
CBX6
ZCCHC3
PIAS4
MYADM
TIMM8A
SELENON
C10orf55
F

In [9]:
# Save genes and gene score columns to csv
ranked_genes[['Gene Symbol', 'gene_score']].to_csv('final_ranked_genes.csv', index=False)