In [3]:
import pandas as pd

In [4]:
ranked_features = pd.read_csv('./miRNA_disease_ranked.csv')
gene_targets = pd.read_csv('./ctl_s1_genes.csv')

In [5]:
ranked_features['miRNA_rank'] = ranked_features['Importance'].rank()
miRNA_ranks = ranked_features['miRNA_rank'].to_dict()

In [6]:
total_miRNA = len(ranked_features)

*Formula*:
score_gene= 1/t∑_{i=1}^{t}(1−rank_{miRNA}^{i}/total_{miRNA}) 

*Explanation*
- t: the number of miRNAs that target the gene
- rank_{miRNA}^{i}: the rank of miRNA i that targets the gene
- total_{miRNA}: the total number of miRNAs that target the gene

***High Gene Score***:
- Indicates that the gene interacts with miRNAs that are highly ranked (i.e., have a low padj value).
- A high score suggests that the gene is likely to be regulated by miRNAs that are significant in the context of differential expression analysis.
- Thus, genes with high scores are potentially more important or relevant to the biological conditions being studied.

***Low Gene Score***:
- Indicates that the gene interacts with miRNAs that are lower ranked (i.e., have a higher padj value).
- A low score suggests that the gene is less likely to be regulated by significant miRNAs.
- Such genes may be of lower priority for further investigation.

In [7]:
def calculate_gene_score(row, miRNA_ranks, total_miRNA):
    total_interactions = 0
    score = 0
    for miRNA in row.index[4:]:  # Assuming the miRNAs start from the 5th column
        if pd.notna(row[miRNA]) and row[miRNA] in miRNA_ranks:
            rank = miRNA_ranks[row[miRNA]]
            score += (1 - (rank / total_miRNA))
            total_interactions += 1
    if total_interactions > 0:
        return score / total_interactions
    else:
        return 0

In [8]:
gene_targets['gene_score'] = gene_targets.apply(calculate_gene_score, axis=1, miRNA_ranks=miRNA_ranks, total_miRNA=total_miRNA)

# Sort genes based on their scores in descending order
ranked_genes = gene_targets.sort_values(by='gene_score', ascending=False)

# Reorder columns so that 'gene_score' is the second column
columns = ['Gene Symbol', 'gene_score'] + [col for col in ranked_genes.columns if col not in ['Gene Symbol', 'gene_score']]
ranked_genes = ranked_genes[columns]


# Save ranked genes to a CSV file
ranked_genes.to_csv('ranked_genes.csv', index=False)

print(ranked_genes.shape)
# Display top ranked genes
print(ranked_genes.head(10))

(1215, 17)
    Gene Symbol  gene_score   p-value     FDR  Odd ratio  \
69        NACC1    0.132530  0.002970  0.1440      0.400   
113        NFIC    0.120482  0.008050  0.2370      0.430   
21       SETD1B    0.108434  0.000303  0.0472      0.253   
14       SCAMP4    0.108434  0.000183  0.0409      0.237   
60        KHSRP    0.108434  0.002370  0.1290      0.336   
5          CALR    0.108434  0.000054  0.0226      0.203   
393      CDKN1A    0.108434  0.077300  0.5190      0.594   
17        SPRY4    0.108434  0.000243  0.0453      0.246   
76          PKM    0.096386  0.003380  0.1470      0.323   
16          MNT    0.096386  0.000233  0.0453      0.216   

     Number of interactions       microRNA 1       microRNA 2  \
69                       11  hsa-miR-6766-5p  hsa-miR-6756-5p   
113                      10  hsa-miR-4690-5p  hsa-miR-6766-5p   
21                        9  hsa-miR-3940-5p  hsa-miR-4758-5p   
14                        9  hsa-miR-6787-5p     hsa-miR-663a   
60 

In [9]:
# get top 50 genes
top_genes = ranked_genes.iloc[:,1].tolist()
# print the row and the score as a string on a new line
for i in range(len(ranked_genes)):
    if str(top_genes[i]) != 'nan':
        # print(ranked_genes.iloc[i,0], ranked_genes.iloc[i,1], sep = '\t')
        print(ranked_genes.iloc[i,0])

NACC1
NFIC
SETD1B
SCAMP4
KHSRP
CALR
CDKN1A
SPRY4
PKM
MNT
SLC47A1
ANKRD52
MLLT1
NPR1
ABL1
ASB6
SLC7A5
NFIX
SFN
ARHGDIA
GATAD2A
SBF1
BCL2L1
CTDNEP1
RAB1B
ARL8A
METTL14
DSN1
ADGRL1
TPM3
LARP1
CRISPLD2
HDGF
ZNF385A
YWHAZ
CCNT1
CBX6
PEX26
ABI2
ALDOA
TP53
COX6B1
KMT2D
LRRC58
H2AFX
SRCIN1
UBE2Q1
AHDC1
STMN3
LSG1
C3orf36
NEUROD2
ARHGAP31
SMCR8
TLN1
ANKRD13B
RAB11B
RPS6KA4
HSP90AB1
EIF5AL1
ZNF207
UNK
SLC35C2
ZSWIM1
SENP2
ZNF556
FURIN
AP2M1
ZNF451
SNRPD1
YPEL2
AGO2
SELENON
SUMO1
MED28
ONECUT3
RHOB
PTPRF
ZNF787
PTPA
PHLDA3
KDM6B
MCFD2
IP6K1
PRRC2B
SUSD6
MAFK
ABHD12
PDE4C
MYADM
MSN
NRGN
GIGYF1
GNAI2
LMNB2
TPCN2
GDI1
FBRS
HCFC1
CAPN15
TMEM50A
ORAI2
ARHGAP39
BARHL1
KMT2A
PTPN9
KCNK3
SF3B3
NPLOC4
NUP62
SYNGR1
VAV3
DHTKD1
PNPLA6
LIMD1
FOXK1
ZNF516
COL4A3BP
REPIN1
PCGF3
ANP32B
CASP16P
RABGAP1L
CNNM4
PRRT2
GRWD1
PMPCA
TOR4A
TSPAN14
HOXB6
SBK1
STK4
PRICKLE1
PGAM1
JARID2
E2F6
CNKSR3
YY1
CDH7
FXN
TRIM65
ZNF703
C16orf58
TNRC6B
C12orf49
GPRC5A
SERBP1
TMEM167A
YWHAE
CNBP
KLHDC10
HMGB1
ZNF652
SPATA2
CRCP
CLPB


In [10]:
# Save genes and gene score columns to csv
ranked_genes[['Gene Symbol', 'gene_score']].to_csv('final_ranked_genes.csv', index=False)