In [1]:
import pandas as pd

In [2]:
ranked_features = pd.read_csv('./miRNA_disease_ranked.csv')
gene_targets = pd.read_csv('./s1_s2_genes.csv')

In [3]:
ranked_features['miRNA_rank'] = ranked_features['Importance'].rank()
miRNA_ranks = ranked_features['miRNA_rank'].to_dict()

In [4]:
total_miRNA = len(ranked_features)

*Formula*:
score_gene= 1/t∑_{i=1}^{t}(1−rank_{miRNA}^{i}/total_{miRNA}) 

*Explanation*
- t: the number of miRNAs that target the gene
- rank_{miRNA}^{i}: the rank of miRNA i that targets the gene
- total_{miRNA}: the total number of miRNAs that target the gene

***High Gene Score***:
- Indicates that the gene interacts with miRNAs that are highly ranked (i.e., have a low padj value).
- A high score suggests that the gene is likely to be regulated by miRNAs that are significant in the context of differential expression analysis.
- Thus, genes with high scores are potentially more important or relevant to the biological conditions being studied.

***Low Gene Score***:
- Indicates that the gene interacts with miRNAs that are lower ranked (i.e., have a higher padj value).
- A low score suggests that the gene is less likely to be regulated by significant miRNAs.
- Such genes may be of lower priority for further investigation.

In [5]:
def calculate_gene_score(row, miRNA_ranks, total_miRNA):
    total_interactions = 0
    score = 0
    for miRNA in row.index[4:]:  # Assuming the miRNAs start from the 5th column
        if pd.notna(row[miRNA]) and row[miRNA] in miRNA_ranks:
            rank = miRNA_ranks[row[miRNA]]
            score += (1 - (rank / total_miRNA))
            total_interactions += 1
    if total_interactions > 0:
        return score / total_interactions
    else:
        return 0

In [6]:
gene_targets['gene_score'] = gene_targets.apply(calculate_gene_score, axis=1, miRNA_ranks=miRNA_ranks, total_miRNA=total_miRNA)

# Sort genes based on their scores in descending order
ranked_genes = gene_targets.sort_values(by='gene_score', ascending=False)

# Reorder columns so that 'gene_score' is the second column
columns = ['Gene Symbol', 'gene_score'] + [col for col in ranked_genes.columns if col not in ['Gene Symbol', 'gene_score']]
ranked_genes = ranked_genes[columns]


# Save ranked genes to a CSV file
ranked_genes.to_csv('ranked_genes.csv', index=False)

print(ranked_genes.shape)
# Display top ranked genes
print(ranked_genes.head(10))

(209, 11)
    Gene Symbol  gene_score  p-value    FDR  Odd ratio  \
10        MED28    0.172414  0.00434  0.332      0.234   
24        FEM1A    0.172414  0.00731  0.384      0.264   
47     MAPKAPK5    0.137931  0.01630  0.453      0.258   
14         NFIX    0.137931  0.00545  0.332      0.189   
39        KMT2D    0.137931  0.01310  0.420      0.242   
0        ANTXR2    0.103448  0.00437  0.332      0.115   
126       NACC2    0.103448  0.07710  0.458      0.341   
128       ORAI2    0.103448  0.12000  0.458      0.412   
31        MPRIP    0.103448  0.00978  0.405      0.154   
34       ZNF621    0.103448  0.00880  0.405      0.148   

     Number of interactions        microRNA 1       microRNA 2  \
10                        5      hsa-miR-3178     hsa-miR-4439   
24                        5      hsa-miR-4472     hsa-miR-4439   
47                        4   hsa-miR-6895-3p     hsa-miR-4439   
14                        4  hsa-miR-6769b-5p  hsa-miR-4749-5p   
39                   

In [7]:
# get top 50 genes
top_genes = ranked_genes.iloc[:,1].tolist()
# print the row and the score as a string on a new line
for i in range(len(ranked_genes)):
    if str(top_genes[i]) != 'nan':
        # print(ranked_genes.iloc[i,0], ranked_genes.iloc[i,1], sep = '\t')
        print(ranked_genes.iloc[i,0])

MED28
FEM1A
MAPKAPK5
NFIX
KMT2D
ANTXR2
NACC2
ORAI2
MPRIP
ZNF621
ZNF708
ARHGAP18
FOXC1
TXNIP
ZNF701
C17orf105
AGO2
DDR2
BARHL1
CBARP
KDM6B
PANK1
NDOR1
SCD
MTMR6
MRNIP
SPPL2A
KLF2
RREB1
SCAMP4
SENP2
QSOX2
R3HDM4
RYBP
RAB11B
RPP25
RBMS2
RCC2
RICTOR
RNF165
PTCHD1
RNF40
RPL14
PYGO1
PKD1
PTBP1
MTX3
MEF2D
MEX3A
MRPS10
MRPS23
MTDH
MTHFD1
MTHFD2
MTPN
NUCB1
PRRC2B
PDE4D
PEG10
PIGP
SETD1B
PLA2G2C
PLEKHA3
POLA2
PRPS1L1
SERTAD2
TM9SF3
SLC25A45
LDLR
ZNF585B
ZNF787
ZNF799
ZNF8
PHAX
TSC22D2
RBM23
DCAF7
HMGN2
SF3B3
ZNF525
YY1
NUDT3
YIPF4
PLAGL2
MKNK2
SKI
MIDN
XIAP
SLC7A5
ZNF562
ZNF440
SMG1
TMEM184B
SMOC1
SPART
SPRY4
TCF3
TET3
THSD4
MACC1
TMED4
TMEM170A
TRA2B
ZNF347
TRIM37
TXLNA
ULBP3
ULK1
UNC5B
YARS
YPEL2
ZBTB46
ZNF106
MAP2K7
KCTD15
LSM14A
ACOX1
SH2D5
TOMM5
AP5M1
BRD2
SIX5
TRPC4AP
ZBTB47
ABCC6
ACACA
ADM
CHCHD3
AKR7A2
AKT1S1
ANKRD52
ANP32B
ANP32E
AP2M1
ASB6
ATXN2
B4GALT5
MOB1A
TMEM54
LRTOMT
RFC5
FBXL4
FBXO33
FKRP
INO80B
KAT6B
KIAA0040
MICALL1
RBBP5
RBX1
SCRT1
PDZD4
TULP1
UPF3A
SSX5
F8A2
F8A3
FGF1
FMO4
G

In [8]:
# Save genes and gene score columns to csv
ranked_genes[['Gene Symbol', 'gene_score']].to_csv('final_ranked_genes.csv', index=False)