In [1]:
import pandas as pd

In [2]:
ranked_features = pd.read_csv('./miRNA_disease_ranked.csv')
gene_targets = pd.read_csv('./s1_s2_genes.csv')

In [3]:
ranked_features['miRNA_rank'] = ranked_features['Importance'].rank()
miRNA_ranks = ranked_features['miRNA_rank'].to_dict()

In [4]:
total_miRNA = len(ranked_features)

*Formula*:
score_gene= 1/t∑_{i=1}^{t}(1−rank_{miRNA}^{i}/total_{miRNA}) 

*Explanation*
- t: the number of miRNAs that target the gene
- rank_{miRNA}^{i}: the rank of miRNA i that targets the gene
- total_{miRNA}: the total number of miRNAs that target the gene

***High Gene Score***:
- Indicates that the gene interacts with miRNAs that are highly ranked (i.e., have a low padj value).
- A high score suggests that the gene is likely to be regulated by miRNAs that are significant in the context of differential expression analysis.
- Thus, genes with high scores are potentially more important or relevant to the biological conditions being studied.

***Low Gene Score***:
- Indicates that the gene interacts with miRNAs that are lower ranked (i.e., have a higher padj value).
- A low score suggests that the gene is less likely to be regulated by significant miRNAs.
- Such genes may be of lower priority for further investigation.

In [5]:
def calculate_gene_score(row, miRNA_ranks, total_miRNA):
    total_interactions = 0
    score = 0
    for miRNA in row.index[4:]:  # Assuming the miRNAs start from the 5th column
        if pd.notna(row[miRNA]) and row[miRNA] in miRNA_ranks:
            rank = miRNA_ranks[row[miRNA]]
            score += (1 - (rank / total_miRNA))
            total_interactions += 1
    if total_interactions > 0:
        return score / total_interactions
    else:
        return 0

In [6]:
gene_targets['gene_score'] = gene_targets.apply(calculate_gene_score, axis=1, miRNA_ranks=miRNA_ranks, total_miRNA=total_miRNA)

# Sort genes based on their scores in descending order
ranked_genes = gene_targets.sort_values(by='gene_score', ascending=False)

# Reorder columns so that 'gene_score' is the second column
columns = ['Gene Symbol', 'gene_score'] + [col for col in ranked_genes.columns if col not in ['Gene Symbol', 'gene_score']]
ranked_genes = ranked_genes[columns]


# Save ranked genes to a CSV file
ranked_genes.to_csv('ranked_genes.csv', index=False)

print(ranked_genes.shape)
# Display top ranked genes
print(ranked_genes.head(10))

(1314, 11)
    Gene Symbol  gene_score  p-value    FDR  Odd ratio  \
10        MED28    0.172414  0.00434  0.332      0.234   
24        FEM1A    0.172414  0.00731  0.384      0.264   
47     MAPKAPK5    0.137931  0.01630  0.453      0.258   
14         NFIX    0.137931  0.00545  0.332      0.189   
39        KMT2D    0.137931  0.01310  0.420      0.242   
478      SPPL2A    0.103448  0.02540  0.458      0.219   
209       FOXC1    0.103448  0.02100  0.458      0.204   
333       NACC2    0.103448  0.07710  0.458      0.341   
591      ZNF708    0.103448  0.03720  0.458      0.254   
31        MPRIP    0.103448  0.00978  0.405      0.154   

     Number of interactions        microRNA 1       microRNA 2  \
10                        5      hsa-miR-3178     hsa-miR-4439   
24                        5      hsa-miR-4472     hsa-miR-4439   
47                        4   hsa-miR-6895-3p     hsa-miR-4439   
14                        4  hsa-miR-6769b-5p  hsa-miR-4749-5p   
39                  

In [10]:
# get top 50 genes
top_genes = ranked_genes.iloc[:,1].tolist()
# print the row and the score as a string on a new line
for i in range(1001):
    if str(top_genes[i]) != 'nan':
        # print(ranked_genes.iloc[i,0], ranked_genes.iloc[i,1], sep = '\t')
        print(ranked_genes.iloc[i,0])

MED28
FEM1A
MAPKAPK5
NFIX
KMT2D
SPPL2A
FOXC1
NACC2
ZNF708
MPRIP
ZNF621
ANTXR2
ORAI2
MRNIP
ZNF701
KDM6B
C17orf105
TXNIP
KLF2
DDR2
SCD
AGO2
ARHGAP18
BARHL1
CBARP
PANK1
NDOR1
MTMR6
NUCB1
POLA2
MEX3A
MEF2D
MAP2K7
PRPS1L1
MACC1
PLA2G2C
PRRC2B
LSM14A
PLEKHA3
MRPS10
PKD1
PIGP
PDE4D
MRPS23
MTDH
MTHFD1
MTHFD2
MTPN
MTX3
LPP
PEG10
LRTOMT
IL17REL
LETMD1
KREMEN1
DCP2
DDX39B
DISC1
EBNA1BP2
EIF4EBP2
ELFN2
ELP2
FAM126B
FAM129B
FAM136A
FLVCR1
FSTL4
GATAD2A
GDI1
GOLGA8A
GOLGA8B
GPAT4
GRB2
NACC1
HIST1H2AH
SLC7A5
ICOSLG
PTCHD1
IL2RA
KCTD15
KIAA1456
KLHL12
PTBP1
RPP25
PYGO1
ZNF787
SKI
ULBP3
ULK1
UNC5B
YARS
YPEL2
ZBTB46
ZNF106
ZNF347
ZNF440
ZNF525
ZNF562
ZNF585B
ZNF799
QSOX2
ZNF8
PHAX
TSC22D2
RBM23
MKNK2
DCAF7
PLAGL2
YIPF4
HMGN2
NUDT3
LDLR
YY1
SF3B3
TXLNA
TRIM37
TRA2B
TMEM184B
R3HDM4
RAB11B
RBMS2
RCC2
RICTOR
RNF165
RNF40
RPL14
CYP4F11
RREB1
RYBP
SCAMP4
SENP2
SERTAD2
SETD1B
SLC25A45
SMG1
SMOC1
SPART
SPRY4
MIDN
TCF3
TET3
THSD4
TM9SF3
TMED4
TMEM170A
DBN1
XIAP
F8A2
RFC5
SCRT1
ADM
CD276
CHAC1
CHERP
KAT6B
TULP1
I

In [8]:
# Save genes and gene score columns to csv
ranked_genes[['Gene Symbol', 'gene_score']].to_csv('final_ranked_genes.csv', index=False)