In [1]:
import pandas as pd

In [2]:
ranked_features = pd.read_csv('./miRNA_disease_ranked.csv')
gene_targets = pd.read_csv('./s3_s4_genes.csv')

In [3]:
ranked_features['miRNA_rank'] = ranked_features['Importance'].rank()
miRNA_ranks = ranked_features['miRNA_rank'].to_dict()

In [4]:
total_miRNA = len(ranked_features)

*Formula*:
score_gene= 1/t∑_{i=1}^{t}(1−rank_{miRNA}^{i}/total_{miRNA}) 

*Explanation*
- t: the number of miRNAs that target the gene
- rank_{miRNA}^{i}: the rank of miRNA i that targets the gene
- total_{miRNA}: the total number of miRNAs that target the gene

***High Gene Score***:
- Indicates that the gene interacts with miRNAs that are highly ranked (i.e., have a low padj value).
- A high score suggests that the gene is likely to be regulated by miRNAs that are significant in the context of differential expression analysis.
- Thus, genes with high scores are potentially more important or relevant to the biological conditions being studied.

***Low Gene Score***:
- Indicates that the gene interacts with miRNAs that are lower ranked (i.e., have a higher padj value).
- A low score suggests that the gene is less likely to be regulated by significant miRNAs.
- Such genes may be of lower priority for further investigation.

In [5]:
def calculate_gene_score(row, miRNA_ranks, total_miRNA):
    total_interactions = 0
    score = 0
    for miRNA in row.index[4:]:  # Assuming the miRNAs start from the 5th column
        if pd.notna(row[miRNA]) and row[miRNA] in miRNA_ranks:
            rank = miRNA_ranks[row[miRNA]]
            score += (1 - (rank / total_miRNA))
            total_interactions += 1
    if total_interactions > 0:
        return score / total_interactions
    else:
        return 0

In [6]:
gene_targets['gene_score'] = gene_targets.apply(calculate_gene_score, axis=1, miRNA_ranks=miRNA_ranks, total_miRNA=total_miRNA)

# Sort genes based on their scores in descending order
ranked_genes = gene_targets.sort_values(by='gene_score', ascending=False)

# Reorder columns so that 'gene_score' is the second column
columns = ['Gene Symbol', 'gene_score'] + [col for col in ranked_genes.columns if col not in ['Gene Symbol', 'gene_score']]
ranked_genes = ranked_genes[columns]


# Save ranked genes to a CSV file
ranked_genes.to_csv('ranked_genes.csv', index=False)

print(ranked_genes.shape)
# Display top ranked genes
print(ranked_genes.head(10))

(659, 13)
    Gene Symbol  gene_score  p-value   FDR  Odd ratio  Number of interactions  \
189      GIGYF1    0.134615  0.00950  0.61      0.359                       7   
517      TNRC6B    0.115385  0.01230  0.61      0.335                       6   
263       LMNB2    0.115385  0.02330  0.61      0.385                       6   
532       TXNIP    0.115385  0.02200  0.61      0.380                       6   
315        NFIC    0.115385  0.04310  0.61      0.444                       6   
172       FOXK1    0.096154  0.11000  0.61      0.518                       5   
551       YWHAZ    0.096154  0.05520  0.61      0.424                       5   
287        MIDN    0.096154  0.06090  0.61      0.436                       5   
193        GLO1    0.096154  0.00897  0.61      0.266                       5   
266      LRRC58    0.096154  0.04380  0.61      0.398                       5   

           microRNA 1        microRNA 2        microRNA 3        microRNA 4  \
189   hsa-miR-6734-

In [7]:
# get top 50 genes
top_genes = ranked_genes.iloc[:,1].tolist()
# print the row and the score as a string on a new line
for i in range(len(ranked_genes)):
    if str(top_genes[i]) != 'nan':
        # print(ranked_genes.iloc[i,0], ranked_genes.iloc[i,1], sep = '\t')
        print(ranked_genes.iloc[i,0])

GIGYF1
TNRC6B
LMNB2
TXNIP
NFIC
FOXK1
YWHAZ
MIDN
GLO1
LRRC58
SBNO1
FEM1A
KIAA1191
MAP3K2
MAFK
DNAJC10
KIAA0513
FGFR1OP
IGF1R
HMGN2
NFIX
ELFN2
DGAT2
PABPN1
NUDT3
C16orf58
CDKN1A
ZBTB33
YIPF4
UNC5B
ULK1
UBE2H
TRAF6
TAOK1
SF3B3
SELENON
RUFY2
REST
RAB5B
PTBP1
PAFAH1B2
CRY2
ORAI2
ABI2
AGO2
ACOX1
ATP5A1
ZNF394
FAM83F
FAM83H
ZNF641
ZNF585B
ZNF451
FEM1C
ZNF385A
FAM212B
FOLR1
FOXC1
ZBTB7A
ZBTB46
SLC6A4
FZD9
FZR1
ZBTB37
NACC2
RBMS2
GATA6
SH3GLB1
BCL2L1
NUP62
NUMBL
SGPL1
NUCB1
DSN1
DUSP2
SHISA9
F2RL3
SHOC2
SIK1
NFIB
ENPP5
KIAA1456
ZWINT
SLC12A7
FAM102B
MNT
SETD1B
HIST2H2BE
LZIC
LSM14A
HOOK3
LRTOMT
TMEM241
TMEM214
TMEM184A
AVL9
STAT3
INO80D
SYT7
TAF1D
KREMEN1
KMT2A
KDELR1
ATP1B3
RDH11
MLLT1
SPRY4
MKNK2
WSB1
MINK1
ACACA
MEX3A
UBXN2A
METTL8
UBE2G1
METTL14
MDM2
TSR1
TRIOBP
MDFI
MCFD2
TP53
AHCYL2
NEK7
NACC1
CAPZA2
PLCE1
SBF1
PLEKHM1
C2orf48
POLR1B
SOD2
CLPB
RLIM
SAMD8
PPARGC1B
RNF11
PPM1F
PRIM1
CHEK2
C18orf32
CEP97
PVR
PRRC2B
CCDC198
CDH6
PRRG4
RPL17-C18orf32
CCDC80
PTGES3L
PTMA
PTP4A1
RPRD2
CCNT1
AKR7

In [8]:
# Save genes and gene score columns to csv
ranked_genes[['Gene Symbol', 'gene_score']].to_csv('final_ranked_genes.csv', index=False)