In [2]:
import pandas as pd

In [3]:
ranked_features = pd.read_csv('./miRNA_disease_ranked.csv')
gene_targets = pd.read_csv('./s3_s4_genes.csv')

In [4]:
ranked_features['miRNA_rank'] = ranked_features['Importance'].rank()
miRNA_ranks = ranked_features['miRNA_rank'].to_dict()

In [5]:
total_miRNA = len(ranked_features)

*Formula*:
score_gene= 1/t∑_{i=1}^{t}(1−rank_{miRNA}^{i}/total_{miRNA}) 

*Explanation*
- t: the number of miRNAs that target the gene
- rank_{miRNA}^{i}: the rank of miRNA i that targets the gene
- total_{miRNA}: the total number of miRNAs that target the gene

***High Gene Score***:
- Indicates that the gene interacts with miRNAs that are highly ranked (i.e., have a low padj value).
- A high score suggests that the gene is likely to be regulated by miRNAs that are significant in the context of differential expression analysis.
- Thus, genes with high scores are potentially more important or relevant to the biological conditions being studied.

***Low Gene Score***:
- Indicates that the gene interacts with miRNAs that are lower ranked (i.e., have a higher padj value).
- A low score suggests that the gene is less likely to be regulated by significant miRNAs.
- Such genes may be of lower priority for further investigation.

In [6]:
def calculate_gene_score(row, miRNA_ranks, total_miRNA):
    total_interactions = 0
    score = 0
    for miRNA in row.index[4:]:  # Assuming the miRNAs start from the 5th column
        if pd.notna(row[miRNA]) and row[miRNA] in miRNA_ranks:
            rank = miRNA_ranks[row[miRNA]]
            score += (1 - (rank / total_miRNA))
            total_interactions += 1
    if total_interactions > 0:
        return score / total_interactions
    else:
        return 0

In [7]:
gene_targets['gene_score'] = gene_targets.apply(calculate_gene_score, axis=1, miRNA_ranks=miRNA_ranks, total_miRNA=total_miRNA)

# Sort genes based on their scores in descending order
ranked_genes = gene_targets.sort_values(by='gene_score', ascending=False)

# Reorder columns so that 'gene_score' is the second column
columns = ['Gene Symbol', 'gene_score'] + [col for col in ranked_genes.columns if col not in ['Gene Symbol', 'gene_score']]
ranked_genes = ranked_genes[columns]


# Save ranked genes to a CSV file
ranked_genes.to_csv('ranked_genes.csv', index=False)

print(ranked_genes.shape)
# Display top ranked genes
print(ranked_genes.head(10))

(2669, 13)
     Gene Symbol  gene_score  p-value   FDR  Odd ratio  \
492       GIGYF1    0.134615   0.0095  0.61      0.359   
1410       TXNIP    0.115385   0.0220  0.61      0.380   
1369      TNRC6B    0.115385   0.0123  0.61      0.335   
837         NFIC    0.115385   0.0431  0.61      0.444   
703        LMNB2    0.115385   0.0233  0.61      0.385   
1470       YWHAZ    0.096154   0.0552  0.61      0.424   
456        FOXK1    0.096154   0.1100  0.61      0.518   
1123       SBNO1    0.096154   0.0132  0.61      0.292   
712       LRRC58    0.096154   0.0438  0.61      0.398   
766         MIDN    0.096154   0.0609  0.61      0.436   

      Number of interactions        microRNA 1        microRNA 2  \
492                        7   hsa-miR-6734-3p      hsa-miR-4516   
1410                       6   hsa-miR-520d-3p      hsa-miR-3975   
1369                       6  hsa-miR-29b-1-5p  hsa-miR-6715b-5p   
837                        6   hsa-miR-4727-5p   hsa-miR-6756-5p   
703       

In [8]:
# get top 50 genes
top_genes = ranked_genes.iloc[:,1].tolist()
# print the row and the score as a string on a new line
for i in range(1001):
    if str(top_genes[i]) != 'nan':
        # print(ranked_genes.iloc[i,0], ranked_genes.iloc[i,1], sep = '\t')
        print(ranked_genes.iloc[i,0])

GIGYF1
TXNIP
TNRC6B
NFIC
LMNB2
YWHAZ
FOXK1
SBNO1
LRRC58
MIDN
GLO1
ULK1
UNC5B
YIPF4
DGAT2
ZBTB33
MAP3K2
ATP5A1
FEM1A
FGFR1OP
KIAA1191
KIAA0513
MAFK
TRAF6
UBE2H
IGF1R
REST
RAB5B
CRY2
RUFY2
PTBP1
SELENON
SF3B3
DNAJC10
PAFAH1B2
PABPN1
ORAI2
NUDT3
TAOK1
ELFN2
NFIX
CDKN1A
C16orf58
AGO2
ACOX1
ABI2
HMGN2
CCNT1
ACACA
NEK7
NFIB
CAPZA2
TAF1D
SYT7
DSN1
STAT3
NUCB1
NUMBL
C6orf223
NUP62
CCDC198
SPRY4
CCDC80
DUSP2
NACC2
ENPP5
PEA15
MINK1
HIST2H2BE
UBXN2A
FAM102B
UBE2G1
C10orf76
C11orf54
F2RL3
TSR1
TRIOBP
TMEM184A
MKNK2
TP53
MLLT1
MNT
C18orf32
TMEM241
C2orf48
TMEM214
SLC6A4
GATA6
MEX3A
DCAF7
ZNF460
SBF1
PTPN14
SAMD8
PVR
RPRD2
RPL17-C18orf32
CRK
CSNK2A1
SCOC
RNF11
RLIM
CUL3
RGMB
RFC2
RDH11
RBMS2
RBM23
SCAMP4
PTP4A1
PLCE1
PPARGC1B
PLEKHM1
SLC12A7
SIK1
POLR1B
CEP97
SHOC2
SHISA9
SH3GLB1
CHEK2
PTMA
SGPL1
PPM1F
PRIM1
SETD1B
PRRC2B
PRRG4
CLPB
PTGES3L
CDH6
RBM12B
KMT2A
AVL9
AKR7A2
KREMEN1
MDFI
FZR1
FZD9
HOOK3
ZNF394
ZWINT
MCFD2
ZBTB46
ANP32B
FOLR1
XIAP
FAM83F
FAM83H
FOXC1
ZNF385A
LSM14A
ATP1B3
ZBTB7A
FAM212B


In [9]:
# Save genes and gene score columns to csv
ranked_genes[['Gene Symbol', 'gene_score']].to_csv('final_ranked_genes.csv', index=False)