In [3]:
import pandas as pd

In [4]:
ranked_features = pd.read_csv('./miRNA_disease_ranked.csv')
gene_targets = pd.read_csv('./s2_s3_genes.csv')

In [5]:
ranked_features['miRNA_rank'] = ranked_features['Importance'].rank()
miRNA_ranks = ranked_features['miRNA_rank'].to_dict()

In [6]:
total_miRNA = len(ranked_features)

*Formula*:
score_gene= 1/t∑_{i=1}^{t}(1−rank_{miRNA}^{i}/total_{miRNA}) 

*Explanation*
- t: the number of miRNAs that target the gene
- rank_{miRNA}^{i}: the rank of miRNA i that targets the gene
- total_{miRNA}: the total number of miRNAs that target the gene

***High Gene Score***:
- Indicates that the gene interacts with miRNAs that are highly ranked (i.e., have a low padj value).
- A high score suggests that the gene is likely to be regulated by miRNAs that are significant in the context of differential expression analysis.
- Thus, genes with high scores are potentially more important or relevant to the biological conditions being studied.

***Low Gene Score***:
- Indicates that the gene interacts with miRNAs that are lower ranked (i.e., have a higher padj value).
- A low score suggests that the gene is less likely to be regulated by significant miRNAs.
- Such genes may be of lower priority for further investigation.

In [7]:
def calculate_gene_score(row, miRNA_ranks, total_miRNA):
    total_interactions = 0
    score = 0
    for miRNA in row.index[4:]:  # Assuming the miRNAs start from the 5th column
        if pd.notna(row[miRNA]) and row[miRNA] in miRNA_ranks:
            rank = miRNA_ranks[row[miRNA]]
            score += (1 - (rank / total_miRNA))
            total_interactions += 1
    if total_interactions > 0:
        return score / total_interactions
    else:
        return 0

In [8]:
gene_targets['gene_score'] = gene_targets.apply(calculate_gene_score, axis=1, miRNA_ranks=miRNA_ranks, total_miRNA=total_miRNA)

# Sort genes based on their scores in descending order
ranked_genes = gene_targets.sort_values(by='gene_score', ascending=False)

# Reorder columns so that 'gene_score' is the second column
columns = ['Gene Symbol', 'gene_score'] + [col for col in ranked_genes.columns if col not in ['Gene Symbol', 'gene_score']]
ranked_genes = ranked_genes[columns]


# Save ranked genes to a CSV file
ranked_genes.to_csv('ranked_genes.csv', index=False)

print(ranked_genes.shape)
# Display top ranked genes
print(ranked_genes.head(10))

(3242, 13)
     Gene Symbol  gene_score   p-value    FDR  Odd ratio  \
908        MED28    0.125000  0.004590  0.592      0.312   
275        CELF1    0.125000  0.005260  0.592      0.320   
0         OTUD7B    0.107143  0.000970  0.539      0.201   
2           PIGO    0.107143  0.000653  0.539      0.187   
1240       RAD51    0.107143  0.003950  0.592      0.264   
9         KDELR1    0.107143  0.001660  0.585      0.223   
1038       NUDT3    0.107143  0.012400  0.592      0.334   
1421         SKI    0.107143  0.021600  0.592      0.377   
1391      SETD1B    0.089286  0.015600  0.592      0.304   
1588      TIMM8A    0.089286  0.010600  0.592      0.276   

      Number of interactions        microRNA 1       microRNA 2  \
908                        7   hsa-miR-6829-3p  hsa-miR-4722-3p   
275                        7   hsa-miR-1229-3p   hsa-miR-412-3p   
0                          6   hsa-miR-1229-3p  hsa-miR-371b-5p   
2                          6   hsa-miR-371b-5p  hsa-miR-6829

In [12]:
# get top 50 genes
top_genes = ranked_genes.iloc[:,1].tolist()
# print the row and the score as a string on a new line
for i in range(1001):
    if str(top_genes[i]) != 'nan':
        # print(ranked_genes.iloc[i,0], ranked_genes.iloc[i,1], sep = '\t')
        print(ranked_genes.iloc[i,0])

MED28
CELF1
OTUD7B
PIGO
RAD51
KDELR1
NUDT3
SKI
SETD1B
TIMM8A
PEX26
WIPF2
OCIAD2
KMT2D
FOXK1
HNRNPA1
KLHL36
ZNF460
BMPR1A
TMEM154
BCL2L1
HMGA1
IPP
SAMD8
PKD1
C12orf49
SFT2D2
HMGN2
IKZF3
FEM1A
FOSL2
MIDN
GIGYF1
GATA6
PARP15
KIF3A
TOR4A
PARD6B
CYP20A1
MMAB
SRARP
ZNF445
MAPK8
MAPKAPK2
PAK4
SCOC
MAFK
LRRC3C
LRIG2
ZNF264
COX6B1
RNF40
CALR
YOD1
ORC6
PAICS
DCAF7
KLHL11
KCNMB1
KLF2
NACC1
GTF2H5
KIF5B
MBOAT2
KIAA1456
MAP3K2
IMP4
KCNK5
INAFM1
ITGA2
LLGL1
LAX1
HNRNPUL1
GSR
KNSTRN
HSPE1-MOB4
SLC7A5
FOXJ3
GRB2
CLUAP1
DEFB105A
DDX3X
DDX19B
DCTN4
AGO2
CSE1L
CRISPLD2
COL1A1
CLPB
DNAL1
CLIC5
CHERP
CES2
CDC27
CCS
CCPG1
CAPZA2
CAPZA1
DEFB105B
DOT1L
GPC4
FBXL18
GLG1
GK5
GJD3
GAPVD1
FSCN1
FHL2
FGF19
FBXO47
FAM241A
E2F2
FAM229B
FAM153B
FAM151B
EXTL3
ETNK1
ENPP5
KLHL15
EIF5AL1
MEX3A
OTUB1
MKI67
MOB4
TRAF3IP1
TNPO2
TMOD3
P2RY2
TMEM44
TMEM151A
TLN1
TFDP2
TEF
TAF8
SYAP1
SULT1B1
SSBP2
SRF
SPRYD4
SP2
SORD
SON
SMYD1
TRAF7
TRAPPC2
UTP6
ZNF417
CNBP
TUBB2A
ZSWIM4
ZNRF3
ZNF860
ZNF8
ZNF641
ZNF556
ZNF347
VHL
ZNF195
ZMAT3

In [10]:
# Save genes and gene score columns to csv
ranked_genes[['Gene Symbol', 'gene_score']].to_csv('final_ranked_genes.csv', index=False)