In [None]:
import pandas as pd

In [None]:
# Input tables, both read relative to the notebook's working directory.
# gene_targets is scored row by row further down; results_df supplies the
# 'padj' column that the miRNA ranking is built from.
gene_targets = pd.read_csv('./gene_targets.csv')
results_df = pd.read_csv('./deseq2_feature_ranking.csv')

In [None]:
# Rank the rows of the results table by adjusted p-value. With rank()'s
# defaults the smallest padj gets rank 1, tied values share the average of
# their ranks, and a missing padj keeps a NaN rank.
padj_ranks = results_df['padj'].rank()
results_df['miRNA_rank'] = padj_ranks

# Lookup table used by the gene scorer.
# NOTE(review): to_dict() keys this by results_df's index labels, while the
# scorer looks up cell values taken from gene_targets -- confirm the index of
# results_df holds the same identifiers as those cells.
miRNA_ranks = padj_ranks.to_dict()

In [None]:
# Number of rows in the results table; the denominator that normalises each
# rank in the gene score.
total_miRNA = results_df.shape[0]

*Formula*:
$$\text{score}_{\text{gene}} = \frac{1}{t}\sum_{i=1}^{t}\left(1 - \frac{\text{rank}_{\text{miRNA}}^{i}}{\text{total}_{\text{miRNA}}}\right)$$

*Explanation*
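- $t$: the number of ranked miRNAs that target the gene
- $\text{rank}_{\text{miRNA}}^{i}$: the rank, by adjusted p-value (`padj`), of the $i$-th miRNA that targets the gene (rank 1 = smallest `padj`)
- $\text{total}_{\text{miRNA}}$: the total number of miRNAs in the DESeq2 ranking (`len(results_df)`), not only those that target the gene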

***High Gene Score***:
- Indicates that the gene interacts with miRNAs that are highly ranked (i.e., have a low padj value).
- A high score suggests that the gene is likely to be regulated by miRNAs that are significant in the context of differential expression analysis.
- Thus, genes with high scores are potentially more important or relevant to the biological conditions being studied.

***Low Gene Score***:
- Indicates that the gene interacts with miRNAs that are lower ranked (i.e., have a higher padj value).
- A low score suggests that the gene is less likely to be regulated by significant miRNAs.
- Such genes may be of lower priority for further investigation.

In [None]:
def calculate_gene_score(row, miRNA_ranks, total_miRNA, first_miRNA_col=4):
    """Average the rank-based scores of the ranked miRNAs listed in one gene row.

    Every cell of ``row`` from position ``first_miRNA_col`` onward is treated
    as a miRNA identifier. A cell contributes ``1 - rank / total_miRNA`` when
    it is not missing and is a key of ``miRNA_ranks``; any other cell is
    skipped.

    Parameters
    ----------
    row : pandas.Series
        One row of the gene-target table.
    miRNA_ranks : dict
        Maps a miRNA identifier, as it appears in the row's cells, to its rank.
    total_miRNA : int
        Denominator used to normalise each rank.
    first_miRNA_col : int, default 4
        Position of the first miRNA cell in ``row``. The default keeps the
        original assumption that the miRNAs start at the 5th column.

    Returns
    -------
    float
        Mean contribution over the matched cells, or ``0.0`` when no cell
        matched. A NaN rank is not filtered out, so a single matched miRNA
        with a NaN rank makes the whole score NaN.
    """
    # Read the cells by position rather than by column label: a label lookup
    # returns a Series (and breaks the checks below) when labels are duplicated.
    contributions = [
        1 - (miRNA_ranks[target] / total_miRNA)
        for target in row.iloc[first_miRNA_col:]
        if pd.notna(target) and target in miRNA_ranks
    ]
    if not contributions:
        return 0.0
    return sum(contributions) / len(contributions)

In [None]:
# Score every gene row. A 'gene_score' column left behind by a previous run of
# this cell is dropped from the scorer's input first: the scorer scans every
# column from the 5th onward, so on a re-run the old scores would otherwise be
# treated as one more miRNA cell.
score_inputs = gene_targets.drop(columns='gene_score', errors='ignore')
gene_targets['gene_score'] = score_inputs.apply(calculate_gene_score, axis=1, miRNA_ranks=miRNA_ranks, total_miRNA=total_miRNA)

# Sort genes based on their scores in descending order
ranked_genes = gene_targets.sort_values(by='gene_score', ascending=False)

# Reorder columns so that 'gene_score' is the second column, right after
# 'Gene Symbol'; all remaining columns keep their relative order.
columns = ['Gene Symbol', 'gene_score'] + [col for col in ranked_genes.columns if col not in ['Gene Symbol', 'gene_score']]
ranked_genes = ranked_genes[columns]


# Save ranked genes to a CSV file
ranked_genes.to_csv('ranked_genes.csv', index=False)

print(ranked_genes.shape)
# Display top ranked genes
print(ranked_genes.head(10))

In [42]:
# Print every gene that has a defined score as "<symbol>\t<score>", in the
# order of ranked_genes (best score first). Note this is the full list, not a
# top-N slice.
# Columns 0 and 1 of ranked_genes are the gene symbol and its gene_score.
gene_symbols = ranked_genes.iloc[:, 0]
gene_scores = ranked_genes.iloc[:, 1]
for gene_symbol, gene_score in zip(gene_symbols, gene_scores):
    # Skip genes whose score is NaN.
    if pd.notna(gene_score):
        print(gene_symbol, gene_score, sep = '\t')

CNBP	0.9962962962962963
MKNK2	0.9962962962962963
CDCP1	0.9828460038986355
KLHL21	0.9828460038986355
ZNF566	0.9828460038986355
SH3TC2	0.9828460038986355
C12orf49	0.9828460038986355
TMEM167A	0.9828460038986355
ZNF207	0.9828460038986355
ATXN7L3B	0.9828460038986355
HSD17B12	0.9828460038986355
FAM83F	0.9828460038986355
ZNF394	0.9828460038986355
MAZ	0.9828460038986355
KLHDC10	0.9828460038986355
SMC1A	0.9828460038986355
TAF8	0.9818713450292398
PRKAR2A	0.9818713450292398
TMEM2	0.9818713450292398
PIGO	0.9818713450292398
KIAA0513	0.9818713450292398
GAPVD1	0.9818713450292398
TRIM71	0.9818713450292398
GSR	0.9818713450292398
DDX19B	0.9818713450292398
NF2	0.9818713450292398
WEE1	0.9801169590643275
MAPK1	0.9801169590643275
PLAGL2	0.9801169590643275
TFAM	0.9744639376218324
PYGO1	0.9744639376218324
SLC6A8	0.9744639376218324
NIPA1	0.9744639376218324
GDI1	0.9744639376218324
BCL2L2	0.9744639376218324
PDGFRA	0.9744639376218324
HINT1	0.9744639376218324
CACUL1	0.9744639376218324
PPM1F	0.9744639376218324
PTAR