In [None]:
import pickle
import re

import numpy as np
import pandas as pd

In [None]:
# Ordered list of targets taken from the 'Proteins' column; later cells look these
# values up as gene symbols and use this order for the final output.
Target_list = (
    pd.read_csv('../preprocessed_data/Target_list_selected.csv')
    .loc[:, 'Proteins']
    .tolist()
)

In [None]:
def fasta_to_dict(file_path):
    """Parse a FASTA file with pipe-delimited headers into ``{accession: sequence}``.

    Header lines start with ``>`` and are split on ``|``; the second field
    (``>db|ACCESSION|...``) becomes the dictionary key. All following
    non-header lines are concatenated (surrounding whitespace stripped) into
    that record's sequence.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        dict mapping accession to the full sequence string. If an accession
        appears more than once, the last record replaces the earlier one.
        Records whose header has no ``|`` (so no accession can be extracted)
        are skipped.
    """
    chunks_by_key = {}
    current_key = None
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                parts = line.split('|')
                if len(parts) >= 2:
                    current_key = parts[1]
                    chunks_by_key[current_key] = []
                else:
                    # Unparseable header: forget the previous key, otherwise this
                    # record's residues would be appended to the previous entry.
                    current_key = None
            elif current_key:
                chunks_by_key[current_key].append(line)
    # Join once per record instead of growing a string line by line.
    return {key: ''.join(chunks) for key, chunks in chunks_by_key.items()}

# accession -> sequence lookup built from the UniProtKB FASTA export.
fasta_dict = fasta_to_dict(
    file_path="../rawdata/uniprotkb_AND_model_organism_9606_2025_04_01.fasta"
)

In [None]:
# Target annotation table with 'Gene symbol', 'Ensembl id', 'Protein name' and
# 'UniProt id' columns.
df_final_protein = pd.read_csv('../preprocessed_data/Target_map.csv', index_col=None)

# Gene symbol -> Ensembl id. If a symbol occurs on several rows, the last row wins.
df_final_protein_dict = {
    symbol: ensembl_id
    for symbol, ensembl_id in zip(df_final_protein['Gene symbol'], df_final_protein['Ensembl id'])
}
# Gene symbol -> protein name, with the same last-row-wins behaviour.
df_final_protein_name_dict = {
    symbol: protein_name
    for symbol, protein_name in zip(df_final_protein['Gene symbol'], df_final_protein['Protein name'])
}

# Distinct (gene symbol, UniProt id) pairs with both values present.
df_final_protein_uniprot = (
    df_final_protein.loc[:, ['Gene symbol', 'UniProt id']]
    .copy()
    .dropna()
    .drop_duplicates()
)
# Keep only symbols that map to exactly one UniProt id; ambiguous symbols are dropped.
df_final_protein_uniprot_cleaned = df_final_protein_uniprot.loc[
    df_final_protein_uniprot.groupby('Gene symbol')['Gene symbol'].transform('size').eq(1)
].reset_index(drop=True)

df_final_protein_uniprot_dict = {
    symbol: uniprot_id
    for symbol, uniprot_id in zip(
        df_final_protein_uniprot_cleaned['Gene symbol'],
        df_final_protein_uniprot_cleaned['UniProt id'],
    )
}

In [None]:
# UniProtKB table export (tab-separated) matching the FASTA file loaded earlier.
ref_df = pd.read_csv(
    '../rawdata/uniprotkb_AND_model_organism_9606_2025_04_01.tsv',
    sep='\t',
)

# Independent copies per curation status, so later column additions do not touch ref_df.
ref_df_r = ref_df.loc[ref_df['Reviewed'].eq('reviewed')].copy()
ref_df_u = ref_df.loc[ref_df['Reviewed'].eq('unreviewed')].copy()

def raw_df_to_ensd_dict(raw_df):
    """Build an ``{ENSG id: Entry}`` lookup from a table with 'Entry' and 'Ensembl' columns.

    Every ``ENSG<digits>`` token found in the 'Ensembl' cell of a row is paired
    with that row's 'Entry'. An ENSG id that ends up paired more than once
    (i.e. claimed by several rows) is ambiguous and is left out entirely.

    Note: this adds (or overwrites) an ``ID`` column on ``raw_df`` in place,
    holding the list of ENSG ids extracted for each row.

    Args:
        raw_df: DataFrame with at least the columns 'Entry' and 'Ensembl'.

    Returns:
        dict mapping each unambiguous ENSG id to its 'Entry' value.
    """
    def _gene_ids(cell):
        # Missing cells contribute no ids; otherwise collect each distinct id once.
        if pd.isna(cell):
            return []
        return list(set(re.findall(r'(ENSG\d+)', str(cell))))

    raw_df['ID'] = raw_df['Ensembl'].apply(_gene_ids)

    # One row per (Entry, ENSG id); rows without any id become NaN and are dropped.
    entry_id_pairs = raw_df.explode('ID').reset_index(drop=True)[['Entry', 'ID']].dropna()

    # Number of rows sharing each ENSG id, aligned to entry_id_pairs.
    hits_per_id = entry_id_pairs.groupby('ID')['ID'].transform('size')
    unambiguous_pairs = entry_id_pairs[hits_per_id == 1].reset_index(drop=True)

    return dict(zip(unambiguous_pairs['ID'], unambiguous_pairs['Entry']))

# ENSG id -> UniProt entry, built separately for reviewed and unreviewed entries.
# Each call also adds an 'ID' column to the frame it is given.
ensg_dict_r = raw_df_to_ensd_dict(raw_df=ref_df_r)
ensg_dict_u = raw_df_to_ensd_dict(raw_df=ref_df_u)

In [None]:
# Reviewed entries only: turn the whitespace-separated 'Gene Names' cell into a list of names.
ref_df_r_gene = ref_df_r.loc[:, ['Entry', 'Gene Names']].copy()
ref_df_r_gene["Gene Names"] = ref_df_r_gene["Gene Names"].str.split()

# One row per (Entry, gene name); rows without a name and repeated pairs are removed.
exploded_df = (
    ref_df_r_gene
    .explode("Gene Names", ignore_index=True)
    .dropna()
    .drop_duplicates()
)

# A gene name claimed by more than one entry is ambiguous and is dropped.
exploded_df_cleaned = exploded_df.loc[
    exploded_df.groupby('Gene Names')['Gene Names'].transform('size').eq(1)
].reset_index(drop=True)

gene_to_uniprotID_dict = {
    gene_name: entry
    for gene_name, entry in zip(exploded_df_cleaned['Gene Names'], exploded_df_cleaned['Entry'])
}

In [None]:
# Resolve every target to a protein sequence. Candidate accessions are tried in
# priority order and the first one present in fasta_dict wins:
#   1. the UniProt id recorded for the symbol in Target_map.csv (unambiguous symbols only)
#   2. the reviewed UniProt entry that uniquely lists the symbol among its gene names
#   3. the reviewed UniProt entry uniquely linked to the symbol's Ensembl gene id
#   4. the unreviewed UniProt entry uniquely linked to the symbol's Ensembl gene id
# Explicit .get lookups replace the nested bare `except:` blocks, which swallowed
# every exception (including KeyboardInterrupt) and could hide real errors.
Gene_Symbol_to_Fasta = {}

for target in Target_list:
    ensembl_id = df_final_protein_dict.get(target)
    candidate_accessions = (
        df_final_protein_uniprot_dict.get(target),
        gene_to_uniprotID_dict.get(target),
        ensg_dict_r.get(ensembl_id),
        ensg_dict_u.get(ensembl_id),
    )
    # A failed lookup yields None, which is never a key of fasta_dict.
    sequence = next(
        (fasta_dict[accession] for accession in candidate_accessions if accession in fasta_dict),
        None,
    )
    if sequence is not None:
        Gene_Symbol_to_Fasta[target] = sequence
    else:
        # Report the unresolved target. .get keeps this line from raising KeyError
        # (and aborting the loop) when the target is missing from Target_map.csv.
        print(target, '----------', ensembl_id, '----------', df_final_protein_name_dict.get(target))

In [None]:
# Hand-curated assignments: each symbol is given the sequence of an explicitly chosen
# accession. A missing accession raises KeyError rather than being skipped.
for gene_symbol, uniprot_accession in [
    ('CDKN2A', 'Q8N726'),
    ('COX2', 'P00403'),
    ('CUX1', 'P39880'),
    ('ERCC6', 'Q03468'),
    ('GSTT2', 'P0CG30'),
]:
    Gene_Symbol_to_Fasta[gene_symbol] = fasta_dict[uniprot_accession]

# MIR132 gets a literal sequence instead of an accession lookup.
# NOTE(review): the origin of this sequence is not recorded here - confirm and cite it.
Gene_Symbol_to_Fasta['MIR132'] = (
    'MRGGGFGDRDRDRDRGGFGARGGSGLPPKKFGNPGERLRKKKWDLSELPKFEKNFYVEHP'
    'EVARLTPYEVDELRRKKEITVRGGDVCPKPVFAFHHANFPQYVMDVLMDQHFTEPTPIQC'
    'QGFPLALSGRDMVGIAQTGSGKTLAYLLPAIVHINHQPYLERGDGPICLVLAPTRELAQQ'
    'VQQVADDYGKCSRLKSTCIYGGAPKGPQIRDLERGVEICIATPGRLIDFLESGKTNLRRC'
    'TYLVLDEADRMLDMGFEPQIRKIVDQIRPDRQTLMWSATWPKEVRQLAEDFLRDYTQINV'
    'GNLELSANHNILQIVDVCMESEKDHKLIQLMEEIMAEKENKTIIFVETKRRCDDLTRRMR'
    'RDGWPAMCIHGDKSQPERDWVLNEFRSGKAPILIATDVASRGLDVEDVKFVINYDYPNSS'
    'EDYVHRIGRTARSTNKGTAYTFFTPGNLKQARELIKVLEEANQAINPKLMQLVDHRGGGG'
    'GGGGRSRYRTTSSANNPNLMYQDECDRRLRGVKDGGRRDSTSYRDRSETDRASYANGSGY'
    'GSPNSAFGAQAGQYTYAQGTYGAAAYGTSGYTAQEYAAGTYGASSTASAGRSSQSSSQQF'
    'SGIGRSGQQPQPLMSQQFAQPPGATNMIGYMGQTAYQYPPPPPPPPPSRK'
)

for gene_symbol, uniprot_accession in [
    ('MST1', 'P26927'),
    ('NRXN2', 'P58401'),
    ('POLR1D', 'P0DPB6'),
    ('POU5F1P3', 'D5K9S6'),
    ('RABGAP1L', 'Q5R372'),
    ('SEM1', 'P60896'),
    ('TMPO', 'P42166'),
]:
    Gene_Symbol_to_Fasta[gene_symbol] = fasta_dict[uniprot_accession]

In [None]:
# Re-key the results in Target_list order (despite the name, this is not an alphabetical
# sort). Targets that never received a sequence are left out of the output.
sorted_Gene_Symbol_to_Fasta = {k: Gene_Symbol_to_Fasta[k] for k in Target_list if k in Gene_Symbol_to_Fasta}

# Persist the gene symbol -> sequence mapping for downstream use.
# pickle is imported with the other imports in the first cell.
with open("../preprocessed_data/Gene_Symbol_to_Fasta.pkl", "wb") as f:
    pickle.dump(sorted_Gene_Symbol_to_Fasta, f)