In [1]:
from Bio import SeqIO

# Build two positional indexes over every k-mer of every sequence in the FASTA file.
#
#   prefix_dict[kmer] -> list of (protein_id, gene_name, len(sequence) - i)
#       i.e. the number of residues from the first residue of the k-mer to the
#       end of the sequence.
#   suffix_dict[kmer] -> list of (protein_id, gene_name, i + KMER_LENGTH)
#       i.e. the number of residues from the start of the sequence through the
#       last residue of the k-mer.
#
# Worked example with KMER_LENGTH = 4:
#   PEPTIDE
#   0123456
#   "PTID" starts at i = 2  ->  prefix value 7 - 2 = 5, suffix value 2 + 4 = 6
#
# Every 'I' is rewritten to 'L' before indexing, so both letters share one key.
KMER_LENGTH = 4

prefix_dict = {}
suffix_dict = {}

file_path = "uniprotkb_human_proteome_UP000005640_with_isoforms_2024-10-08.fasta"

# First pass over the file only counts records, so the second pass can report
# progress as a percentage.
total_sequences = sum(1 for _ in SeqIO.parse(file_path, "fasta"))
processed_sequences = 0

for record in SeqIO.parse(file_path, "fasta"):
    sequence = str(record.seq).replace('I', 'L')

    # The protein ID is the second '|'-separated field of the record id.
    # Fall back to the whole id when the header has no '|' instead of crashing.
    id_fields = record.id.split('|')
    protein_id = id_fields[1] if len(id_fields) > 1 else record.id

    # The gene name is the first whitespace-delimited token after "GN=".
    # A missing tag, or a tag with nothing after it, yields "Unknown".
    gene_tokens = record.description.split("GN=")[1].split() if "GN=" in record.description else []
    gene_name = gene_tokens[0] if gene_tokens else "Unknown"

    # Sequences shorter than KMER_LENGTH give an empty range and add nothing.
    for i in range(len(sequence) - KMER_LENGTH + 1):
        kmer = sequence[i:i + KMER_LENGTH]
        prefix_dict.setdefault(kmer, []).append((protein_id, gene_name, len(sequence) - i))
        suffix_dict.setdefault(kmer, []).append((protein_id, gene_name, i + KMER_LENGTH))

    # Update progress; "\r" keeps the counter on a single, overwritten line.
    processed_sequences += 1
    progress = (processed_sequences / total_sequences) * 100
    print(f"Processing progress: {progress:.2f}%", end="\r")

# Terminate the "\r" progress line before the final message; otherwise the
# shorter message overwrites only its first characters and the tail of the
# percentage is left dangling after it.
if processed_sequences:
    print()
print("Processing complete.")

Processing complete. 100.00%


In [2]:
# Sanity check: show the first five k-mers (in insertion order) of suffix_dict,
# each with at most its first five (protein_id, gene_name, position) entries.
example_kmers = list(suffix_dict)[:5]
for kmer in example_kmers:
    # One print call emits the k-mer line, the entries line and a blank separator.
    print(
        f"Kmer: {kmer}",
        f"Proteins: {suffix_dict[kmer][:5]}",
        "",
        sep="\n",
    )

Kmer: MDAA
Proteins: [('A0A087WV00', 'DGKI', 4), ('A6NMZ7', 'COL6A6', 1967), ('O14901', 'KLF11', 90), ('O15085', 'ARHGEF11', 642), ('O43303', 'CCP110', 880)]

Kmer: DAAG
Proteins: [('A0A087WV00', 'DGKI', 5), ('A0A0C3SFZ9', 'FCHO1', 407), ('A6NED2', 'RCCD1', 177), ('E7EUJ1', 'LIPC', 197), ('E7EUK6', 'LIPC', 136)]

Kmer: AAGR
Proteins: [('A0A087WV00', 'DGKI', 6), ('A0A0B4J1S7', 'PTPN22', 436), ('A1L4H1', 'SSC5D', 522), ('A5YKK6', 'CNOT1', 1870), ('A6NC98', 'CCDC88B', 823)]

Kmer: AGRG
Proteins: [('A0A087WV00', 'DGKI', 7), ('A0AV96', 'RBM47', 406), ('A6NFY7', 'SDHAF1', 24), ('A7E2V4', 'ZSWIM8', 1455), ('A8MQ03', 'CYSRT1', 111)]

Kmer: GRGC
Proteins: [('A0A087WV00', 'DGKI', 8), ('A4D0S4', 'LAMB4', 1077), ('A8CG34', 'POM121C', 28), ('B2RXH2', 'KDM4E', 451), ('O60307', 'MAST3', 25)]



In [None]:
import pickle

# Save both k-mer indexes to a single pickle file, as one dict keyed by the
# name of each index. Previously only prefix_dict was written even though the
# message below reports "Dictionaries", so suffix_dict was silently lost.
# Readers that only look up ["prefix_dict"] are unaffected by the extra key.
#
# NOTE: pickle can execute arbitrary code on load; only unpickle this file
# when it comes from a trusted source.
output_path = "Human_proteome_dictionary_I_replaced_by_L.pkl"
with open(output_path, "wb") as file:
    pickle.dump({"prefix_dict": prefix_dict, "suffix_dict": suffix_dict}, file)

# Report the path that was actually written, so the message cannot drift
# from the file name.
print(f"Dictionaries saved to {output_path}")

Dictionaries saved to dictionaries.pkl
