In [34]:
import math
import re
from collections import Counter

import requests

# Config
# UniProt accessions to fetch and analyse. Despite the variable name, these
# are accession IDs (each one is passed to `protein(accession)` below), not
# amino-acid sequences.
protein_sequences = ["P69905", "P68871", "P0A6F5", "P0DTD1", "P00734", "P0A8V2"]

In [35]:
# Use the UniProt API to download a protein's FASTA record

def getProtSeq(accession, timeout=30):
    """Download the FASTA record for a UniProt accession.

    Args:
        accession: UniProt accession string, e.g. "P69905".
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text (the raw FASTA record).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    url = f"https://www.uniprot.org/uniprot/{accession}.fasta"
    # Without an explicit timeout, requests may wait indefinitely on a
    # stalled connection and hang the notebook cell.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of handing an error page to the
    # FASTA parser, which would silently produce an empty record.
    response.raise_for_status()
    return response.text

# sample
# print(getProtSeq("P69905"))


In [36]:
def format_fasta(seq):
    """Parse a single UniProt FASTA record into header fields and sequence lines.

    Args:
        seq: Raw FASTA text; the first line is expected to be the ``>`` header.

    Returns:
        A dict with two keys:

        * ``"header"``: dict with keys ``accession``, ``name``,
          ``description``, ``organism``, ``ox``, ``gene``, ``pe`` and ``sv``
          (all strings). ``gene``, ``pe`` and ``sv`` are ``None`` when the
          header does not carry that field. The dict is empty when the header
          line does not follow the ``>sp|...`` / ``>tr|...`` layout.
        * ``"sequence"``: list of the non-blank sequence lines, in order.
          Empty when the input is empty or does not start with ``>``.
    """
    data = {"header": {}, "sequence": []}

    lines = seq.splitlines() if seq else []
    # Anything that does not start with a FASTA header (empty text, an error
    # page, ...) is treated as "no record" rather than parsed as residues.
    if not lines or not lines[0].startswith(">"):
        return data

    # GN=, PE= and SV= are optional so that records without a gene name
    # still yield a parsed header.
    header = (
        r'^>(sp|tr)\|([A-Z0-9]+)\|([A-Z0-9_]+)\s+(.+?)\s+OS=(.+?)\s+OX=(\d+)'
        r'(?:\s+GN=(\S+))?(?:\s+PE=(\d+))?(?:\s+SV=(\d+))?'
    )
    # Match against the header line only, so `\s+` can never run across a
    # line break into the sequence.
    filtered_seq = re.match(header, lines[0])
    if filtered_seq:
        fields = ("accession", "name", "description", "organism",
                  "ox", "gene", "pe", "sv")
        # Group 1 is the database tag (sp/tr), which is not stored.
        data["header"] = dict(zip(fields, filtered_seq.groups()[1:]))

    # Keep the sequence even when the header layout is unrecognised.
    data["sequence"] = [line.strip() for line in lines[1:] if line.strip()]
    return data

# format_fasta(getProtSeq("P69905"))


In [None]:
class protein:
    """A protein identified by accession, with its parsed FASTA header and sequence.

    Attributes:
        accession: The accession string given at construction.
        sequence: ``None`` until ``fetch_sequence`` runs, then the
            ``"sequence"`` value returned by ``format_fasta``.
        header: ``None`` until ``fetch_sequence`` runs, then the
            ``"header"`` dict returned by ``format_fasta``.
    """

    def __init__(self, accession):
        self.accession = accession
        self.sequence = None
        self.header = None

    def fetch_sequence(self):
        """Download and parse the FASTA record, filling ``sequence`` and ``header``."""
        fasta_data = getProtSeq(self.accession)
        formatted_data = format_fasta(fasta_data)
        self.sequence = formatted_data["sequence"]
        self.header = formatted_data["header"]

    def _residues(self):
        """Return the sequence as one string; empty if nothing has been fetched."""
        # `or ()` lets the analysis methods work before fetch_sequence() has run.
        return ''.join(self.sequence or ()).replace("\n", "")

    def __str__(self):
        # The header is None before fetching and may lack a description, so
        # fall back to a placeholder instead of raising.
        description = (self.header or {}).get("description")
        if description is None:
            description = "unknown"
        return f"Protein {self.accession} - {description}"

    def getNmer(self, n):
        """Return every overlapping substring of length ``n``, in order.

        Args:
            n: Length of each n-mer; must be at least 1.

        Returns:
            List of n-mers; empty when the sequence is shorter than ``n``.

        Raises:
            ValueError: If ``n`` is smaller than 1.
        """
        if n < 1:
            raise ValueError("n must be a positive integer")
        seq = self._residues()
        return [seq[i:i + n] for i in range(len(seq) - n + 1)]

    def getShanonEntropy(self):
        """Return the Shannon entropy (in bits) of the residue distribution.

        Computed as ``-sum(p * log2(p))`` over the relative frequency ``p``
        of each distinct character in the sequence.

        Returns:
            The entropy as a float; ``0.0`` for an empty sequence.
        """
        seq = self._residues()
        if not seq:
            return 0.0
        total = len(seq)
        counts = Counter(seq)
        return sum(
            -(count / total) * math.log2(count / total)
            for count in counts.values()
        )

    def findMotif(self, motif=None):
        """Print every non-overlapping match of a regex motif in the sequence.

        Args:
            motif: Regular expression to search for. When omitted, the
                pattern is read interactively via ``input()``.

        Positions are 0-based; the end position is exclusive. An invalid
        pattern raises ``re.error``.
        """
        if motif is None:
            motif = input("Enter the regex motif to search for: ")

        seq = self._residues()
        for match in re.finditer(motif, seq):
            print(f"Match found at position {match.start()} to {match.end()}: {match.group()}")

        return


In [38]:
protein_objects = [] # One fetched `protein` instance per configured accession
for accession in protein_sequences:
    prot = protein(accession)
    # Downloads and parses the FASTA record for this accession.
    prot.fetch_sequence()
    protein_objects.append(prot)

In [39]:

# Example usage: show the parsed header and the sequence lines of the first protein
print(protein_objects[0].header)
print(protein_objects[0].sequence)


{'accession': 'P69905', 'name': 'HBA_HUMAN', 'description': 'Hemoglobin subunit alpha', 'organism': 'Homo sapiens', 'ox': '9606', 'gene': 'HBA1', 'pe': '1', 'sv': '2'}
['MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHG', 'KKVADALTNAVAHVDDMPNALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTP', 'AVHASLDKFLASVSTVLTSKYR']


### Get the n-mers of each sequence (set the n-mer length below)

In [40]:

# Length of each n-mer (overlapping substring) to extract.
nMer_length = 3

# For every fetched protein, print its summary line followed by all of its
# overlapping n-mers of the chosen length.
for prot in protein_objects:
    print(prot)
    print(prot.getNmer(nMer_length))
    print("\n")

Protein P69905 - Hemoglobin subunit alpha
['MVL', 'VLS', 'LSP', 'SPA', 'PAD', 'ADK', 'DKT', 'KTN', 'TNV', 'NVK', 'VKA', 'KAA', 'AAW', 'AWG', 'WGK', 'GKV', 'KVG', 'VGA', 'GAH', 'AHA', 'HAG', 'AGE', 'GEY', 'EYG', 'YGA', 'GAE', 'AEA', 'EAL', 'ALE', 'LER', 'ERM', 'RMF', 'MFL', 'FLS', 'LSF', 'SFP', 'FPT', 'PTT', 'TTK', 'TKT', 'KTY', 'TYF', 'YFP', 'FPH', 'PHF', 'HFD', 'FDL', 'DLS', 'LSH', 'SHG', 'HGS', 'GSA', 'SAQ', 'AQV', 'QVK', 'VKG', 'KGH', 'GHG', 'HGK', 'GKK', 'KKV', 'KVA', 'VAD', 'ADA', 'DAL', 'ALT', 'LTN', 'TNA', 'NAV', 'AVA', 'VAH', 'AHV', 'HVD', 'VDD', 'DDM', 'DMP', 'MPN', 'PNA', 'NAL', 'ALS', 'LSA', 'SAL', 'ALS', 'LSD', 'SDL', 'DLH', 'LHA', 'HAH', 'AHK', 'HKL', 'KLR', 'LRV', 'RVD', 'VDP', 'DPV', 'PVN', 'VNF', 'NFK', 'FKL', 'KLL', 'LLS', 'LSH', 'SHC', 'HCL', 'CLL', 'LLV', 'LVT', 'VTL', 'TLA', 'LAA', 'AAH', 'AHL', 'HLP', 'LPA', 'PAE', 'AEF', 'EFT', 'FTP', 'TPA', 'PAV', 'AVH', 'VHA', 'HAS', 'ASL', 'SLD', 'LDK', 'DKF', 'KFL', 'FLA', 'LAS', 'ASV', 'SVS', 'VST', 'STV', 'TVL', 'VLT', 'LTS'

In [None]:
# Example: search the first protein's sequence for a regex motif.
prot_seq = protein_objects[0]
prot_seq.findMotif() # Prompts for the regex; the output below was produced with: P.{3}


Match found at position 4 to 8: PADK
Match found at position 37 to 41: PTTK
Match found at position 44 to 48: PHFD
Match found at position 77 to 81: PNAL
Match found at position 95 to 99: PVNF
Match found at position 114 to 118: PAEF
Match found at position 119 to 123: PAVH
