In [1]:
import re

original = "../../datasets/doggy_data/test_ft_output_1000"

# Gather the text inside the first <...> tag of every line that carries one,
# then show the distinct tag values found in the file.
with open(original, 'r') as file:
    lines = file.readlines()
    tag_matches = (re.search(r'<([^>]+)>', text) for text in lines)
    tags = [found.group(1) for found in tag_matches if found]

print(set(tags))



{'heavy', 'light_kappa', 'light_lambda'}


In [3]:
import requests

class annotate():
    """
    Annotate a VH or VL amino acid sequence with its FR and CDR regions.

    Initiator `__init__` has 2 parameters:

    :param aaseq: STRING: A single-letter, amino acid sequence corresponding to the complete VH or VL chain. Both uppercase and lowercase are accepted.

    :param scheme: STRING: "kabat", "chothia", "contact", "imgt", or "martin". Must be in lowercase

    Class has 3 methods. `retrieve()` fetches the numbered sequence from the Abnum website, passes it to
    `analyze()` to split it into FR and CDR regions, and then to `output()` to print the result and return a
    list of 2 dictionaries: the first holds region:seq pairs, the second holds number:residue pairs.

    NOTE(review): `retrieve()` requests "-k" numbering only for "kabat"; every other accepted scheme requests
    "-c" numbering, and `analyze()` handles "martin" (and any unknown scheme) with the IMGT boundary set.
    Confirm this is the intended treatment of "martin".
    """

    # Seconds to wait for the Abnum server before giving up on a request, so a
    # single stalled connection cannot hang a long batch run.
    REQUEST_TIMEOUT = 60

    # Region order shared by `analyze()` and `output()`.
    _REGION_NAMES = ("FR1", "CDR1", "FR2", "CDR2", "FR3", "CDR3", "FR4")

    # For each scheme, the residue numbers at which CDR1, FR2, CDR2, FR3, CDR3
    # and FR4 begin. FR1 always begins at the first token of the numbered list.
    _L_BOUNDARIES = {
        "kabat": (24, 35, 50, 57, 89, 98),
        "chothia": (24, 35, 50, 57, 89, 98),
        "contact": (30, 37, 46, 56, 89, 97),
        "imgt": (27, 33, 50, 52, 89, 98),
    }
    _H_BOUNDARIES = {
        "kabat": (31, 36, 50, 66, 95, 103),
        "chothia": (26, 33, 52, 57, 95, 103),
        "contact": (30, 36, 47, 59, 93, 102),
        "imgt": (26, 34, 51, 58, 93, 103),
    }

    def __init__(self, aaseq, scheme):

        self.aaseq = aaseq
        self.scheme = scheme

    def __repr__(self):
        return "Annotation of VH or VL sequence using Kabat, Chothia, Contact, or IMGT scheme"

    def output(self, chain, lst, regionlst):
        """
        Prints the FR and CDR regions and their corresponding seq. It returns a `list` of 2 `dict`.

        :param chain: STRING, either "H" or "L" in uppercase (anything other than "L" is treated as "H")
        :param lst:  LIST, alternating residue numbers and residues, e.g. ["H1", "E", "H2", "V", ...]
        :param regionlst: LIST, a list of 7 peptides, each corresponds to a FR or CDR region
        :return: LIST, a list of 2 `dict`, The first dict consists of region: seq pairs. The second dict consists of number:residue pairs.

        """

        self.chain = chain
        self.lst = lst
        self.regionlst = regionlst

        self.regiondict, self.numberdict = {}, {}

        # Tokens alternate number, residue: pair each number with the residue after it.
        for i in range(0, len(self.lst), 2):
            self.numberdict[self.lst[i]] = self.lst[i + 1]

        # Any scheme other than the three named ones is reported as IMGT.
        scheme_titles = {"kabat": "Kabat", "chothia": "Chothia", "contact": "Contact"}
        print("Annotation in " + scheme_titles.get(self.scheme, "IMGT") + " scheme:")

        prefix = "L" if self.chain == "L" else "H"
        labels = [prefix + "-" + name for name in self._REGION_NAMES]

        # Pad "X-FR1:" / "X-CDR1:" to a common width so the peptides line up.
        for position, label in enumerate(labels):
            print((label + ":").ljust(8), self.regionlst[position])

        for label, seq in zip(labels, self.regionlst):
            self.regiondict[label] = seq

        return [self.regiondict, self.numberdict]

    def analyze(self, chain, lst):
        """
        Define CDR and FR regions based on the numbered sequence returned from website

        The per-region peptides are also stored on the instance as `L_FR1` ... `L_FR4` or
        `H_FR1` ... `H_FR4`; regions that could not be determined stay as empty strings.

        :param chain: STRING, "H" or "L" in uppercase (anything other than "L" is treated as "H")
        :param lst: LIST, alternating residue numbers and residues in kabat or chothia numbering
        :return: LIST, a list of strings, where each string is a peptide corresponding to the a region, in the order of: FR1, CDR1, FR2, CDR2, FR3, CDR3, FR4.
                 Returns `None` (after printing a message) if a boundary residue number is missing or another error occurs.

        """

        self.chain = chain
        self.lst = lst

        if self.chain == "L":
            prefix, table = "L", self._L_BOUNDARIES
            generic_message = "An error occured"
        else:
            prefix, table = "H", self._H_BOUNDARIES
            generic_message = "An error occured in the `analyze()` method"

        # Schemes without their own entry (e.g. "martin") use the IMGT boundaries.
        boundaries = table.get(self.scheme, table["imgt"])

        for name in self._REGION_NAMES:
            setattr(self, prefix + "_" + name, "")

        try:
            regions = []
            start = 0
            # The final region (FR4) has no closing boundary and runs to the end of the list.
            for name, number in zip(self._REGION_NAMES, boundaries + (None,)):
                if number is None:
                    stop = len(self.lst)
                else:
                    # Raises ValueError when the boundary residue number is absent.
                    stop = self.lst.index(prefix + str(number))
                # Numbers sit at `start`, `start + 2`, ...; the residue follows each number.
                peptide = "".join(self.lst[i + 1] for i in range(start, stop, 2))
                setattr(self, prefix + "_" + name, peptide)
                regions.append(peptide)
                start = stop
            return regions

        except ValueError:
            print("Unable to retrieve complete V region. Make sure the sequence has complete V region")
        except Exception:
            # Exception, not a bare except, so KeyboardInterrupt still propagates.
            print(generic_message)
        return None

    def retrieve(self):
        """
        Retrieve numbered residues from Abnum website

        :return: returns same object from the `output()` method, or `None` (after printing a message) when the
                 request fails, no annotation comes back, or the regions cannot be determined.

        :raises: `ValueError` if input scheme is not among "kabat", "chothia", "contact", "imgt", and "martin"

        """

        self.url = "http://www.bioinf.org.uk/abs/abnum/abnum.cgi"

        if self.scheme not in ["kabat", "chothia", "contact", "imgt", "martin"]:
            raise ValueError("Incorrect scheme mode. Must be one of the following (lowercase): kabat, chothia, contact, imgt, martin")

        # Only Kabat has its own server-side numbering here; all other schemes use "-c".
        self.sche = "-k" if self.scheme == "kabat" else "-c"

        try:
            self.d = {"plain": 1, "scheme": self.sche, "aaseq": self.aaseq}
            self.myPage = requests.get(self.url, params=self.d, timeout=self.REQUEST_TIMEOUT)
            self.text = self.myPage.text
            self.lst = self.text.split()

            if len(self.lst) > 1:
                # The chain letter is the first character of the first residue number.
                self.chain = self.lst[0][0]
                regions = self.analyze(self.chain, self.lst)
                if regions is None:
                    # analyze() already reported the problem; nothing to print or return.
                    return None
                self.result = self.output(self.chain, self.lst, regions)
                return self.result
            else:
                print("No annotation retrieved. Did you enter the complete VH or VL sequence?")
        except Exception:
            # Exception, not a bare except, so Ctrl-C can stop a long batch of requests.
            print("An error occured in the `retrieve()` method")
        return None

In [13]:
import os
import contextlib
from tqdm import tqdm

scheme = "martin"
samples_path = '../../sample_outputs/2024_10_05__18_18_59/t5_baseline_base_ft_full_greedy_samples_processed'

CDR_NAMES = ("CDR1", "CDR2", "CDR3")

misaligned = 0
de_novo_heavy = 0
de_novo_light = 0
de_novo_errors = 0
natural_heavy = 0
natural_light = 0
natural_errors = 0  # natural sequences whose annotation could not be retrieved


with open(samples_path, 'r') as file:
    lines = file.readlines()

# Records are 3 lines long: natural sequence, de novo sequence, and a third
# line that is not read here. Stopping at len(lines) - 1 skips a trailing
# partial record instead of failing on lines[i+1].
for i in tqdm(range(0, len(lines) - 1, 3)):
    seq_natural_original = lines[i].strip()
    seq_de_novo_original = lines[i+1].strip()

    seq_natural = annotate(seq_natural_original, scheme)
    seq_de_novo = annotate(seq_de_novo_original, scheme)

    # retrieve() prints the full annotation; silence it to keep the cell output readable.
    with open(os.devnull, 'w') as devnull:
        with contextlib.redirect_stdout(devnull):
            result_de_novo = seq_de_novo.retrieve()
            result_natural = seq_natural.retrieve()

    # retrieve() returns None on failure; without a reference annotation there
    # is nothing to compare against, so report it and move on.
    if result_natural is None:
        natural_errors += 1
        print("Error in natural sequence")
        print("Sequence:", seq_natural_original)
        continue

    # The natural sequence decides which chain's CDRs are compared.
    chain = 'H' if 'H-CDR1' in result_natural[0] else 'L'
    if chain == 'H':
        natural_heavy += 1
    else:
        natural_light += 1

    try:
        if 'H-CDR1' in result_de_novo[0]:
            de_novo_heavy += 1
        else:
            de_novo_light += 1

        cdr_1_natural, cdr_2_natural, cdr_3_natural = [
            result_natural[0][f"{chain}-{name}"] for name in CDR_NAMES
        ]
        cdr_1_de_novo, cdr_2_de_novo, cdr_3_de_novo = [
            result_de_novo[0][f"{chain}-{name}"] for name in CDR_NAMES
        ]
    except (TypeError, KeyError):
        # TypeError: the de novo annotation failed (result is None).
        # KeyError: the de novo sequence was annotated as the other chain type.
        de_novo_errors += 1
        print("Error in de novo sequence")
        print("Sequence:", seq_de_novo_original)
        misaligned += 1
        continue

    # Check if CDRs are the same
    cdr1_same = cdr_1_de_novo == cdr_1_natural
    cdr2_same = cdr_2_de_novo == cdr_2_natural
    cdr3_same = cdr_3_de_novo == cdr_3_natural

    if not (cdr1_same and cdr2_same and cdr3_same):
        print("Misaligned")
        print("De Novo Sequence:", seq_de_novo_original)
        print("Natural Sequence:", seq_natural_original)
        print("de novo CDRs: ", cdr_1_de_novo, " ", cdr_2_de_novo, " ", cdr_3_de_novo)
        print("natural CDRs: ", cdr_1_natural, " ", cdr_2_natural, " ", cdr_3_natural)
        misaligned += 1

print(misaligned)
print(de_novo_errors)
print(de_novo_heavy)
print(de_novo_light)
print(natural_heavy)
print(natural_light)
if natural_errors:
    print("Natural sequences that could not be annotated:", natural_errors)

 12%|█▏        | 117/1000 [00:52<06:38,  2.22it/s]

Error in de novo sequence
Sequence: EVMMMMMMMMGQMPPPPSLSLSVSPGEPGEPASISISCKASSNGNTFLYWFRQKPGQSPEGLQRLIYKVSNRDAGVPDRFSGSGSGTDFTLTISRVEADDAGLYYCGQGIRDPPMFGQGTKLEIK


 12%|█▏        | 124/1000 [00:58<09:39,  1.51it/s]

Misaligned
De Novo Sequence: SPGLNQPPSVSASLGETATISCTGSSEHNFYIVNWYQQKPGKSPQWLIYVRSNRGDGVPSRFSGSSSGADRYLTISNIKSEDEDYYYCGADYTISGQSCYVFGGGTHLTVL
Natural Sequence: LPVLTQPTNASASLEESVKLTCTLSSEHNFYIVHWYQQQPGKAPRYLMYVRSDGSYKRGDGVPSRFSGSSSGADRYLTISNIKSEDEDDYYYCGADYTISGQSCYVFGGGTHLTVL
de novo CDRs:  SEHNFYI   VR   GADYTISGQSCYV
natural CDRs:  SEHNFYI   VR   CGADYTISGQSCYV


 18%|█▊        | 177/1000 [01:21<06:07,  2.24it/s]

Misaligned
De Novo Sequence: SPGLNQPPSVSASLGETATISCTGSSEHSNYIVNWYQQKPGKSPQWLIYVWSNRGDGVPSRFSGSSSGADRYLTISNIKSEDEDYYYCGTDYRISGQYNGVFGGGTHLTVL
Natural Sequence: LPVLTQPTIASASLEESVKLTCTLSSEHSNYIVHWYRQQPGKAPRYLMCVWSDGTYKRGDGVPSRFSGSSSGADRYLTISNIKSEDEGDYYYCGTDYRISGQYNGVFGGGTQLTVL
de novo CDRs:  SEHSNYI   VW   GTDYRISGQYNGV
natural CDRs:  SEHSNYI   VW   CGTDYRISGQYNGV


 21%|██        | 212/1000 [01:37<05:47,  2.27it/s]

Error in de novo sequence
Sequence: EVVMMMMMMMMMMGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGPGPGPGPGPGPGPGPGPGPGPGPGPGPGPGPGPGPGSGSGSGSGSGSGSGSGSGSGSGSGSGSGSGSGSGSGSGSGSGSGSGSGGGGGG


 24%|██▎       | 237/1000 [01:48<05:36,  2.27it/s]

Misaligned
De Novo Sequence: SPGLNQPPSVSASLGETASISCTGSSDHSNYIVNWYQQKPGKSPQWLIYVRSNRGDGVPSRFSGSSSGADRYLTISNIKSEDEDYYYCGTDYTISGQYKAVFGGGTHLTVL
Natural Sequence: LPVLTQPANASASLEESVKLTCTLSSDHSNYIVHWYQQQPGKAPRYLMYVRSDGNYKRGDGVPSRFSGSSSGADRYLTISNIKSDDEDDYYYCGTDYTISGQYKAVFGGGTHLTVL
de novo CDRs:  SDHSNYI   VR   GTDYTISGQYKAV
natural CDRs:  SDHSNYI   VR   CGTDYTISGQYKAV


 29%|██▉       | 294/1000 [02:13<05:11,  2.27it/s]

Error in de novo sequence
Sequence: EVTLLSLSVSPGETASISCKASSNGNTYLNWFRQKPGQSPQRLIYKVSNRDTGVPDRFSGSGSGTDFTLRISRVEADDTGVYYCGQVTQDPFTFGQGTKLEIK


 32%|███▏      | 315/1000 [02:23<05:02,  2.26it/s]

Error in de novo sequence
Sequence: EQLLESGGDLVKPGSLSCVASQNGNTYMYWVRQPGQGPQWLIYKVSNRDAGVPDRFSGSGSGTDFTLRISRVEADDAGVYYCMQGTHTPPTFGQGTKLEIK


 32%|███▎      | 325/1000 [02:27<04:55,  2.28it/s]

Error in de novo sequence
Sequence: EQLLESGGDLVKPGSLSCVASSYGNTFMYWFRQPGKGPQWLIYKVSNRDAGVPDRFSGSGSGTDFTLRISRVEADDTGVYYCGQGAQFPFTFGQGTKLEIK


 40%|████      | 400/1000 [03:01<04:22,  2.28it/s]

Error in de novo sequence
Sequence: EVQLLLESGGDLVKPAGSLRLSCVASQSHSDGNTYMNWFRQAPGKGPQWLIYLVSNRDAGVPDRFSGSGSGTDFTLRISRVEADDTGVYYCMQGTQFPWTFGQGTKLEIK


 48%|████▊     | 481/1000 [03:37<03:48,  2.27it/s]

Misaligned
De Novo Sequence: SPGLNQPPSVSASLGETASISCTGSSEHSNYIVNWYQQKPGKSPQWLIYVRSNRGDGVPSRFSGSSSGADRYLTISNIKSEDEDYYYCGADYTISGQSGYVFGGGTHLTVL
Natural Sequence: LPVLTQPTNASASLGESVKLTCTLNSEHSNYIVHWYQQQPGKAPRYLMYVRSDGTYKRGDGVPSRFSGSSSGADRYLTISNIKSEDEDDYYYCGADYTISGQSGYVFGGGTHLTVL
de novo CDRs:  SEHSNYI   VR   GADYTISGQSGYV
natural CDRs:  SEHSNYI   VR   CGADYTISGQSGYV


 59%|█████▉    | 590/1000 [04:26<03:03,  2.23it/s]

Misaligned
De Novo Sequence: SPGLNQPPSVSASLGETASISCTGSSEYSDYIVNWYQQKPGKSPQWLIYVRSNRGDGVPSRFSGSSSGADRYLTISNIKSEDEDYYYCGADYTISGHYGCVFGGGTHLTVL
Natural Sequence: LPVLTQPTNASASLEESVKLTCTLNSEYSDYIVHWYQQQRGKAPRYLMSVRTDGTYKRGDGVPGRFSGSGSGADRYLSISNIKAEDEDDYYYCGADYTISGHYGCVFGGGTHLTVL
de novo CDRs:  SEYSDYI   VR   GADYTISGHYGCV
natural CDRs:  SEYSDYI   VR   CGADYTISGHYGCV


 61%|██████    | 606/1000 [04:33<02:54,  2.26it/s]

Misaligned
De Novo Sequence: SPGLNQPPSVSASLGETASISCTGSSDHSNFVVNWYQQKPGKSPQWLIYVRSNRGDGVPSRFSGSSSGADRYLTISNIKSEDEDYYYCGADYPISGQSHYVFGGGTHLTVL
Natural Sequence: LPVLTQPTNAFASLAESVRVTCTLTSDHSNFVVRWYQQQPGKAPRYLMYVRSDGYYQRGVGIPSRFSGSGSGTDRYLTISDIKSEDEGDYYFCGADYPISGQSHYVFGGGTHLTVL
de novo CDRs:  SDHSNFV   VR   GADYPISGQSHYV
natural CDRs:  SDHSNFV   VR   CGADYPISGQSHYV


 64%|██████▎   | 635/1000 [04:46<02:43,  2.23it/s]

Misaligned
De Novo Sequence: SPGLNQPPSVSASLGETASISCTGSSGHSTYTVNWYQQKPGKSPQWLIYVRSNRGDGVPSRFSGSSSGADRYLTISNIKSEDEDYYYCGADYTISGQSAYVFGGGTHLTVL
Natural Sequence: LPVLTQPTSSSASLEESVKLTCTLSSGHSTYTVHWYQHQPGKAPRYLMYVRSDGYYKRGDGVPSRFSGSSSGADRYLTISNIKSEDEDDYYYCGADYTISGQSAYVFGGGTHLTVL
de novo CDRs:  SGHSTYT   VR   GADYTISGQSAYV
natural CDRs:  SGHSTYT   VR   CGADYTISGQSAYV


 69%|██████▉   | 693/1000 [05:12<02:16,  2.25it/s]

Error in de novo sequence
Sequence: EQLVESGGDLVKPASLSCKASQSHSDGNTYMYWVRQPGQSPQRLIYKVSNRDPGVPDRFSGSGSGTDFTLRISRVEDAGVYYCGHGIEDPPTFGQGTKLEIK


 71%|███████▏  | 713/1000 [05:21<02:09,  2.21it/s]

Misaligned
De Novo Sequence: SPGLNQPPSVSASLGETASISCTGSSEHSNYIVNWYQQKPGKSPQWLIYVKSNRGDGVPSRFSGSSSGADRYLTISNIKSEDEDYYYCGADYVVSGQYNPVFGGGTHLTVL
Natural Sequence: LPVLTQPTITSASLEESVRLTCTLSSEHSNYIVHWYQQQPGKAPRFLLYVKSDGTYKRGDGVPSRFSGSSSGADRYLTISNIKSEDEGDFYYCGADYVVSGQYNPVFGGGTHLTVL
de novo CDRs:  SEHSNYI   VK   GADYVVSGQYNPV
natural CDRs:  SEHSNYI   VK   CGADYVVSGQYNPV


 79%|███████▉  | 791/1000 [05:55<01:33,  2.24it/s]

Misaligned
De Novo Sequence: SVTQPASVSGSLGQRVTISCTGGDGNYVGWYQQLPGKSPQTLIYDSSNRPSGVPDRFSGSRSGSTATLTISGLQAEDEADYYCSSYDTSLSTVFGGGTHLTVL
Natural Sequence: QSVLTQPASVSGSLGQRVTISCTNVGDGNYVGWHQQLPGTSPRSLIYDSSRRLSGVPDRFSGSRSGSTATLTISGLQAEDEADYYCSSYDTSLSTVFGGGTHLTVL
de novo CDRs:  DGNY   DS   SSYDTSLSTV
natural CDRs:  GDGNY   DS   SSYDTSLSTV


 96%|█████████▌| 955/1000 [07:09<00:19,  2.27it/s]

Error in de novo sequence
Sequence: EVQLVESGGDLVKPASLSCVASQSHSDGNTYMYWVRQAPGKGPQRLIYFVSNRDAGVPDRFSGSGSGTDFTLRISRVEADDTGVYYCMQGTHFPRTFGQGTKLEIK


100%|██████████| 1000/1000 [07:29<00:00,  2.23it/s]

17
8
626
366
626
374





In [11]:
# Show the de novo and natural tallies from the comparison run, one per line.
print(
    de_novo_errors,
    de_novo_heavy,
    de_novo_light,
    natural_heavy,
    natural_light,
    sep="\n",
)

8
626
366
626
374


In [18]:
import os
import contextlib

scheme = "martin"
seq_de_novo = annotate(median_alignment_de_novo, scheme)
seq_natural = annotate(median_alignment_natural, scheme)

# retrieve() prints the full annotation; silence it and show the returned value instead.
with open(os.devnull, 'w') as devnull:
    with contextlib.redirect_stdout(devnull):
        result_de_novo = seq_de_novo.retrieve()
        result_natural = seq_natural.retrieve()

print("De Novo Result:", result_de_novo)
print("Natural Result:", result_natural)

# retrieve() returns None on failure; stop with a clear message rather than an
# opaque TypeError on the first subscript below.
if result_de_novo is None or result_natural is None:
    raise RuntimeError("Annotation failed for at least one sequence; cannot locate CDRs")

REGION_ORDER = ("FR1", "CDR1", "FR2", "CDR2", "FR3", "CDR3", "FR4")
CDR_NAMES = ("CDR1", "CDR2", "CDR3")


def chain_prefix(regions):
    """Return "H" when the region dict holds heavy-chain keys, otherwise "L"."""
    return "H" if "H-CDR1" in regions else "L"


def locate_cdrs(sequence, regions):
    """
    Find the inclusive (start, end) index of each CDR within `sequence`.

    The seven regions are concatenated in order and located as one block, and each
    CDR's position is derived from the lengths of the regions before it. This avoids
    matching a short CDR peptide against an identical substring earlier in the
    sequence. If the concatenated regions are not found as a block, each CDR is
    searched for on its own with `str.find` (start is -1 when it is absent).

    :param sequence: STRING, the amino acid sequence that was annotated
    :param regions: DICT, region:seq pairs as returned in `retrieve()[0]`
    :return: DICT mapping "CDR1"/"CDR2"/"CDR3" to an inclusive (start, end) tuple
    """
    prefix = chain_prefix(regions)
    peptides = [regions[f"{prefix}-{name}"] for name in REGION_ORDER]
    block_start = sequence.find("".join(peptides))

    spans = {}
    cursor = block_start
    for name, peptide in zip(REGION_ORDER, peptides):
        if name in CDR_NAMES:
            start = cursor if block_start >= 0 else sequence.find(peptide)
            spans[name] = (start, start + len(peptide) - 1)
        cursor += len(peptide)
    return spans


def report_cdrs(label, suffix, sequence, regions, spans):
    """
    Print the position and peptide of each CDR.

    :param label: STRING shown in parentheses, e.g. "De Novo"
    :param suffix: STRING used in the variable names shown, e.g. "de_novo"
    :param sequence: STRING, the annotated amino acid sequence
    :param regions: DICT, region:seq pairs as returned in `retrieve()[0]`
    :param spans: DICT from `locate_cdrs()`
    """
    prefix = chain_prefix(regions)
    for position, name in enumerate(CDR_NAMES):
        key = f"{prefix}-{name}"
        start, end = spans[name]
        if position:
            print()  # blank line between CDRs
        print(f"{key} ({label}) starts at index: {start}")
        print(f"{key} ({label}) ends at index: {end}")
        print(f"{key} sequence from median_alignment_{suffix}: {sequence[start:end+1]}")
        print(f"{key} sequence from result_{suffix}: {regions[key]}")


# Extract CDRs for de novo sequence
spans_de_novo = locate_cdrs(median_alignment_de_novo, result_de_novo[0])
cdr1_start_de_novo, cdr1_end_de_novo = spans_de_novo["CDR1"]
cdr2_start_de_novo, cdr2_end_de_novo = spans_de_novo["CDR2"]
cdr3_start_de_novo, cdr3_end_de_novo = spans_de_novo["CDR3"]

# Extract CDRs for natural sequence
spans_natural = locate_cdrs(median_alignment_natural, result_natural[0])
cdr1_start_natural, cdr1_end_natural = spans_natural["CDR1"]
cdr2_start_natural, cdr2_end_natural = spans_natural["CDR2"]
cdr3_start_natural, cdr3_end_natural = spans_natural["CDR3"]

# Print CDRs for de novo sequence
report_cdrs("De Novo", "de_novo", median_alignment_de_novo, result_de_novo[0], spans_de_novo)
print()

# Print CDRs for natural sequence
report_cdrs("Natural", "natural", median_alignment_natural, result_natural[0], spans_natural)
print()

# Check if CDRs are the same
prefix_de_novo = chain_prefix(result_de_novo[0])
prefix_natural = chain_prefix(result_natural[0])
cdr1_same = result_de_novo[0][f"{prefix_de_novo}-CDR1"] == result_natural[0][f"{prefix_natural}-CDR1"]
cdr2_same = result_de_novo[0][f"{prefix_de_novo}-CDR2"] == result_natural[0][f"{prefix_natural}-CDR2"]
cdr3_same = result_de_novo[0][f"{prefix_de_novo}-CDR3"] == result_natural[0][f"{prefix_natural}-CDR3"]

print(f"CDR1 same: {cdr1_same}")
print(f"CDR2 same: {cdr2_same}")
print(f"CDR3 same: {cdr3_same}")

De Novo Result: [{'H-FR1': 'EGQLAESGGDLVKPGGSLRLSCVAS', 'H-CDR1': 'GITFSSHP', 'H-FR2': 'MSWVRQAPGKGLQWVAY', 'H-CDR2': 'ISSSGSVT', 'H-FR3': 'SYADAVKGRFTISRDNAKNTLYLQMNSLRAEDTAVYYC', 'H-CDR3': 'AMQGCVGDSCPYYGIHY', 'H-FR4': 'WGPGTSLFVSS'}, {'H1': 'E', 'H2': 'G', 'H3': 'Q', 'H4': 'L', 'H5': 'A', 'H6': 'E', 'H7': 'S', 'H8': 'G', 'H9': 'G', 'H10': 'D', 'H11': 'L', 'H12': 'V', 'H13': 'K', 'H14': 'P', 'H15': 'G', 'H16': 'G', 'H17': 'S', 'H18': 'L', 'H19': 'R', 'H20': 'L', 'H21': 'S', 'H22': 'C', 'H23': 'V', 'H24': 'A', 'H25': 'S', 'H26': 'G', 'H27': 'I', 'H28': 'T', 'H29': 'F', 'H30': 'S', 'H31': 'S', 'H32': 'H', 'H33': 'P', 'H34': 'M', 'H35': 'S', 'H36': 'W', 'H37': 'V', 'H38': 'R', 'H39': 'Q', 'H40': 'A', 'H41': 'P', 'H42': 'G', 'H43': 'K', 'H44': 'G', 'H45': 'L', 'H46': 'Q', 'H47': 'W', 'H48': 'V', 'H49': 'A', 'H50': 'Y', 'H51': 'I', 'H52': 'S', 'H52A': 'S', 'H53': 'S', 'H54': 'G', 'H55': 'S', 'H56': 'V', 'H57': 'T', 'H58': 'S', 'H59': 'Y', 'H60': 'A', 'H61': 'D', 'H62': 'A', 'H63': 'V', 'H