In [1]:
import tmscoring

# Inputs: the two structures to compare (paths are relative to this notebook).
de_novo_filepath = '../../datasets/alphafold_test_output/antibody_de_novo.pdb'
original_filepath = '../../datasets/alphafold_test_output/antibody_real.pdb'

# Build the TM-scoring alignment object and run its optimiser before reading any metric.
alignment = tmscoring.TMscoring(de_novo_filepath, original_filepath)
alignment.optimise()

# Print, in order: the TM-score, the TM-score samples and the RMSD, each evaluated
# at the values reported by `get_current_values()` after optimisation.
for metric in (alignment.tmscore, alignment.tmscore_samples, alignment.rmsd):
    print(metric(**alignment.get_current_values()))

0.9961657282211568
[0.9065317  0.97646536 0.99066769 0.99943629 0.99890903 0.99974941
 0.99919408 0.9990635  0.99304587 0.99879329 0.99946023 0.99920663
 0.99860702 0.99953093 0.99966989 0.9994749  0.99404764 0.99794738
 0.99951838 0.99976513 0.9989943  0.99973341 0.99947068 0.99996409
 0.99173974 0.97946622 0.97217669 0.99324986 0.99753606 0.99450997
 0.995163   0.99690499 0.99958994 0.99958745 0.9981017  0.99974747
 0.99965754 0.99974237 0.9999491  0.99945208 0.98905179 0.94714656
 0.98124665 0.99985969 0.99957074 0.99821128 0.998696   0.99876352
 0.99969081 0.99919732 0.99982679 0.99936568 0.99979322 0.99961682
 0.99966988 0.99954144 0.999707   0.99956972 0.99842408 0.99288001
 0.99014968 0.99701254 0.99750572 0.9967157  0.99644031 0.99818758
 0.99972847 0.99978298 0.99955749 0.99934215 0.99991833 0.99986573
 0.99974039 0.9997296  0.99972779 0.99965105 0.99949743 0.99973922
 0.99961838 0.99994651 0.99996183 0.99960863 0.99936582 0.99960562
 0.99987396 0.99950064 0.99793754 0.9968109

Separability

In [1]:
import requests

class annotate():
    """
    class `annotate`.

    Sends an antibody variable-domain sequence to the Abnum web service, then splits
    the numbered residues it returns into framework (FR) and CDR regions.

    Initiator `__init__` has 2 parameters:

    :param aaseq: STRING: A single-letter, amino acid sequence corresponding to the complete VH or VL chain. Both uppercase and lowercase are accepted.

    :param scheme: STRING: "kabat", "chothia", "contact", "imgt", or "martin". Must be in lowercase

    Class has 3 methods. `retrieve()`: retrieves numbered seqs from Abnum website, then sends it to method `analyze` to determine the FR and CDR regions, and to `output()` to print the result and return a list of 2 dictionaries, the first of which contains region:seq pairs, the second of which contains number:residue pairs.

    NOTE(review): "martin" is accepted by `retrieve()` but has no boundary table or
    header of its own; it is handled exactly like "imgt" (same region boundaries,
    and `output()` prints "Annotation in IMGT scheme:"). Confirm this is intended.

    """

    # Schemes accepted by `retrieve()`.
    _SCHEMES = ("kabat", "chothia", "contact", "imgt", "martin")

    # Region order used everywhere in this class.
    _REGION_NAMES = ("FR1", "CDR1", "FR2", "CDR2", "FR3", "CDR3", "FR4")

    # Residue number at which CDR1, FR2, CDR2, FR3, CDR3 and FR4 start, per chain
    # and scheme. A scheme without its own row falls back to the "imgt" row.
    _BOUNDARIES = {
        "L": {
            "kabat":   (24, 35, 50, 57, 89, 98),
            "chothia": (24, 35, 50, 57, 89, 98),
            "contact": (30, 37, 46, 56, 89, 97),
            "imgt":    (27, 33, 50, 52, 89, 98),
        },
        "H": {
            "kabat":   (31, 36, 50, 66, 95, 103),
            "chothia": (26, 33, 52, 57, 95, 103),
            "contact": (30, 36, 47, 59, 93, 102),
            "imgt":    (26, 34, 51, 58, 93, 103),  # CDR2 spans 51..57
        },
    }

    # Header line printed by `output()`; unknown schemes use the IMGT header.
    _HEADERS = {
        "kabat": "Annotation in Kabat scheme:",
        "chothia": "Annotation in Chothia scheme:",
        "contact": "Annotation in Contact scheme:",
    }

    # Seconds to wait for the web service before giving up, so a stalled
    # connection cannot hang a long batch run forever.
    _REQUEST_TIMEOUT = 30

    def __init__(self, aaseq, scheme):

        self.aaseq=aaseq
        self.scheme=scheme

    def __repr__(self):
        return "Annotation of VH or VL sequence using Kabat, Chothia, Contact, or IMGT scheme"

    def output(self, chain, lst, regionlst):
        """
        Prints the FR and CDR regions and their corresponding seq. It returns a `list` of 2 `dict`.

        :param chain: STRING, "L" for a light chain; any other value is treated as heavy ("H")
        :param lst:  LIST, alternating residue numbers and residues, e.g. ["L1", "D", "L2", "I", ...]
        :param regionlst: LIST, a list of 7 peptides, each corresponds to a FR or CDR region
        :return: LIST, a list of 2 `dict`, The first dict consists of region: seq pairs. The second dict consists of number:residue pairs.

        """

        self.chain=chain
        self.lst=lst
        self.regionlst=regionlst

        # Labels sit at even positions, residues at odd positions. Pairing with zip
        # ignores a dangling trailing label instead of raising IndexError.
        self.numberdict = dict(zip(self.lst[0::2], self.lst[1::2]))

        print(self._HEADERS.get(self.scheme, "Annotation in IMGT scheme:"))

        prefix = "L" if self.chain == "L" else "H"
        keys = [f"{prefix}-{name}" for name in self._REGION_NAMES]

        # Labels are padded to 8 characters so the sequences line up in the printout.
        for position, key in enumerate(keys):
            print(f"{key}:".ljust(8), self.regionlst[position])

        self.regiondict = dict(zip(keys, self.regionlst))

        return [self.regiondict, self.numberdict]

    def analyze(self,chain, lst):
        """
        Define CDR and FR regions based on the numbered sequence returned from website

        :param chain: STRING, "L" for a light chain; any other value is treated as heavy ("H")
        :param lst: LIST, alternating residue numbers and residues, e.g. ["L1", "D", "L2", "I", ...]
        :return: LIST, a list of strings, where each string is a peptide corresponding to a region, in the order of: FR1, CDR1, FR2, CDR2, FR3, CDR3, FR4.
                 Returns `None` (after printing a message) when a boundary residue is
                 missing from `lst` or any other error occurs; no exception is raised.

        The regions are also stored on the instance as `L_FR1` ... `L_FR4` or
        `H_FR1` ... `H_FR4`; they are reset to "" at the start of every call.

        """

        self.chain=chain
        self.lst=lst

        prefix = "L" if self.chain == "L" else "H"
        for name in self._REGION_NAMES:
            setattr(self, f"{prefix}_{name}", "")

        table = self._BOUNDARIES[prefix]

        try:
            starts = table.get(self.scheme, table["imgt"])

            # Positions in `lst` where each region begins; `list.index` raises
            # ValueError when a boundary label is absent (incomplete V region).
            cuts = [0]
            cuts.extend(self.lst.index(f"{prefix}{number}") for number in starts)
            cuts.append(len(self.lst))

            regions = []
            for name, begin, end in zip(self._REGION_NAMES, cuts, cuts[1:]):
                # Step by 2 over the labels and take the residue that follows each one.
                peptide = "".join(self.lst[i + 1] for i in range(begin, end, 2))
                setattr(self, f"{prefix}_{name}", peptide)
                regions.append(peptide)

            return regions

        except ValueError:
            print("Unable to retrieve complete V region. Make sure the sequence has complete V region")
        except Exception:
            # Messages are kept exactly as they were for each chain type.
            if prefix == "L":
                print("An error occured")
            else:
                print("An error occured in the `analyze()` method")

        return None

    def retrieve (self):
        """
        Retrieve numbered residues from Abnum website

        :return: returns same object from the `output()` method, or `None` when the
                 request fails, nothing is returned by the service, or the regions
                 cannot be determined (a message is printed in each case).

        :raises: `ValueError` if input scheme is not among "kabat", "chothia", "contact", "imgt", and "martin"

        """

        self.url="http://www.bioinf.org.uk/abs/abnum/abnum.cgi"

        if self.scheme not in self._SCHEMES:
            raise ValueError(
                "Incorrect scheme mode. Must be one of the following (lowercase): "
                + ", ".join(self._SCHEMES)
            )

        # "kabat" is requested with "-k"; every other scheme is requested with "-c"
        # (presumably Chothia numbering -- confirm against the Abnum documentation).
        self.sche = "-k" if self.scheme == "kabat" else "-c"

        try:
            # Strip surrounding whitespace so sequences read with `readlines()` are
            # not sent with their trailing newline.
            self.d={"plain":1, "scheme":self.sche, "aaseq":self.aaseq.strip()}
            self.myPage=requests.get(self.url, params=self.d, timeout=self._REQUEST_TIMEOUT)
            self.text=self.myPage.text
            self.lst=self.text.split()

            if len(self.lst)>1:
                # The first character of the first label is used as the chain type.
                self.chain=self.lst[0][0]
                regions=self.analyze(self.chain, self.lst)
                if regions is None:
                    # `analyze()` already reported the problem.
                    return None
                self.result=self.output(self.chain, self.lst, regions)
                return self.result
            else:
                print("No annotation retrieved. Did you enter the complete VH or VL sequence?")
        except Exception:
            # Best effort: network and parsing problems are reported, not raised.
            # KeyboardInterrupt/SystemExit are deliberately NOT caught here.
            print("An error occured in the `retrieve()` method")

        return None

In [3]:
# Smoke test of `annotate` on a single example sequence using the "martin" scheme.
example_sequence = "DIVMTQTPLSLSVSPGETASISCKASKSLLHRNGFTYLNWFRQKPGQSPQRLIYQMSNRDAGVPDRFSGSGSGTDFTLRISRVEADDAGVYYCAQNLELPFTFGQGTKLEIK"
seq = annotate(example_sequence, "martin")

# `retrieve()` prints the regions itself; the returned value is printed as well.
example_annotation = seq.retrieve()
print(example_annotation)

Annotation in IMGT scheme:
L-FR1:   DIVMTQTPLSLSVSPGETASISCKAS
L-CDR1:  KSLLHRNGFTY
L-FR2:   LNWFRQKPGQSPQRLIY
L-CDR2:  QM
L-FR3:   SNRDAGVPDRFSGSGSGTDFTLRISRVEADDAGVYYC
L-CDR3:  AQNLELPFT
L-FR4:   FGQGTKLEIK
[{'L-FR1': 'DIVMTQTPLSLSVSPGETASISCKAS', 'L-CDR1': 'KSLLHRNGFTY', 'L-FR2': 'LNWFRQKPGQSPQRLIY', 'L-CDR2': 'QM', 'L-FR3': 'SNRDAGVPDRFSGSGSGTDFTLRISRVEADDAGVYYC', 'L-CDR3': 'AQNLELPFT', 'L-FR4': 'FGQGTKLEIK'}, {'L1': 'D', 'L2': 'I', 'L3': 'V', 'L4': 'M', 'L5': 'T', 'L6': 'Q', 'L7': 'T', 'L8': 'P', 'L9': 'L', 'L10': 'S', 'L11': 'L', 'L12': 'S', 'L13': 'V', 'L14': 'S', 'L15': 'P', 'L16': 'G', 'L17': 'E', 'L18': 'T', 'L19': 'A', 'L20': 'S', 'L21': 'I', 'L22': 'S', 'L23': 'C', 'L24': 'K', 'L25': 'A', 'L26': 'S', 'L27': 'K', 'L28': 'S', 'L29': 'L', 'L30': 'L', 'L30A': 'H', 'L30B': 'R', 'L30C': 'N', 'L30D': 'G', 'L30E': 'F', 'L31': 'T', 'L32': 'Y', 'L33': 'L', 'L34': 'N', 'L35': 'W', 'L36': 'F', 'L37': 'R', 'L38': 'Q', 'L39': 'K', 'L40': 'P', 'L41': 'G', 'L42': 'Q', 'L43': 'S', 'L44': 'P

In [2]:
from Bio import SeqIO, SeqRecord
import os, contextlib
import pandas as pd
from tqdm import tqdm
import warnings
import json
from Bio import Align

# Extract the CDR1/CDR2/CDR3 of every ground-truth sequence and cache them as JSON
# so the evaluation cell can compare predictions against them by position.
small_ft_ground_truths_path = "../../datasets/outputs/small_ft_only_greedy/test/processed"

aligner = Align.PairwiseAligner()
aligner.mode = 'global'
aligner.open_gap_score = -0
aligner.extend_gap_score = -0
scheme="martin"

cdr1s = []
cdr2s = []
cdr3s = []

heavy_chains = 0
light_chains = 0

with open(small_ft_ground_truths_path) as handle:
    for k, record in enumerate(tqdm(handle.readlines())):
        # Only every third line (offsets 0, 3, 6, ...) is annotated in this cell.
        if k % 3 != 0:
            continue

        seq=annotate(record,scheme)
        # `retrieve()` prints the annotation; silence it to keep the progress bar readable.
        with open(os.devnull, 'w') as devnull:
            with contextlib.redirect_stdout(devnull):
                result=seq.retrieve()

        if result is None:
            # Skipping would shift every later entry and silently misalign the
            # cached lists with the predictions, so fail loudly instead.
            raise RuntimeError(
                f"Could not annotate ground-truth sequence on line {k}: {record!r}"
            )

        regions = result[0]
        if regions.get("L-CDR1") is not None:
            chain_prefix = "L"
            light_chains += 1
        else:
            chain_prefix = "H"
            heavy_chains += 1

        cdr1s.append(regions[f"{chain_prefix}-CDR1"])
        cdr2s.append(regions[f"{chain_prefix}-CDR2"])
        cdr3s.append(regions[f"{chain_prefix}-CDR3"])

print(f"Light chains: {light_chains}")
print(f"Heavy chains: {heavy_chains}")
if light_chains:
    print(f"Ratio heavy to light: {heavy_chains/light_chains}")
else:
    print("Ratio heavy to light: undefined (no light chains)")

# Context managers guarantee the files are flushed and closed before the next cell reads them.
for cache_name, cdrs in (("cdr1s.json", cdr1s), ("cdr2s.json", cdr2s), ("cdr3s.json", cdr3s)):
    with open(cache_name, "w") as cache_file:
        json.dump(cdrs, cache_file)

100%|██████████| 3000/3000 [04:40<00:00, 10.70it/s]

Light chains: 374
Heavy chains: 626
Ratio heavy to light: 1.6737967914438503





In [2]:
from Bio import SeqIO, SeqRecord
import os, contextlib
import pandas as pd
from tqdm import tqdm
import warnings
from Bio.pairwise2 import format_alignment
import json
from Bio import Align

warnings.filterwarnings("ignore")

small_ft_full_data_path = "../../datasets/outputs/small_ft_full_beam/test/processed"
small_ft_data_path = "../../datasets/outputs/small_ft_beam/test/processed"
small_ft_only_data_path = "../../datasets/outputs/small_ft_only_beam/test/processed"

scheme="martin"

# Unused in this cell; kept in case later cells rely on these names.
records = []
seqs = []


def _load_json(path):
    """Read a JSON file and close it again (no dangling file handle)."""
    with open(path) as json_file:
        return json.load(json_file)


# Ground-truth CDRs cached by the extraction cell, indexed by sequence position.
cdr1s = _load_json("cdr1s.json")
cdr2s = _load_json("cdr2s.json")
cdr3s = _load_json("cdr3s.json")

aligner = Align.PairwiseAligner()
aligner.mode = 'global'
aligner.open_gap_score = -0
aligner.extend_gap_score = -0


def evaluate_separability(data_path, label, ground_truth_cdrs):
    """
    Annotate every predicted sequence in `data_path` and compare its CDRs with the ground truth.

    Lines at offsets 1, 4, 7, ... of the file are treated as predictions. A prediction
    counts as "separable" when `annotate(...).retrieve()` returns a result. For each
    separable prediction the loss is the sum, over CDR1-3, of
    `len(ground_truth_cdr) - best global alignment score`. Predictions that cannot be
    annotated are printed. A summary is printed at the end.

    :param data_path: STRING, path of the file holding the predictions
    :param label: STRING, name shown in the summary (e.g. "small_ft_only")
    :param ground_truth_cdrs: sequence of 3 LISTs (CDR1s, CDR2s, CDR3s), each indexed
                              by prediction position
    :return: TUPLE `(total, separable, loss, heavy_chains, light_chains)`
    """
    total = 0
    separable = 0
    total_loss = 0
    heavy = 0
    light = 0

    with open(data_path) as handle:
        for k, record in enumerate(tqdm(handle.readlines())):
            if k % 3 != 1:
                continue

            seq = annotate(record, scheme)
            # `retrieve()` prints the annotation; silence it to keep the log readable.
            with open(os.devnull, 'w') as devnull:
                with contextlib.redirect_stdout(devnull):
                    result = seq.retrieve()

            if result is not None:
                separable += 1
                regions = result[0]
                if regions.get("L-CDR1") is not None:
                    chain_prefix = "L"
                    light += 1
                else:
                    chain_prefix = "H"
                    heavy += 1

                for cdr_number, truths in enumerate(ground_truth_cdrs, start=1):
                    predicted = regions[f"{chain_prefix}-CDR{cdr_number}"]
                    truth = truths[total]
                    alignment = aligner.align(predicted, truth)
                    total_loss += len(truth) - alignment[0].score
            else:
                print(record)

            # Counted for every prediction so it stays aligned with the ground-truth index.
            total += 1

    # NaN instead of ZeroDivisionError when the file is empty or nothing is separable.
    print(f"Percentage separable for {label}")
    print(separable / total if total else float("nan"))
    print("average loss per separable sequence")
    print(total_loss / separable if separable else float("nan"))

    print("heavy chains")
    print(heavy)
    print("light chains")
    print(light)

    return total, separable, total_loss, heavy, light


ground_truth_cdrs = (cdr1s, cdr2s, cdr3s)

# Other model outputs; uncomment to evaluate them as well.
# evaluate_separability(small_ft_full_data_path, "small_ft_full", ground_truth_cdrs)
# evaluate_separability(small_ft_data_path, "small_ft", ground_truth_cdrs)

# The legacy names are still populated for any later cell that reads them.
i, j, loss, heavy_chains, light_chains = evaluate_separability(
    small_ft_only_data_path, "small_ft_only", ground_truth_cdrs
)

  0%|          | 8/3000 [00:00<03:53, 12.83it/s]





  1%|          | 29/3000 [00:02<03:47, 13.06it/s]





  1%|▏         | 44/3000 [00:03<03:54, 12.59it/s]





  2%|▏         | 62/3000 [00:04<03:39, 13.36it/s]





  4%|▍         | 125/3000 [00:10<03:45, 12.77it/s]





  5%|▍         | 140/3000 [00:11<03:33, 13.37it/s]





  5%|▌         | 152/3000 [00:12<03:49, 12.40it/s]

DPWSLVKPSLSCVSTSNIGGNNMSWVRQAPGKGLQWVAYSTTYADAVKGRFTISRDNAKNTLYLQMNSLRAEDTAVYYCSSWDDRLSGYVWGQGALVTVSS



  5%|▌         | 161/3000 [00:12<03:41, 12.81it/s]





  6%|▌         | 173/3000 [00:13<03:25, 13.76it/s]



DPHSLKPCMQGTQDPWTWS



  6%|▌         | 185/3000 [00:14<03:13, 14.58it/s]





  7%|▋         | 200/3000 [00:15<03:17, 14.20it/s]





  8%|▊         | 233/3000 [00:18<03:25, 13.45it/s]





  8%|▊         | 242/3000 [00:18<03:16, 14.01it/s]





  9%|▉         | 281/3000 [00:21<03:08, 14.44it/s]





 11%|█         | 320/3000 [00:24<03:05, 14.49it/s]





 11%|█▏        | 344/3000 [00:26<03:05, 14.29it/s]

DP



 12%|█▏        | 353/3000 [00:27<03:03, 14.46it/s]





 12%|█▏        | 371/3000 [00:28<03:01, 14.49it/s]

DP



 13%|█▎        | 389/3000 [00:29<03:12, 13.59it/s]





 13%|█▎        | 398/3000 [00:30<03:03, 14.14it/s]





 14%|█▍        | 413/3000 [00:31<03:07, 13.81it/s]





 14%|█▍        | 422/3000 [00:32<03:17, 13.06it/s]





 15%|█▌        | 455/3000 [00:34<03:06, 13.64it/s]





 15%|█▌        | 464/3000 [00:35<03:01, 13.96it/s]





 16%|█▌        | 485/3000 [00:37<02:59, 14.04it/s]





 16%|█▋        | 491/3000 [00:37<02:50, 14.70it/s]





 17%|█▋        | 518/3000 [00:39<03:11, 12.98it/s]





 18%|█▊        | 545/3000 [00:41<02:46, 14.72it/s]





 18%|█▊        | 551/3000 [00:41<02:47, 14.60it/s]





 20%|█▉        | 593/3000 [00:45<03:07, 12.87it/s]





 20%|██        | 605/3000 [00:46<02:55, 13.63it/s]





 20%|██        | 614/3000 [00:46<02:50, 14.00it/s]





 21%|██        | 635/3000 [00:48<02:35, 15.17it/s]







 22%|██▏       | 659/3000 [00:50<02:51, 13.68it/s]





 22%|██▏       | 668/3000 [00:50<02:42, 14.34it/s]





 23%|██▎       | 683/3000 [00:51<02:41, 14.38it/s]





 25%|██▍       | 743/3000 [00:56<02:45, 13.64it/s]





 25%|██▌       | 764/3000 [00:57<02:30, 14.88it/s]







 26%|██▌       | 770/3000 [00:58<02:39, 13.99it/s]

DPWSLVKPSLSLSCVSNIGSKSMSWVRQAPGKGLQWVAYGDTYADAVKGRFTISRDNAKNTLYLQMNSLRAEDTAVYYCQVWDNSAIVWGQGALVTVSS



 26%|██▌       | 776/3000 [00:58<02:26, 15.21it/s]

DPHSLKCGQVIQDPLTWS



 26%|██▋       | 791/3000 [00:59<02:33, 14.36it/s]

DPHSLKPCMQGIQTPYTWS



 27%|██▋       | 815/3000 [01:01<02:34, 14.15it/s]





 27%|██▋       | 821/3000 [01:02<02:26, 14.89it/s]

DPHSLKPCMQGIQTPYTWS



 29%|██▉       | 872/3000 [01:05<02:19, 15.23it/s]







 30%|███       | 911/3000 [01:08<02:27, 14.21it/s]





 31%|███       | 926/3000 [01:09<02:21, 14.61it/s]





 31%|███       | 929/3000 [01:09<02:22, 14.51it/s]





 32%|███▏      | 950/3000 [01:11<02:21, 14.47it/s]





 32%|███▏      | 962/3000 [01:12<02:06, 16.06it/s]



DPHSLKPCMQSIQTPPTWS



 32%|███▏      | 974/3000 [01:13<02:16, 14.89it/s]





 35%|███▌      | 1052/3000 [01:19<02:23, 13.60it/s]





 36%|███▌      | 1073/3000 [01:20<02:11, 14.70it/s]







 36%|███▌      | 1079/3000 [01:21<02:06, 15.15it/s]





 36%|███▌      | 1085/3000 [01:21<02:13, 14.35it/s]

DPWSLVKPSLSLSCVSQSLLHSNGNTYMSWVRQAPGKGLQWVAYLVTYADAVKGRFTISRDNAKNTLYLQMNSLRAEDTAVYYCMQGMPGLPTWGQGTLVPVVK



 36%|███▋      | 1091/3000 [01:21<02:06, 15.08it/s]





 37%|███▋      | 1100/3000 [01:22<02:08, 14.77it/s]





 37%|███▋      | 1106/3000 [01:22<02:10, 14.54it/s]

DCDPKSLSCGQGTQFPLTWHPGHRLL



 38%|███▊      | 1127/3000 [01:24<02:08, 14.53it/s]







 38%|███▊      | 1130/3000 [01:24<02:03, 15.18it/s]





 39%|███▉      | 1169/3000 [01:27<02:17, 13.36it/s]





 39%|███▉      | 1181/3000 [01:28<02:12, 13.74it/s]





 40%|████      | 1202/3000 [01:30<02:02, 14.66it/s]

DCDPKSLSCGQGTQAPTWHPGHRLL



 40%|████      | 1211/3000 [01:30<01:58, 15.05it/s]





 41%|████      | 1232/3000 [01:32<02:01, 14.52it/s]





 41%|████▏     | 1241/3000 [01:32<01:58, 14.89it/s]





 42%|████▏     | 1268/3000 [01:34<01:50, 15.72it/s]







 44%|████▎     | 1310/3000 [01:37<01:59, 14.18it/s]





 45%|████▌     | 1364/3000 [01:41<01:57, 13.86it/s]

DPCPSLSSQSLLHRDGSSYWHRQAPGKGLQWVAYKVTYADAVKGRFTISRDNAKNTLYLQMNSLRAEDTAVYYCGQGTQFPWTWGQGTLVTVSS





 46%|████▌     | 1373/3000 [01:42<01:57, 13.82it/s]





 46%|████▌     | 1379/3000 [01:42<01:49, 14.78it/s]





 46%|████▋     | 1391/3000 [01:43<01:55, 13.89it/s]





 47%|████▋     | 1400/3000 [01:44<01:51, 14.38it/s]





 47%|████▋     | 1409/3000 [01:44<01:43, 15.38it/s]







 47%|████▋     | 1418/3000 [01:45<01:52, 14.10it/s]





 48%|████▊     | 1427/3000 [01:46<01:50, 14.18it/s]





 48%|████▊     | 1451/3000 [01:48<01:50, 14.01it/s]





 49%|████▊     | 1460/3000 [01:48<01:42, 15.10it/s]







 50%|█████     | 1508/3000 [01:52<01:42, 14.53it/s]





 51%|█████     | 1517/3000 [01:52<01:39, 14.86it/s]

DPHSLKPCSSYDSSLSGVVWSS



 51%|█████     | 1523/3000 [01:53<01:37, 15.15it/s]





 52%|█████▏    | 1562/3000 [01:56<01:48, 13.20it/s]

DPCPSLSISQSLLRSNGNTYPWVRQAPGKGLQWVAYKVTYPDLKGRFTISRDNAKNTLYLQMNSLRAEDTAVYYCGQGTHSPYTWGQGTLVTVSS



 53%|█████▎    | 1586/3000 [01:57<01:38, 14.33it/s]





 53%|█████▎    | 1595/3000 [01:58<01:46, 13.24it/s]

DPCPSLSISQSLLRSDGKSYPWVRQAPGKGLQWVAYEATYPDLKSRFTISRDNAKNTLYLQMNSLRAEDTAVYYCGQGTHTPWTWGQGTLVTVSS



 54%|█████▎    | 1607/3000 [01:59<01:36, 14.42it/s]





 54%|█████▍    | 1631/3000 [02:01<01:39, 13.69it/s]





 56%|█████▌    | 1667/3000 [02:04<01:38, 13.52it/s]





 56%|█████▌    | 1676/3000 [02:04<01:34, 13.98it/s]





 57%|█████▋    | 1700/3000 [02:06<01:32, 14.06it/s]

DPHSLKPCGQTIQVPPTWS



 57%|█████▋    | 1715/3000 [02:07<01:28, 14.56it/s]





 59%|█████▉    | 1778/3000 [02:12<01:25, 14.31it/s]

DCDPKSLSCGQGTQGPPTWHPGHRLL



 60%|██████    | 1805/3000 [02:14<01:21, 14.69it/s]







 60%|██████    | 1808/3000 [02:14<01:16, 15.53it/s]





 61%|██████    | 1823/3000 [02:15<01:30, 13.03it/s]





 61%|██████    | 1835/3000 [02:16<01:25, 13.67it/s]







 62%|██████▏   | 1874/3000 [02:19<01:18, 14.34it/s]





 65%|██████▌   | 1961/3000 [02:26<01:13, 14.22it/s]





 66%|██████▌   | 1982/3000 [02:27<01:09, 14.66it/s]







 66%|██████▋   | 1988/3000 [02:28<01:05, 15.54it/s]





 67%|██████▋   | 1997/3000 [02:28<01:07, 14.85it/s]

DPHSLKPCMQGTQTPHTWS



 67%|██████▋   | 2006/3000 [02:29<01:11, 13.86it/s]

DPWSLVKPSLSLSCVSNIGSKSMSWVRQAPGKGLQWVAYYDTYADAVKGRFTISRDNAKNTLYLQMNSLRAEDTAVYYCSTWDDSLSGIVWGQGTLVPVVK



 67%|██████▋   | 2021/3000 [02:30<01:06, 14.67it/s]





 68%|██████▊   | 2039/3000 [02:31<01:05, 14.61it/s]





 68%|██████▊   | 2048/3000 [02:32<01:04, 14.87it/s]





 70%|██████▉   | 2090/3000 [02:35<01:02, 14.47it/s]





 70%|███████   | 2111/3000 [02:37<01:01, 14.55it/s]

DP



 71%|███████▏  | 2141/3000 [02:39<01:03, 13.58it/s]





 72%|███████▏  | 2165/3000 [02:41<00:55, 15.01it/s]







 73%|███████▎  | 2183/3000 [02:42<00:58, 13.89it/s]





 73%|███████▎  | 2198/3000 [02:43<00:59, 13.48it/s]





 74%|███████▍  | 2213/3000 [02:45<01:04, 12.19it/s]





 74%|███████▍  | 2234/3000 [02:46<00:54, 14.14it/s]





 75%|███████▌  | 2252/3000 [02:47<00:52, 14.15it/s]





 75%|███████▌  | 2264/3000 [02:48<00:47, 15.49it/s]







 76%|███████▌  | 2285/3000 [02:50<00:52, 13.56it/s]





 76%|███████▋  | 2294/3000 [02:50<00:48, 14.56it/s]







 77%|███████▋  | 2303/3000 [02:51<00:49, 14.06it/s]

DCDPKSLSCGQGLHAPQTWHPGHRLL



 77%|███████▋  | 2312/3000 [02:52<00:48, 14.22it/s]

DCDPKSLSCGQGTQAPVTWHPGHRLL



 78%|███████▊  | 2336/3000 [02:54<00:46, 14.28it/s]





 79%|███████▊  | 2360/3000 [02:55<00:46, 13.63it/s]

DPWSLVKPSLSLSCVSNIGSKTMSWVRQAPGKGLQWVAYYTTYADAVKGRFTISRDNAKNTLYLQMNSLRAEDTAVYYCQVWDNNVIVWGQGTLVPVVK



 80%|███████▉  | 2396/3000 [02:58<00:43, 13.93it/s]





 82%|████████▏ | 2453/3000 [03:03<00:43, 12.64it/s]

DCDPKSLSCGQGTHPPHTWHPGHRLL



 82%|████████▏ | 2468/3000 [03:04<00:38, 13.83it/s]





 83%|████████▎ | 2483/3000 [03:05<00:36, 14.29it/s]





 84%|████████▎ | 2510/3000 [03:07<00:35, 13.77it/s]





 85%|████████▍ | 2540/3000 [03:09<00:31, 14.71it/s]







 86%|████████▌ | 2573/3000 [03:12<00:30, 14.19it/s]





 86%|████████▋ | 2591/3000 [03:13<00:29, 13.79it/s]





 88%|████████▊ | 2642/3000 [03:17<00:24, 14.65it/s]







 88%|████████▊ | 2648/3000 [03:17<00:23, 15.06it/s]





 89%|████████▉ | 2666/3000 [03:19<00:23, 14.37it/s]





 89%|████████▉ | 2681/3000 [03:20<00:23, 13.84it/s]





 91%|█████████ | 2735/3000 [03:24<00:19, 13.67it/s]





 92%|█████████▏| 2750/3000 [03:25<00:18, 13.75it/s]





 92%|█████████▏| 2774/3000 [03:27<00:16, 14.05it/s]





 94%|█████████▍| 2819/3000 [03:30<00:13, 13.58it/s]





 97%|█████████▋| 2924/3000 [03:38<00:05, 13.25it/s]





 98%|█████████▊| 2936/3000 [03:39<00:04, 13.90it/s]





 98%|█████████▊| 2942/3000 [03:40<00:03, 14.76it/s]





 98%|█████████▊| 2945/3000 [03:40<00:03, 14.31it/s]

DPWSLVKPSLSCVSNLNKYYMSWVRQAPGKGLQWVAYRDTYADAVKGRFTISRDNAKNTLYLQMNSLRAEDTAVYYCSAWDNGLTVVWGQGALVTVSS



100%|█████████▉| 2990/3000 [03:43<00:00, 14.22it/s]





100%|██████████| 3000/3000 [03:44<00:00, 13.37it/s]

Percentage separable for small_ft_only
0.841
average loss per separable sequence
0.0558858501783591
heavy chains
841
light chains
0



