In [1]:
import numpy as np
import pandas as pd

import Bio
import Bio.PDB
from Bio import SeqIO

## Combine seq_fragments of the same alleles together

In [2]:
def generate_fasta_for_all_alleles(fn):
    # Only include alleles which cover all the domains in C region, and exclude the membrane bound part
    results={}
    for record in SeqIO.parse(fn,"fasta"):
        title_lst=record.description.split("|")
        
        
        #ac_num=title_lst[0]
        o_allele_name=title_lst[1]
        species=title_lst[2].split("_")[0] 
        #.split() is used because sometimes the species is in this format: Homo sapiens_Lebanese individual (Lib-A2)
        
        if_functional=title_lst[3]
        if_partial=title_lst[13]# The label of the partial ones
        domain_info=title_lst[4]
        
        allele_name=f"{o_allele_name}|{species}"
        
        
        if (if_functional != "F" and if_functional !="(F)") or (allele_name[0:2]!="IG") or ("M" in title_lst[4]) or ("-LIKE" in title_lst[4]) or ("D-REGION" in title_lst[4]) or ("J-REGION" in title_lst[4]):
            # only include functional IG sequence
            # exclude the sequence of the membrane tail
            # exclude the -Like sequence, such as IGLL
            # exclude D&J genes
            continue
        
        if allele_name not in results.keys():
            results[allele_name]=[record.seq,if_partial,domain_info]
        else:
            results[allele_name][0]+=record.seq
            results[allele_name][1]+=if_partial
            results[allele_name][2]+=domain_info
    
    vref={}
    cref={}
    for k,v in results.items():
        if ("partial" not in v[1]) :
            # to exclude the partial_seq and the alleles which lack the ch1 domains
            if "V-REGION" in v[2]:
                vref[k]=v[0]
            else:
                if v[2][0:3]=="CH1" or v[2][0:8]=="C-REGION":
                    # only include C sequences starting with CH1 domain
                    cref[k]=v[0]
            
    return vref,cref

In [3]:
def write_fasta_files(allele_dict,out_fn):
    with open(out_fn,"w") as f:
        for title,allele_seq in allele_dict.items():
            title=title.replace(" ","_")
            f.write(f">{title}\n")
            f.write(f"{allele_seq}\n")

In [41]:
len("C-REGION")

8

In [4]:
!pwd

/Users/dongjung/Documents/all_species_vcab/seq_db/ref_db


In [36]:
t_total

{'IGHA*01|Bos taurus': [Seq('XSETSPSIFPLSLGNNDPAGQVVIGCLVQGFFPSAPLSVTWNQNGDSVSVRNFP...VCY'),
  '   ',
  'CH1H-CH2CH3-CHS'],
 'IGHD*03|Bos taurus': [Seq('EGESHLRVFPLVSCVSSPSDESTVALGCLARDFVPNSVSFSWKFNNSTVSSERF...DTG'),
  '     ',
  'CH1H1H2CH2CH3'],
 'IGHE*01|Bos taurus': [Seq('XSIQAPSIYPLRLCCTEEARVRLGCLVKDYLPGSVTVTWDTVPLDGSTLTFPSI...SGK'),
  '    ',
  'CH1CH2CH3CH4-CHS'],
 'IGHE*02|Bos taurus': [Seq('XSIQAPSIYPLRLCCTEEARVRLGCLVKDYLPGSVTVTWDTVPLDGSTLTFPSI...SGK'),
  '    ',
  'CH1CH2CH3CH4-CHS'],
 'IGHG1*01|Bos taurus': [Seq('XSTTAPKVYPLSSCCGDKSSSTVTLGCLVSSYMPEPVTVTWNSGALKSGVHTFP...AGK'),
  '    ',
  'CH1HCH2CH3-CHS'],
 'IGHG1*02|Bos taurus': [Seq('ASTTAPKVYPLSSCCGDKSSSTVTLGCLVSSYMPEPVTVTWNSGALKSGVHTFP...AGK'),
  '    ',
  'CH1HCH2CH3-CHS'],
 'IGHG1*03|Bos taurus': [Seq('ASTTAPKVYPLSSCCGDKSSSTVTLGCLVSSYMPEPVTVTWNSGALKSGVHTFP...AGK'),
  '    ',
  'CH1HCH2CH3-CHS'],
 'IGHG1*04|Bos taurus': [Seq('XSTTAPKVYPLSSCCGDKSSSTVTLGCLVSSYMPEPVTVTWNSGALKSGVHTFP...AGK'),
  '    ',
  'CH1HCH2CH3-CHS'],


In [4]:
t_vref,t_cref=generate_fasta_for_all_alleles("all_species_ref_seq_imgt.fasta")

In [5]:
len(t_vref),len(t_cref)

(2233, 388)

In [13]:
len(t_vref),len(t_cref)

(2233, 281)

In [15]:
{k:v for k,v in t_cref.items() if k.split("|")[1]=="Homo sapiens"}

{'IGHA1*01|Homo sapiens': Seq('ASPTSPKVFPLSLCSTQPDGNVVIACLVQGFFPQEPLSVTWSESGQGVTARNFP...TCY'),
 'IGHA1*03|Homo sapiens': Seq('ASPTSPKVFPLSLCSTQPDGNVVIACLVQGFFPQEPLSVTWSESGQGVTARNFP...TCY'),
 'IGHA2*01|Homo sapiens': Seq('ASPTSPKVFPLSLDSTPQDGNVVVACLVQGFFPQEPLSVTWSESGQNVTARNFP...TCY'),
 'IGHA2*03|Homo sapiens': Seq('ASPTSPKVFPLSLDSTPQDGNVVVACLVQGFFPQEPLSVTWSESGQNVTARNFP...TCY'),
 'IGHD*01|Homo sapiens': Seq('APTKAPDVFPIISGCRHPKDNSPVVLACLITGYHPTSVTVTWYMGTQSQPQRTF...PMK'),
 'IGHD*02|Homo sapiens': Seq('APTKAPDVFPIISGCRHPKDNSPVVLACLITGYHPTSVTVTWYMGTQSQPQRTF...PMK'),
 'IGHE*01|Homo sapiens': Seq('ASTQSPSVFPLTRCCKNIPSNATSVTLGCLATGYFPEPVMVTCDTGSLNGTTMT...PGK'),
 'IGHE*02|Homo sapiens': Seq('ASTQSPSVFPLTRCCKNIPSNATSVTLGCLATGYFPEPVMVTWDTGSLNGTTMT...PGK'),
 'IGHE*03|Homo sapiens': Seq('ASTQSPSVFPLTRCCKNIPSNATSVTLGCLATGYFPEPVMVTCDTGSLNGTTMT...PGK'),
 'IGHE*04|Homo sapiens': Seq('ASTQSPSVFPLTRCCKNIPSNATSVTLGCLATGYFPEPVMVTWDTGSLNGTTMT...PGK'),
 'IGHG1*01|Homo sapiens': Seq('ASTKGPSVFPLAPSSKSTSGGTAAL

In [16]:
{k:[i,len(i)] for k,i in t_vref.items() if len(str(i))<90 or len(str(i))>110}

{'IGIV1-2*01|Danio rerio': [Seq('QIVVTQYEVKSVQPGQTVSINCKVNTVVFKNDQGTTSKGRYYIHWYSQKPEEAP...WVF'),
  112]}

In [65]:
set([i.split("|")[1] for i in t_cref.keys()])

{'Bos taurus',
 'Canis lupus familiaris',
 'Danio rerio',
 'Equus caballus',
 'Gallus gallus',
 'Gorilla gorilla gorilla',
 'Homo sapiens',
 'Macaca fascicularis',
 'Macaca mulatta',
 'Mus musculus',
 'Oncorhynchus mykiss',
 'Ornithorhynchus anatinus',
 'Oryctolagus cuniculus',
 'Rattus norvegicus',
 'Rattus rattus',
 'Salmo salar',
 'Sus scrofa',
 'Vicugna pacos'}

In [66]:
set([i.split("|")[1] for i in t_vref.keys()])

{'Bos taurus',
 'Camelus dromedarius',
 'Canis lupus familiaris',
 'Capra hircus',
 'Danio rerio',
 'Equus caballus',
 'Felis catus',
 'Gallus gallus',
 'Gorilla gorilla gorilla',
 'Homo sapiens',
 'Lemur catta',
 'Macaca fascicularis',
 'Macaca mulatta',
 'Mus cookii',
 'Mus minutoides',
 'Mus musculus',
 'Mus musculus castaneus',
 'Mus musculus domesticus',
 'Mus musculus molossinus',
 'Mus musculus musculus',
 'Mus pahari',
 'Mus saxicola',
 'Mus spretus',
 'Oncorhynchus mykiss',
 'Ornithorhynchus anatinus',
 'Oryctolagus cuniculus',
 'Ovis aries',
 'Rattus norvegicus',
 'Salmo salar',
 'Sus scrofa',
 'Vicugna pacos'}

In [6]:
write_fasta_files(t_vref,"total_species_v_ref.fasta")
write_fasta_files(t_cref,"total_species_c_ref.fasta")

## Extract alleles with unique amino acis sequence
### At the same time, keep a record of the collapsed alleles

In [7]:
def classify_seq_by_species (a_dict):
    # a_dict: dictionary in this format: {allele_name(including_species):allele_seq}
    results={}
    
    for k,v in a_dict.items():
        a_name,species=k.split("|")
        if species not in results.keys():
            results[species]={a_name:v}
        else:
            results[species][a_name]=v
    return results

In [8]:
def find_unique_seq_within_one_species (alleles):
    # allels is a dictionary in this format {fasta_title:sequence}, 
    ## where fasta_title is like this "IGHG1*01"
    
    results={} # hold the unique alleles
    collapsed_alleles={} # in this format: {allele_name:list(collpased_allele_names(with the same sequence))}
    for k,v in alleles.items():
        chain_type,allele=k.split("*")
        allele_full_name=k
        if chain_type not in set([i.split("*")[0] for i in results.keys()]):
            # the new chain type
            results[k]=v
            collapsed_alleles[k]=[]
        else:
            new_seq=v
            old_seq=[v for k,v in results.items() if k.split("*")[0]==chain_type] # extract the allele seqs of the same chain type
            if new_seq not in old_seq:
                results[k]=v
                collapsed_alleles[k]=[]
            else:
                old_allele=[k for k,v in results.items() if v==new_seq][0]
                collapsed_alleles[old_allele].append(allele_full_name)
    return results,collapsed_alleles

In [11]:
def find_unique_seq_all_species (a_dict):
    # a_dict is a dictionary in this format: {allele_full_name_with_species:sequence}
    allele_by_species=classify_seq_by_species (a_dict)
    
    total_unique_a={}
    total_collapsed_a={}
    
    for species,a_info in allele_by_species.items():
        # find the unique alleles within one species
        species=species.replace(" ","_")
        unique_a,collapsed_a=find_unique_seq_within_one_species (a_info)
        
        total_unique_a[species]=unique_a
        total_collapsed_a[species]=collapsed_a
    
    return total_unique_a,total_collapsed_a
        

In [12]:
t_unique_vref,t_collapsed_vref=find_unique_seq_all_species (t_vref)
t_unique_cref,t_collapsed_cref=find_unique_seq_all_species (t_cref)

In [13]:
import json
def write_json_file(fn,out_dict):
    with open(fn, "w") as outfile:
        json.dump(out_dict, outfile)

In [14]:
write_json_file("all_species_collapsed_v_alleles.json",t_collapsed_vref)
write_json_file("all_species_collapsed_c_alleles.json",t_collapsed_cref)

In [15]:
t_collapsed_cref["Homo_sapiens"]

{'IGHA1*01': ['IGHA1*03'],
 'IGHA2*01': [],
 'IGHA2*03': [],
 'IGHD*01': ['IGHD*02'],
 'IGHE*01': [],
 'IGHE*02': ['IGHE*04'],
 'IGHE*03': [],
 'IGHG1*01': ['IGHG1*02',
  'IGHG1*05',
  'IGHG1*09',
  'IGHG1*10',
  'IGHG1*12',
  'IGHG1*14'],
 'IGHG1*03': ['IGHG1*06'],
 'IGHG1*04': [],
 'IGHG1*07': [],
 'IGHG1*08': [],
 'IGHG1*11': [],
 'IGHG1*13': [],
 'IGHG2*01': ['IGHG2*03',
  'IGHG2*05',
  'IGHG2*07',
  'IGHG2*08',
  'IGHG2*10',
  'IGHG2*12',
  'IGHG2*13'],
 'IGHG2*02': [],
 'IGHG2*04': [],
 'IGHG2*06': [],
 'IGHG2*09': [],
 'IGHG2*11': ['IGHG2*14'],
 'IGHG2*15': [],
 'IGHG3*01': ['IGHG3*05', 'IGHG3*10', 'IGHG3*21'],
 'IGHG3*03': [],
 'IGHG3*04': [],
 'IGHG3*06': ['IGHG3*07'],
 'IGHG3*08': [],
 'IGHG3*09': [],
 'IGHG3*11': ['IGHG3*28'],
 'IGHG3*12': [],
 'IGHG3*13': [],
 'IGHG3*14': ['IGHG3*27', 'IGHG3*29'],
 'IGHG3*15': [],
 'IGHG3*16': [],
 'IGHG3*17': [],
 'IGHG3*18': [],
 'IGHG3*19': [],
 'IGHG3*20': [],
 'IGHG3*22': [],
 'IGHG3*23': [],
 'IGHG3*24': [],
 'IGHG3*25': [],
 'IGHG3*2

In [16]:
set([i[0:4] for i in sum([list(t_unique_vref[k].keys()) for k in t_unique_vref.keys()],[])])

{'IGHV', 'IGIV', 'IGKV', 'IGLV'}

In [17]:
set([i[0:4] for i in sum([list(t_unique_cref[k].keys()) for k in t_unique_cref.keys()],[])])

{'IGHA',
 'IGHD',
 'IGHE',
 'IGHG',
 'IGHM',
 'IGHO',
 'IGHT',
 'IGHY',
 'IGIC',
 'IGKC',
 'IGLC'}

## Separate the unique alleles into Heavy and light

In [18]:
def separate_alleles_into_HL (a_dict):
    # 2 purposes:
    ## separate alleles into H alleles and L alleles
    ## combine the allele name with species to form a new dictionary, so that the dict is not nested, 
    ###which is convenient to write fasta files
    
    h_alleles={}
    l_alleles={}
    
    for species,a_info in a_dict.items():
        for a_name,seq in a_info.items():
            if a_name[0:3]=="IGH":
                h_alleles[f"{a_name}|{species}"]=seq
            else:
                l_alleles[f"{a_name}|{species}"]=seq
    return h_alleles, l_alleles
            

In [19]:
t_unique_vhref,t_unique_vlref=separate_alleles_into_HL (t_unique_vref)
t_unique_chref,t_unique_clref=separate_alleles_into_HL (t_unique_cref)

In [34]:
len(sum([list(i.values()) for i in t_unique_vref.values()],[]))

2102

In [20]:
len(t_vref),len(sum([list(i.values()) for i in t_unique_vref.values()],[])),len(t_unique_vhref),len(t_unique_vlref)

(2233, 2102, 1107, 995)

In [21]:
2102==1107+995

True

In [22]:
len(t_cref),len(sum([list(i.values()) for i in t_unique_cref.values()],[])),len(t_unique_chref),len(t_unique_clref)

(388, 347, 248, 99)

In [23]:
347==248+99

True

In [24]:
t_unique_chref

{'IGHA*01|Bos_taurus': Seq('XSETSPSIFPLSLGNNDPAGQVVIGCLVQGFFPSAPLSVTWNQNGDSVSVRNFP...VCY'),
 'IGHD*03|Bos_taurus': Seq('EGESHLRVFPLVSCVSSPSDESTVALGCLARDFVPNSVSFSWKFNNSTVSSERF...DTG'),
 'IGHE*01|Bos_taurus': Seq('XSIQAPSIYPLRLCCTEEARVRLGCLVKDYLPGSVTVTWDTVPLDGSTLTFPSI...SGK'),
 'IGHE*02|Bos_taurus': Seq('XSIQAPSIYPLRLCCTEEARVRLGCLVKDYLPGSVTVTWDTVPLDGSTLTFPSI...SGK'),
 'IGHG1*01|Bos_taurus': Seq('XSTTAPKVYPLSSCCGDKSSSTVTLGCLVSSYMPEPVTVTWNSGALKSGVHTFP...AGK'),
 'IGHG1*02|Bos_taurus': Seq('ASTTAPKVYPLSSCCGDKSSSTVTLGCLVSSYMPEPVTVTWNSGALKSGVHTFP...AGK'),
 'IGHG1*03|Bos_taurus': Seq('ASTTAPKVYPLSSCCGDKSSSTVTLGCLVSSYMPEPVTVTWNSGALKSGVHTFP...AGK'),
 'IGHG1*04|Bos_taurus': Seq('XSTTAPKVYPLSSCCGDKSSSTVTLGCLVSSYMPEPVTVTWNSGALKSGVHTFP...AGK'),
 'IGHG2*01|Bos_taurus': Seq('XSTTAPKVYPLSSCCGDKSSSTVTLGCLVSSYMPEPVTVTWNSGALKSGVHTFP...AGK'),
 'IGHG2*02|Bos_taurus': Seq('XSTTAPKVYPLSSCCGDKSSSGVTLGCLVSSYMPEPVTVTWNSGALKSGVHTFP...AGK'),
 'IGHG2*03|Bos_taurus': Seq('ASTTAPKVYPLASSCGDTSSSTVTLGCLVSSYMPEPVTVTWNSGA

In [25]:
write_fasta_files(t_unique_chref,"all_species_unique_CH_alleles.fasta")
write_fasta_files(t_unique_clref,"all_species_unique_CL_alleles.fasta")

In [26]:
write_fasta_files(t_unique_vhref,"all_species_unique_VH_alleles.fasta")
write_fasta_files(t_unique_vlref,"all_species_unique_VL_alleles.fasta")

In [27]:
def combine_allele_name_species (a_dict):
    ## combine the allele name with species to form a new dictionary, so that the dict is not nested, 
    ###which is convenient to write fasta files
    
    ## Also label fasta title with H/L

    result={}
    for species,a_info in a_dict.items():
        for a_name,seq in a_info.items():
            chain_type="L"
            if a_name[0:3]=="IGH":
                chain_type="H"
            result[f"{a_name}|{species}|{chain_type}"]=seq
    return result

In [28]:
tc_unique_cref=combine_allele_name_species (t_unique_cref)

In [29]:
tc_unique_cref

{'IGHA*01|Bos_taurus|H': Seq('XSETSPSIFPLSLGNNDPAGQVVIGCLVQGFFPSAPLSVTWNQNGDSVSVRNFP...VCY'),
 'IGHD*03|Bos_taurus|H': Seq('EGESHLRVFPLVSCVSSPSDESTVALGCLARDFVPNSVSFSWKFNNSTVSSERF...DTG'),
 'IGHE*01|Bos_taurus|H': Seq('XSIQAPSIYPLRLCCTEEARVRLGCLVKDYLPGSVTVTWDTVPLDGSTLTFPSI...SGK'),
 'IGHE*02|Bos_taurus|H': Seq('XSIQAPSIYPLRLCCTEEARVRLGCLVKDYLPGSVTVTWDTVPLDGSTLTFPSI...SGK'),
 'IGHG1*01|Bos_taurus|H': Seq('XSTTAPKVYPLSSCCGDKSSSTVTLGCLVSSYMPEPVTVTWNSGALKSGVHTFP...AGK'),
 'IGHG1*02|Bos_taurus|H': Seq('ASTTAPKVYPLSSCCGDKSSSTVTLGCLVSSYMPEPVTVTWNSGALKSGVHTFP...AGK'),
 'IGHG1*03|Bos_taurus|H': Seq('ASTTAPKVYPLSSCCGDKSSSTVTLGCLVSSYMPEPVTVTWNSGALKSGVHTFP...AGK'),
 'IGHG1*04|Bos_taurus|H': Seq('XSTTAPKVYPLSSCCGDKSSSTVTLGCLVSSYMPEPVTVTWNSGALKSGVHTFP...AGK'),
 'IGHG2*01|Bos_taurus|H': Seq('XSTTAPKVYPLSSCCGDKSSSTVTLGCLVSSYMPEPVTVTWNSGALKSGVHTFP...AGK'),
 'IGHG2*02|Bos_taurus|H': Seq('XSTTAPKVYPLSSCCGDKSSSGVTLGCLVSSYMPEPVTVTWNSGALKSGVHTFP...AGK'),
 'IGHG2*03|Bos_taurus|H': Seq('ASTTAPKVYPLASSCGDTSSSTV

In [30]:
tc_unique_vref=combine_allele_name_species (t_unique_vref)

In [31]:
len(tc_unique_cref),len(tc_unique_vref)

(347, 2102)

In [32]:
write_fasta_files(tc_unique_cref,"all_species_unique_C_alleles.fasta")
write_fasta_files(tc_unique_vref,"all_species_unique_V_alleles.fasta")

In [33]:
labled_tc_unique_cref={f"{k}|C":v for k,v in tc_unique_cref.items()}

In [34]:
labled_tc_unique_vref={f"{k}|V":v for k,v in tc_unique_vref.items()}

In [35]:
labled_tc_unique_cref

{'IGHA*01|Bos_taurus|H|C': Seq('XSETSPSIFPLSLGNNDPAGQVVIGCLVQGFFPSAPLSVTWNQNGDSVSVRNFP...VCY'),
 'IGHD*03|Bos_taurus|H|C': Seq('EGESHLRVFPLVSCVSSPSDESTVALGCLARDFVPNSVSFSWKFNNSTVSSERF...DTG'),
 'IGHE*01|Bos_taurus|H|C': Seq('XSIQAPSIYPLRLCCTEEARVRLGCLVKDYLPGSVTVTWDTVPLDGSTLTFPSI...SGK'),
 'IGHE*02|Bos_taurus|H|C': Seq('XSIQAPSIYPLRLCCTEEARVRLGCLVKDYLPGSVTVTWDTVPLDGSTLTFPSI...SGK'),
 'IGHG1*01|Bos_taurus|H|C': Seq('XSTTAPKVYPLSSCCGDKSSSTVTLGCLVSSYMPEPVTVTWNSGALKSGVHTFP...AGK'),
 'IGHG1*02|Bos_taurus|H|C': Seq('ASTTAPKVYPLSSCCGDKSSSTVTLGCLVSSYMPEPVTVTWNSGALKSGVHTFP...AGK'),
 'IGHG1*03|Bos_taurus|H|C': Seq('ASTTAPKVYPLSSCCGDKSSSTVTLGCLVSSYMPEPVTVTWNSGALKSGVHTFP...AGK'),
 'IGHG1*04|Bos_taurus|H|C': Seq('XSTTAPKVYPLSSCCGDKSSSTVTLGCLVSSYMPEPVTVTWNSGALKSGVHTFP...AGK'),
 'IGHG2*01|Bos_taurus|H|C': Seq('XSTTAPKVYPLSSCCGDKSSSTVTLGCLVSSYMPEPVTVTWNSGALKSGVHTFP...AGK'),
 'IGHG2*02|Bos_taurus|H|C': Seq('XSTTAPKVYPLSSCCGDKSSSGVTLGCLVSSYMPEPVTVTWNSGALKSGVHTFP...AGK'),
 'IGHG2*03|Bos_taurus|H|C': Seq('A

In [36]:
all_labled_unique_alleles=labled_tc_unique_cref
all_labled_unique_alleles.update(labled_tc_unique_vref)

In [37]:
len(all_labled_unique_alleles),len(labled_tc_unique_cref),len(labled_tc_unique_vref)

(2449, 2449, 2102)

In [38]:
write_fasta_files(all_labled_unique_alleles,"all_species_unique_alleles_HLVC.fasta")

## Check: For the same alleles belong to the same species, if belongs to different individuals

In [108]:
def extract_all_fasta_titles (fn):
    results=[]
    for record in SeqIO.parse(fn,"fasta"):
        title_lst=record.description.split("|")
        
        results.append(title_lst)
    return pd.DataFrame(results,columns=["IMGT_accession","allele_name","species","functionality","domains","st_end_position","nt_length","codon_start","9","10","11","aa_length","13","partial","15","16"]).drop(columns=["16"])

In [110]:
fasta_titles=extract_all_fasta_titles ("all_species_ref_seq.fasta")

In [111]:
fasta_titles.columns

Index(['IMGT_accession', 'allele_name', 'species', 'functionality', 'domains',
       'st_end_position', 'nt_length', 'codon_start', '9', '10', '11',
       'aa_length', '13', 'partial', '15'],
      dtype='object')

In [112]:
fasta_titles["pure_species"]=[i.split("_")[0] for i in fasta_titles["species"]]

In [113]:
fasta_titles

Unnamed: 0,IMGT_accession,allele_name,species,functionality,domains,st_end_position,nt_length,codon_start,9,10,11,aa_length,13,partial,15,pure_species
0,KT723008,IGHA*01,Bos taurus_Holstein,F,CH1,"n,652007..652311",306 nt,1,+1,-1,,102 AA,102+0=102,,,Bos taurus
1,KT723008,IGHA*01,Bos taurus_Holstein,F,H-CH2,"g,652493..652821",330 nt,1,+1,-1,,110 AA,110+0=110,,,Bos taurus
2,KT723008,IGHA*01,Bos taurus_Holstein,F,CH3-CHS,"g,653011..653402",393 nt,1,+1,,,131 AA,131+0=131,,,Bos taurus
3,KT723008,IGHA*01,Bos taurus_Holstein,F,M,"g,655709..655893",186 nt,1,+1,,,62 AA,62+0=62,,,Bos taurus
4,KT723008,IGHD*02,Bos taurus_Holstein,ORF,CH1,"n,525572..525894",324 nt,1,+1,-1,,108 AA,108+0=108,,,Bos taurus
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12377,AM939772,IGHV4S5*01,Vicugna pacos,F,V-REGION,1..291,291 nt,1,,,,97 AA,97+0=97,partial in 3',,Vicugna pacos
12378,AM939773,IGHV4S6*01,Vicugna pacos,F,V-REGION,1..291,291 nt,1,,,,97 AA,97+0=97,partial in 3',,Vicugna pacos
12379,AM939704,IGHV4S7*01,Vicugna pacos,F,V-REGION,1..288,288 nt,1,,,,96 AA,96+0=96,partial in 3',,Vicugna pacos
12380,AM939705,IGHV4S8*01,Vicugna pacos,F,V-REGION,1..288,288 nt,1,,,,96 AA,96+0=96,partial in 3',,Vicugna pacos


In [119]:
n_fasta_titles=fasta_titles.loc[(fasta_titles["functionality"]=="F") | (fasta_titles["functionality"]=="(F)")]

In [131]:
f_fasta_titles1=n_fasta_titles.loc[map(lambda x:("M" not in x) and ("-LIKE" not in x), n_fasta_titles["domains"])]

In [132]:
f_fasta_titles=f_fasta_titles1.loc[map(lambda x: x[0:2]=="IG",f_fasta_titles1["allele_name"])]

In [133]:
len(f_fasta_titles1),len(f_fasta_titles)

(8936, 4679)

In [134]:
for name, group in f_fasta_titles.groupby(["allele_name","pure_species"]):
    if len(set(group["species"])) != 1:
        print (group)

In [136]:
set(f_fasta_titles["domains"])

{'C-REGION',
 'CH1',
 'CH10',
 'CH2',
 'CH2D',
 'CH2D2',
 'CH2D3',
 'CH3',
 'CH3-CHS',
 'CH3D',
 'CH3D2',
 'CH3D3',
 'CH4',
 'CH4-CHS',
 'CH4D',
 'CH4D2',
 'CH4D3',
 'CH5',
 'CH6',
 'CH7',
 'CH7-CHS',
 'CH8',
 'CH9',
 'CHS',
 'CHX',
 'D-REGION',
 'H',
 'H-CH2',
 'H1',
 'H2',
 'H3',
 'H4',
 'J-REGION',
 'V-REGION'}

In [137]:
f_fasta_titles.loc[f_fasta_titles["domains"]=='C-REGION']

Unnamed: 0,IMGT_accession,allele_name,species,functionality,domains,st_end_position,nt_length,codon_start,9,10,11,aa_length,13,partial,15,pure_species
181,IMGT000047,IGKC*01,Bos taurus_Hereford,F,C-REGION,"n,203584..203906",324 nt,1,+1,,,108 AA,108+0=108,,,Bos taurus
199,IMGT000046,IGLC2*01,Bos taurus_Hereford,F,C-REGION,"g,459172..459488",318 nt,1,+1,,,106 AA,106+0=106,,,Bos taurus
200,IMGT000046,IGLC3*01,Bos taurus_Hereford,F,C-REGION,"g,465481..465797",318 nt,1,+1,,,106 AA,106+0=106,,,Bos taurus
201,IMGT000046,IGLC4*01,Bos taurus_Hereford,F,C-REGION,"g,471718..472034",318 nt,1,+1,,,106 AA,106+0=106,,,Bos taurus
202,IMGT000046,IGLC8*01,Bos taurus_Hereford,F,C-REGION,"g,506393..506709",318 nt,1,+1,,,106 AA,106+0=106,,,Bos taurus
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11496,AC111360,IGLC4*01,Rattus norvegicus_BN/SsNHsdMCW,F,C-REGION,"g,182307..182620",315 nt,1,+1,,,105 AA,105+0=105,,,Rattus norvegicus
12009,FP312898,IGKC*01,Sus scrofa,F,C-REGION,"h,132117..132442",327 nt,1,+1,,,109 AA,109+0=109,,,Sus scrofa
12010,CU694848,IGKC*02,Sus scrofa,F,C-REGION,"h,28072..28397",327 nt,1,+1,,,109 AA,109+0=109,,,Sus scrofa
12045,CU467669,IGLC2*01,Sus scrofa,F,C-REGION,"g,2693..3009",318 nt,1,+1,,,106 AA,106+0=106,,,Sus scrofa


In [150]:
f_fasta_titles.loc[map(lambda x,y: ("partial" not in x) and (y=="V-REGION"), f_fasta_titles["partial"],f_fasta_titles["domains"])]

Unnamed: 0,IMGT_accession,allele_name,species,functionality,domains,st_end_position,nt_length,codon_start,9,10,11,aa_length,13,partial,15,pure_species
153,KT723008,IGHV1-10*01,Bos taurus_Holstein,F,V-REGION,241058..241350,293 nt,1,,,,97 AA,97+0=97,,,Bos taurus
155,KT723008,IGHV1-14*01,Bos taurus_Holstein,F,V-REGION,219454..219746,293 nt,1,,,,97 AA,97+0=97,,,Bos taurus
156,BosTau_1_chr21,IGHV1-14*02,Bos taurus_Hereford,F,V-REGION,90586..90878,293 nt,1,,,,97 AA,97+0=97,,,Bos taurus
158,KT723008,IGHV1-17*01,Bos taurus_Holstein,F,V-REGION,206029..206321,293 nt,1,,,,97 AA,97+0=97,,,Bos taurus
159,KT723008,IGHV1-20*01,Bos taurus_Holstein,F,V-REGION,192534..192826,293 nt,1,,,,97 AA,97+0=97,,,Bos taurus
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12305,AM773729,IGHV3-2*01,Vicugna pacos,F,V-REGION,27864..28159,296 nt,1,,,,98 AA,98+0=98,,rev-compl,Vicugna pacos
12306,AM773729,IGHV3-3*01,Vicugna pacos,F,V-REGION,685..980,296 nt,1,,,,98 AA,98+0=98,,,Vicugna pacos
12307,AM773548,IGHV3S1*01,Vicugna pacos,F,V-REGION,6852..7147,296 nt,1,,,,98 AA,98+0=98,,,Vicugna pacos
12351,AM773548,IGHV3S53*01,Vicugna pacos,F,V-REGION,33845..34137,293 nt,1,,,,97 AA,97+0=97,,,Vicugna pacos


In [147]:
set(f_fasta_titles["partial"])

{' ', "partial in 3'", "partial in 5'", "partial in 5' and in 3' "}