__Code for parsing a 10x output file containing clonotype information, then subsetting and formatting it for use with the TCRdist software__

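1. Import packages and load the file
2. Filter out non-cells, low-confidence contigs, contigs without a CDR3 nucleotide sequence, contigs assigned to multiple chains, and non-productive contigs
3. Generate a new dataset containing the corresponding TRA and TRB sequences for the same cell barcode
4. Rename the columns of the dataset to match the format expected by TCRdist
5. Add columns for epitope, subject, a_quals and b_quals
6. Reorder columns (note: the cells below keep the column order produced by steps 4-5)
7. Write the table to a tab-separated file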


In [1]:
import pandas as pd

In [2]:
# Read the 10x contig annotation table and preview its first rows.
contig_annotations_file = "all_contig_annotations.csv"
data = pd.read_csv(contig_annotations_file)
data.head()


Unnamed: 0,barcode,is_cell,contig_id,high_confidence,length,chain,v_gene,d_gene,j_gene,c_gene,full_length,productive,cdr3,cdr3_nt,reads,umis,raw_clonotype_id,raw_consensus_id
0,AAACCTGCAGGGTACA-1,True,AAACCTGCAGGGTACA-1_contig_1,True,509,TRB,TRBV6-2,,TRBJ2-4,TRBC2,True,True,CASSYEPLRGGDIQYF,TGTGCCAGCAGTTACGAGCCTCTGAGAGGCGGGGACATTCAGTACTTC,5340,1,clonotype9,clonotype9_consensus_2
1,AAACCTGCAGGGTACA-1,True,AAACCTGCAGGGTACA-1_contig_2,True,456,TRA,TRAV13-1,,TRAJ41,TRAC,True,True,CAASNSGYALNF,TGTGCAGCGTCAAATTCCGGGTATGCACTCAACTTC,7964,2,clonotype9,clonotype9_consensus_1
2,AAACCTGGTCTACCTC-1,True,AAACCTGGTCTACCTC-1_contig_1,True,502,TRB,TRBV6-2,,TRBJ2-1,TRBC2,True,True,CASGGQLRYNEQFF,TGTGCCAGCGGGGGACAGCTAAGATACAATGAGCAGTTCTTC,24972,6,clonotype108,clonotype108_consensus_1
3,AAACCTGGTCTACCTC-1,True,AAACCTGGTCTACCTC-1_contig_2,True,485,TRA,TRAV38-2/DV8,,TRAJ43,TRAC,True,True,CAYRSYNDMRF,TGTGCTTATAGGAGTTACAATGACATGCGCTTT,5704,1,clonotype108,clonotype108_consensus_2
4,AAACGGGAGATGCGAC-1,True,AAACGGGAGATGCGAC-1_contig_1,True,512,TRB,TRBV7-9,,TRBJ2-3,TRBC2,True,True,CASSLTGGARKDTQYF,TGTGCCAGCAGCTTAACAGGGGGCGCCAGGAAAGATACGCAGTATTTT,28742,5,clonotype109,clonotype109_consensus_1


In [3]:
# (rows, columns) of the contig table as loaded, before any filtering.
data.shape

(4415, 18)

In [4]:
def _flag_equals(column, expected):
    """Return a boolean mask of the rows where a True/False flag column equals `expected`.

    Values are compared by their lower-cased text, so the mask is the same
    whether pandas parsed the column as real booleans or left it as strings
    (for example when the column also holds a "None" placeholder).
    Missing values never match either True or False.
    """
    return column.astype(str).str.lower() == str(expected).lower()


# A contig only has a CDR3 call if the value is actually present. Depending on
# the pandas version, a "None" placeholder in the CSV is either kept as the
# text "None" or parsed as NaN by read_csv, so both cases are checked.
has_cdr3 = data.cdr3_nt.notna() & (data.cdr3_nt != "None")

data = data[
    _flag_equals(data.is_cell, True)              # remove non-cells
    & _flag_equals(data.high_confidence, True)    # remove contigs with low confidence call
    & has_cdr3                                    # remove nt sequences with no assignment
    & (data.chain != "Multi")                     # remove contigs assigned to multiple chains
    & ~_flag_equals(data.productive, False)       # remove non-productive calls by cellranger
]
        

In [5]:
# Preview the first rows of `data` after the filtering cell has been applied.
data.head()

Unnamed: 0,barcode,is_cell,contig_id,high_confidence,length,chain,v_gene,d_gene,j_gene,c_gene,full_length,productive,cdr3,cdr3_nt,reads,umis,raw_clonotype_id,raw_consensus_id
0,AAACCTGCAGGGTACA-1,True,AAACCTGCAGGGTACA-1_contig_1,True,509,TRB,TRBV6-2,,TRBJ2-4,TRBC2,True,True,CASSYEPLRGGDIQYF,TGTGCCAGCAGTTACGAGCCTCTGAGAGGCGGGGACATTCAGTACTTC,5340,1,clonotype9,clonotype9_consensus_2
1,AAACCTGCAGGGTACA-1,True,AAACCTGCAGGGTACA-1_contig_2,True,456,TRA,TRAV13-1,,TRAJ41,TRAC,True,True,CAASNSGYALNF,TGTGCAGCGTCAAATTCCGGGTATGCACTCAACTTC,7964,2,clonotype9,clonotype9_consensus_1
2,AAACCTGGTCTACCTC-1,True,AAACCTGGTCTACCTC-1_contig_1,True,502,TRB,TRBV6-2,,TRBJ2-1,TRBC2,True,True,CASGGQLRYNEQFF,TGTGCCAGCGGGGGACAGCTAAGATACAATGAGCAGTTCTTC,24972,6,clonotype108,clonotype108_consensus_1
3,AAACCTGGTCTACCTC-1,True,AAACCTGGTCTACCTC-1_contig_2,True,485,TRA,TRAV38-2/DV8,,TRAJ43,TRAC,True,True,CAYRSYNDMRF,TGTGCTTATAGGAGTTACAATGACATGCGCTTT,5704,1,clonotype108,clonotype108_consensus_2
4,AAACGGGAGATGCGAC-1,True,AAACGGGAGATGCGAC-1_contig_1,True,512,TRB,TRBV7-9,,TRBJ2-3,TRBC2,True,True,CASSLTGGARKDTQYF,TGTGCCAGCAGCTTAACAGGGGGCGCCAGGAAAGATACGCAGTATTTT,28742,5,clonotype109,clonotype109_consensus_1


In [6]:
# (rows, columns) of `data` at this point, for comparison with the shape right after loading.
data.shape

(1867, 18)

In [7]:
# Split the contig table by chain type: one frame for alpha (TRA) contigs
# and one for beta (TRB) contigs.
is_alpha = data['chain'] == 'TRA'
is_beta = data['chain'] == 'TRB'
tcra = data[is_alpha]
tcrb = data[is_beta]

In [8]:
# (rows, columns) of the TRA-only subset.
tcra.shape 

(532, 18)

In [9]:
# (rows, columns) of the TRB-only subset.
tcrb.shape 

(1335, 18)

In [10]:
# List the column names available in the TRB subset.
tcrb.columns

Index(['barcode', 'is_cell', 'contig_id', 'high_confidence', 'length', 'chain',
       'v_gene', 'd_gene', 'j_gene', 'c_gene', 'full_length', 'productive',
       'cdr3', 'cdr3_nt', 'reads', 'umis', 'raw_clonotype_id',
       'raw_consensus_id'],
      dtype='object')

In [11]:
# Keep only the cell barcode, the chain label and the CDR3 nucleotide sequence
# in both per-chain tables.
pairing_columns = ['barcode', 'chain', 'cdr3_nt']
tcra = tcra[pairing_columns]
tcrb = tcrb[pairing_columns]

In [12]:
# Pair alpha and beta contigs that share a cell barcode.
# This is pandas' default inner join, so barcodes present in only one of the
# two tables are dropped, and a barcode with several contigs in either table
# yields one row per alpha/beta combination. Overlapping column names from
# tcra get the default "_x" suffix and those from tcrb get "_y".
tcr = tcra.merge(tcrb, on='barcode')

In [13]:
# Map each merged column that is kept to the name used in the output table:
# "_x" columns come from the alpha table and "_y" columns from the beta table.
output_names = {
    'barcode': 'id',
    'cdr3_nt_x': 'a_nucseq',
    'cdr3_nt_y': 'b_nucseq',
}
tcr = tcr[list(output_names.keys())]
tcr.columns = list(output_names.values())

In [14]:
# Append four columns that hold the same constant string in every row.
tcr = tcr.assign(
    epitope='NA',
    subject='human',
    a_quals='30',
    b_quals='30',
)

                               

In [15]:
# Display the assembled table; pandas abbreviates long frames and long strings when rendering.
tcr

Unnamed: 0,id,a_nucseq,b_nucseq,epitope,subject,a_quals,b_quals
0,AAACCTGCAGGGTACA-1,TGTGCAGCGTCAAATTCCGGGTATGCACTCAACTTC,TGTGCCAGCAGTTACGAGCCTCTGAGAGGCGGGGACATTCAGTACTTC,,human,30,30
1,AAACCTGGTCTACCTC-1,TGTGCTTATAGGAGTTACAATGACATGCGCTTT,TGTGCCAGCGGGGGACAGCTAAGATACAATGAGCAGTTCTTC,,human,30,30
2,AAACGGGGTCTCTTTA-1,TGTGCTGTGAGAGCCCTGCCCTTCCATACCGGCACTGCCAGTAAAC...,TGTGCCAGCAGCGCCGGGACAGGGGGTGAAAAACTGTTTTTT,,human,30,30
3,AAACGGGTCATGTCTT-1,TGTGCAGCAAGCGTAGACAACTTCAACAAATTTTACTTT,TGTGCCAGCAGTTACCCTAGCACAGATACGCAGTATTTT,,human,30,30
4,AAACGGGTCTGTCTCG-1,TGTGCTGTGAATAGTGGAGGTAGCAACTATAAACTGACATTT,TGTGCCAGCAGCTTAGCTGCGGGCGGGCTCTACAATGAGCAGTTCTTC,,human,30,30
...,...,...,...,...,...,...,...
498,TTGGCAAGTGCCTGGT-1,TGTGTGGTGAACGCCATCTTTGGAAATGAGAAATTAACCTTT,TGTGCCAGCAGTCCCACTCGGGCAGGGGCCTACAATGAGCAGTTCTTC,,human,30,30
499,TTGGCAAGTGCCTGGT-1,TGTGTGGTGAACGCCATCTTTGGAAATGAGAAATTAACCTTT,TGTGCCAGCACCGCCGGGACAGGGGGCTACGAGCAGTACTTC,,human,30,30
500,TTGTAGGGTACTCTCC-1,TGCATCGTCAAGGCCTCTGGGGCTGGGAGTTACCAACTCACTTTC,TGTGCCAGTAGTCCGCGGGGTCCAAGCCCTGGGGCCAACGTCCTGA...,,human,30,30
501,TTGTAGGGTCGAATCT-1,TGTGCCCGATACAACTTCAACAAATTTTACTTT,TGTGCCAGCAGCCAAGATCCAGGGGCTAGCGGGTGGGATGAGCAGT...,,human,30,30


In [16]:
# Save the table as tab-separated text, with a header row and without the
# DataFrame index.
output_file = "10xToTCRdist_191010.tsv"
tcr.to_csv(output_file, sep="\t", header=True, index=False)

Load this tsv file into TCRdist version one! 