# Converting NIH TCR data into usable format
### Ben Bekey

#### Using pandas dataframe, selected amino acid sequence, separated into columns, cleaned up data

In [1]:
import pandas as pd
import numpy as np

In [2]:
#df_bcc1 = pd.read_excel('NIHMS1531727-supplement-2.xlsx', sheet_name = 'SuppTable3', skiprows = 3)
#df_bcc1.head()

In [3]:
# convert TCR data file to pandas dataframe, use column with amino acids
df = pd.read_csv('GSE123813_bcc_tcr.txt', sep = '\t', usecols = ['cdr3s_aa'] )
df

Unnamed: 0,cdr3s_aa
bcc.su001.pre.tcell_AAACCTGCAGATCGGA,TRB:CASRLAGGLQETQYF
bcc.su001.pre.tcell_AAACGGGTCATAGCAC,TRA:CAETILYSSASKIIF;TRB:CAWTTPGTSNSPLHF
bcc.su001.pre.tcell_AAAGATGAGACAGGCT,TRA:CIVSLSLVIYNQGGKLIF;TRB:CASSSSWEGSPGEQYF
bcc.su001.pre.tcell_AAAGATGCACAAGCCC,TRA:CAVERNTGGFKTIF;TRB:CASSQLGNGNQPQHF
bcc.su001.pre.tcell_AAAGATGTCTGAGTGT,TRA:CAGGNYGGATNKLIF;TRB:CASSLPGARVAFF
...,...
bcc.su012.post.tcell_TTTGCGCGTCTTTCAT,TRB:CASSLRQGAGSNQPQHF
bcc.su012.post.tcell_TTTGGTTCAGCCAGAA,TRA:CAASENYKLSF;TRB:CASSGEGRYGYTF
bcc.su012.post.tcell_TTTGTCAAGCACCGTC,TRA:CAVSTNDMRF;TRB:CASSPIGSYEQYF
bcc.su012.post.tcell_TTTGTCAAGTGAACGC,TRB:CASSFLTGVNEQYF


In [4]:
# separate amino acid column into TRA and TRB
df[['TRA', 'TRB']] = df['cdr3s_aa'].str.split(';', n=1, expand=True)
df

Unnamed: 0,cdr3s_aa,TRA,TRB
bcc.su001.pre.tcell_AAACCTGCAGATCGGA,TRB:CASRLAGGLQETQYF,TRB:CASRLAGGLQETQYF,
bcc.su001.pre.tcell_AAACGGGTCATAGCAC,TRA:CAETILYSSASKIIF;TRB:CAWTTPGTSNSPLHF,TRA:CAETILYSSASKIIF,TRB:CAWTTPGTSNSPLHF
bcc.su001.pre.tcell_AAAGATGAGACAGGCT,TRA:CIVSLSLVIYNQGGKLIF;TRB:CASSSSWEGSPGEQYF,TRA:CIVSLSLVIYNQGGKLIF,TRB:CASSSSWEGSPGEQYF
bcc.su001.pre.tcell_AAAGATGCACAAGCCC,TRA:CAVERNTGGFKTIF;TRB:CASSQLGNGNQPQHF,TRA:CAVERNTGGFKTIF,TRB:CASSQLGNGNQPQHF
bcc.su001.pre.tcell_AAAGATGTCTGAGTGT,TRA:CAGGNYGGATNKLIF;TRB:CASSLPGARVAFF,TRA:CAGGNYGGATNKLIF,TRB:CASSLPGARVAFF
...,...,...,...
bcc.su012.post.tcell_TTTGCGCGTCTTTCAT,TRB:CASSLRQGAGSNQPQHF,TRB:CASSLRQGAGSNQPQHF,
bcc.su012.post.tcell_TTTGGTTCAGCCAGAA,TRA:CAASENYKLSF;TRB:CASSGEGRYGYTF,TRA:CAASENYKLSF,TRB:CASSGEGRYGYTF
bcc.su012.post.tcell_TTTGTCAAGCACCGTC,TRA:CAVSTNDMRF;TRB:CASSPIGSYEQYF,TRA:CAVSTNDMRF,TRB:CASSPIGSYEQYF
bcc.su012.post.tcell_TTTGTCAAGTGAACGC,TRB:CASSFLTGVNEQYF,TRB:CASSFLTGVNEQYF,


In [5]:
# remove old amino acid column
del df['cdr3s_aa']
df

Unnamed: 0,TRA,TRB
bcc.su001.pre.tcell_AAACCTGCAGATCGGA,TRB:CASRLAGGLQETQYF,
bcc.su001.pre.tcell_AAACGGGTCATAGCAC,TRA:CAETILYSSASKIIF,TRB:CAWTTPGTSNSPLHF
bcc.su001.pre.tcell_AAAGATGAGACAGGCT,TRA:CIVSLSLVIYNQGGKLIF,TRB:CASSSSWEGSPGEQYF
bcc.su001.pre.tcell_AAAGATGCACAAGCCC,TRA:CAVERNTGGFKTIF,TRB:CASSQLGNGNQPQHF
bcc.su001.pre.tcell_AAAGATGTCTGAGTGT,TRA:CAGGNYGGATNKLIF,TRB:CASSLPGARVAFF
...,...,...
bcc.su012.post.tcell_TTTGCGCGTCTTTCAT,TRB:CASSLRQGAGSNQPQHF,
bcc.su012.post.tcell_TTTGGTTCAGCCAGAA,TRA:CAASENYKLSF,TRB:CASSGEGRYGYTF
bcc.su012.post.tcell_TTTGTCAAGCACCGTC,TRA:CAVSTNDMRF,TRB:CASSPIGSYEQYF
bcc.su012.post.tcell_TTTGTCAAGTGAACGC,TRB:CASSFLTGVNEQYF,


In [6]:
# Move data from TRA to TRB if col contains the string "TRB:"
df.loc[df['TRA'].str.contains('TRB:'), 'TRB'] = df['TRA']
df

Unnamed: 0,TRA,TRB
bcc.su001.pre.tcell_AAACCTGCAGATCGGA,TRB:CASRLAGGLQETQYF,TRB:CASRLAGGLQETQYF
bcc.su001.pre.tcell_AAACGGGTCATAGCAC,TRA:CAETILYSSASKIIF,TRB:CAWTTPGTSNSPLHF
bcc.su001.pre.tcell_AAAGATGAGACAGGCT,TRA:CIVSLSLVIYNQGGKLIF,TRB:CASSSSWEGSPGEQYF
bcc.su001.pre.tcell_AAAGATGCACAAGCCC,TRA:CAVERNTGGFKTIF,TRB:CASSQLGNGNQPQHF
bcc.su001.pre.tcell_AAAGATGTCTGAGTGT,TRA:CAGGNYGGATNKLIF,TRB:CASSLPGARVAFF
...,...,...
bcc.su012.post.tcell_TTTGCGCGTCTTTCAT,TRB:CASSLRQGAGSNQPQHF,TRB:CASSLRQGAGSNQPQHF
bcc.su012.post.tcell_TTTGGTTCAGCCAGAA,TRA:CAASENYKLSF,TRB:CASSGEGRYGYTF
bcc.su012.post.tcell_TTTGTCAAGCACCGTC,TRA:CAVSTNDMRF,TRB:CASSPIGSYEQYF
bcc.su012.post.tcell_TTTGTCAAGTGAACGC,TRB:CASSFLTGVNEQYF,TRB:CASSFLTGVNEQYF


In [7]:
# clear moved data from TRA
df.loc[df['TRA'].str.contains('TRB:'), 'TRA'] = ''
df

Unnamed: 0,TRA,TRB
bcc.su001.pre.tcell_AAACCTGCAGATCGGA,,TRB:CASRLAGGLQETQYF
bcc.su001.pre.tcell_AAACGGGTCATAGCAC,TRA:CAETILYSSASKIIF,TRB:CAWTTPGTSNSPLHF
bcc.su001.pre.tcell_AAAGATGAGACAGGCT,TRA:CIVSLSLVIYNQGGKLIF,TRB:CASSSSWEGSPGEQYF
bcc.su001.pre.tcell_AAAGATGCACAAGCCC,TRA:CAVERNTGGFKTIF,TRB:CASSQLGNGNQPQHF
bcc.su001.pre.tcell_AAAGATGTCTGAGTGT,TRA:CAGGNYGGATNKLIF,TRB:CASSLPGARVAFF
...,...,...
bcc.su012.post.tcell_TTTGCGCGTCTTTCAT,,TRB:CASSLRQGAGSNQPQHF
bcc.su012.post.tcell_TTTGGTTCAGCCAGAA,TRA:CAASENYKLSF,TRB:CASSGEGRYGYTF
bcc.su012.post.tcell_TTTGTCAAGCACCGTC,TRA:CAVSTNDMRF,TRB:CASSPIGSYEQYF
bcc.su012.post.tcell_TTTGTCAAGTGAACGC,,TRB:CASSFLTGVNEQYF


In [8]:
# remove 'TRA:' from entries
df['TRA'] = df['TRA'].str.replace(r'TRA:', '')
df

Unnamed: 0,TRA,TRB
bcc.su001.pre.tcell_AAACCTGCAGATCGGA,,TRB:CASRLAGGLQETQYF
bcc.su001.pre.tcell_AAACGGGTCATAGCAC,CAETILYSSASKIIF,TRB:CAWTTPGTSNSPLHF
bcc.su001.pre.tcell_AAAGATGAGACAGGCT,CIVSLSLVIYNQGGKLIF,TRB:CASSSSWEGSPGEQYF
bcc.su001.pre.tcell_AAAGATGCACAAGCCC,CAVERNTGGFKTIF,TRB:CASSQLGNGNQPQHF
bcc.su001.pre.tcell_AAAGATGTCTGAGTGT,CAGGNYGGATNKLIF,TRB:CASSLPGARVAFF
...,...,...
bcc.su012.post.tcell_TTTGCGCGTCTTTCAT,,TRB:CASSLRQGAGSNQPQHF
bcc.su012.post.tcell_TTTGGTTCAGCCAGAA,CAASENYKLSF,TRB:CASSGEGRYGYTF
bcc.su012.post.tcell_TTTGTCAAGCACCGTC,CAVSTNDMRF,TRB:CASSPIGSYEQYF
bcc.su012.post.tcell_TTTGTCAAGTGAACGC,,TRB:CASSFLTGVNEQYF


In [9]:
# remove 'TRB:' from entries
df['TRB'] = df['TRB'].str.replace(r'TRB:', '')
df

Unnamed: 0,TRA,TRB
bcc.su001.pre.tcell_AAACCTGCAGATCGGA,,CASRLAGGLQETQYF
bcc.su001.pre.tcell_AAACGGGTCATAGCAC,CAETILYSSASKIIF,CAWTTPGTSNSPLHF
bcc.su001.pre.tcell_AAAGATGAGACAGGCT,CIVSLSLVIYNQGGKLIF,CASSSSWEGSPGEQYF
bcc.su001.pre.tcell_AAAGATGCACAAGCCC,CAVERNTGGFKTIF,CASSQLGNGNQPQHF
bcc.su001.pre.tcell_AAAGATGTCTGAGTGT,CAGGNYGGATNKLIF,CASSLPGARVAFF
...,...,...
bcc.su012.post.tcell_TTTGCGCGTCTTTCAT,,CASSLRQGAGSNQPQHF
bcc.su012.post.tcell_TTTGGTTCAGCCAGAA,CAASENYKLSF,CASSGEGRYGYTF
bcc.su012.post.tcell_TTTGTCAAGCACCGTC,CAVSTNDMRF,CASSPIGSYEQYF
bcc.su012.post.tcell_TTTGTCAAGTGAACGC,,CASSFLTGVNEQYF


In [11]:
df.describe()
# all elements are strings, so Python only returns the top entry

Unnamed: 0,TRA,TRB
count,28408.0,27170
unique,10759.0,14420
top,,CATKGYQAGELFF
freq,7796.0,390


In [12]:
# count repeated entries of TRA, ranked from most to least
df['TRA'].value_counts()

TRA
                     7796
CAARGGNQGGKLIF        318
CAARTSGNTPLVF         104
CALSVMDSSYKLIF         83
CAMAGGGGSTLGRLYF       75
                     ... 
CAASRGGNNRKLIW          1
CVVSATTGNQFYF           1
CAASDGYSGAGSYQLTF       1
CAASARTGNQFYF           1
CAFIALNNNDMRF           1
Name: count, Length: 10759, dtype: int64

In [13]:
# same for TRB
df['TRB'].value_counts()

TRB
CATKGYQAGELFF       390
CASSLAGTSPSNEQFF    157
CASSIDWTGYLDTQYF    102
CASSLNPGADTQYF       97
CASSGEVTGGPYEQYF     95
                   ... 
CASGNLDEQFF           1
CSARSTGTPYGYTF        1
CASSQAPGGYGYTF        1
CSVDSSSQETQYF         1
CASSLQQGYEQYF         1
Name: count, Length: 14420, dtype: int64

In [18]:
#count unique combinations of TRA and TRB
df.groupby(['TRA', 'TRB']).size().reset_index().rename(columns={0:"count"})

Unnamed: 0,TRA,TRB,count
0,,CAAATSGSGTDTQYF,1
1,,CAAGGQTSGYTF,1
2,,CAAGSGLSSPGELFF,1
3,,CAAISGANVLTF,1
4,,CAALAGGGETQYF,1
...,...,...,...
16292,CVYTGGFKTIF,CAWRAGSSYEQYF,1
16293,CYFSGGYNKLIF,CASSTGKTGGNEQFF,1
16294,CYTGNQFYF,CASNPWGVESPLHF;CAWSVKGAGGNQPQHF,1
16295,CYTGNQFYF,CAWSVKGAGGNQPQHF,1
