In [164]:
import pandas as pd

## Import data

In [165]:
# Input table with the clones. Swap the active assignment below to switch
# between the small (3 MB) and the large (300 MB) data set.
#clones_file = "cdr3-clones-AB-ADA-IGH_HUMAN-after-reassignment.csv" # 3MB file
clones_file = "cdr3-clones-AB-RBF-TRB_HUMAN-after-reassignment.csv" # 300 MB file

# Despite the .csv extension the fields are separated by tabs.
df = pd.read_csv(
    clones_file,
    delimiter="\t",
)

In [166]:
# Preview the first five rows of the loaded table
df.head(n=5)

Unnamed: 0,Sample,MID,cdr3pep,freq,uniq_umis,V_sub,J_sub,read_perc,umi_perc
0,AB-RBF002-Tu_S1,ACGTACGT,CASSAVAGGYNEQFFG,574,517,TRBV6-2+TRBV6-3,"TRBJ2-1,TRBJ2-5,TRBJ2-7",0.876577,0.992056
1,AB-RBF002-Tu_S1,ACGTACGT,CASSLVTDTQYFG,419,333,TRBV11-2,TRBJ2-3,0.63987,0.638984
2,AB-RBF002-Tu_S1,ACGTACGT,CSGARGEGTEAFFG,327,216,TRBV29-1,TRBJ1-1,0.499374,0.414476
3,AB-RBF002-Tu_S1,ACGTACGT,CASSTSPRGYTFG,311,247,TRBV19,TRBJ1-2,0.47494,0.473961
4,AB-RBF002-Tu_S1,ACGTACGT,CSVEEGGYTFG,297,189,TRBV29-1,TRBJ1-2,0.45356,0.362666


## Summary of the content

In [167]:
# Report the number of rows, the number of distinct samples and the number
# of distinct values in each of the cdr3pep, V_sub and J_sub columns.
print("Table length:", df.shape[0])
print("Nr of samples:", df.loc[:, "Sample"].nunique())
print(
    "Nr of unique cdr3pep, V and J:",
    df.loc[:, ["cdr3pep", "V_sub", "J_sub"]].nunique(),
)

Table length: 3501186
Nr of samples: 140
Nr of unique cdr3pep, V and J: cdr3pep    2645657
V_sub         1663
J_sub          537
dtype: int64


## Which CDR3s are shared?

In [168]:
# Group by cdr3pep and count the number of *distinct* samples per peptide.
# nunique() is used instead of count() because count() tallies rows: a peptide
# that is listed more than once within a single sample (e.g. with different
# V/J assignments) would otherwise be counted several times for that sample.
grouped = df.groupby('cdr3pep')['Sample'].nunique()
#grouped.head()

In [169]:
# Select entries that occur in more than one sample and sort on nr of samples
shared_cdr3 = grouped[grouped > 1]
shared_cdr3 = shared_cdr3.sort_values(ascending=False)
shared_cdr3.head()

cdr3pep
CASSFG           140
CASSLFG          134
CASSPFG          124
CASSPSTDTQYFG    121
CASSPQETQYFG     120
Name: Sample, dtype: int64

## Which clones are shared?

In [170]:
# Group by cdr3pep, v and j. Count nr of samples, sort on nr_samples
clones_in_samples = df.groupby(['cdr3pep', 'V_sub', 'J_sub'])['Sample'].apply(list)
clones_in_samples = pd.DataFrame(clones_in_samples)
clones_in_samples["nr_samples"] = clones_in_samples['Sample'].apply(len)
clones_in_samples = clones_in_samples.sort_values('nr_samples', ascending=False)

In [171]:
# Each row of clones_in_samples is one (cdr3pep, V_sub, J_sub) group,
# so the row count is the number of unique clones.
total_unique_clones = clones_in_samples.shape[0]
print('Total unique clones:', total_unique_clones)
clones_in_samples.head()

Total unique clones: 3013705


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Sample,nr_samples
cdr3pep,V_sub,J_sub,Unnamed: 3_level_1,Unnamed: 4_level_1
CASSPQETQYFG,TRBV18,TRBJ2-5,"[AB-RBF002-Tu_S1, AB-RBF003-Tu_S2, AB-RBF007-T...",86
CASSPPSTDTQYFG,TRBV18,TRBJ2-3,"[AB-RBF002-Tu_S1, AB-RBF003-Tu_S2, AB-RBF007-T...",85
CSVGGNTEAFFG,TRBV29-1,TRBJ1-1,"[AB-RBF002-Tu_S1, AB-RBF009-Tu_S4, AB-RBF010-T...",76
CSVGGSNQPQHFG,TRBV29-1,TRBJ1-5,"[AB-RBF014-Tu_S6, AB-RBF017-Tu_S7, AB-RBF018-T...",70
CASSPGLNTEAFFG,TRBV18,TRBJ1-1,"[AB-RBF002-Tu_S1, AB-RBF007-Tu_S3, AB-RBF010-T...",70


In [172]:
# Select clones that occur in > 1 sample and sort on nr of samples
shared_clones = clones_in_samples.loc[clones_in_samples['nr_samples'] > 1]
shared_clones = shared_clones.sort_values('nr_samples', ascending=False)
total_shared_clones = len(shared_clones)
print('Shared clones:', total_shared_clones)
print('Perc shared clones:', 100 * total_shared_clones / total_unique_clones)
shared_clones.head()

Shared clones: 259274
Perc shared clones: 8.603164543311307


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Sample,nr_samples
cdr3pep,V_sub,J_sub,Unnamed: 3_level_1,Unnamed: 4_level_1
CASSPQETQYFG,TRBV18,TRBJ2-5,"[AB-RBF002-Tu_S1, AB-RBF003-Tu_S2, AB-RBF007-T...",86
CASSPPSTDTQYFG,TRBV18,TRBJ2-3,"[AB-RBF002-Tu_S1, AB-RBF003-Tu_S2, AB-RBF007-T...",85
CSVGGNTEAFFG,TRBV29-1,TRBJ1-1,"[AB-RBF002-Tu_S1, AB-RBF009-Tu_S4, AB-RBF010-T...",76
CSVGGSNQPQHFG,TRBV29-1,TRBJ1-5,"[AB-RBF014-Tu_S6, AB-RBF017-Tu_S7, AB-RBF018-T...",70
CASSPGLNTEAFFG,TRBV18,TRBJ1-1,"[AB-RBF002-Tu_S1, AB-RBF007-Tu_S3, AB-RBF010-T...",70


In [173]:
# Reset indices
clones_in_samples = clones_in_samples.reset_index()
shared_clones = shared_clones.reset_index()
shared_clones.head()

Unnamed: 0,cdr3pep,V_sub,J_sub,Sample,nr_samples
0,CASSPQETQYFG,TRBV18,TRBJ2-5,"[AB-RBF002-Tu_S1, AB-RBF003-Tu_S2, AB-RBF007-T...",86
1,CASSPPSTDTQYFG,TRBV18,TRBJ2-3,"[AB-RBF002-Tu_S1, AB-RBF003-Tu_S2, AB-RBF007-T...",85
2,CSVGGNTEAFFG,TRBV29-1,TRBJ1-1,"[AB-RBF002-Tu_S1, AB-RBF009-Tu_S4, AB-RBF010-T...",76
3,CSVGGSNQPQHFG,TRBV29-1,TRBJ1-5,"[AB-RBF014-Tu_S6, AB-RBF017-Tu_S7, AB-RBF018-T...",70
4,CASSPGLNTEAFFG,TRBV18,TRBJ1-1,"[AB-RBF002-Tu_S1, AB-RBF007-Tu_S3, AB-RBF010-T...",70


## Show V and J gene usage

In [174]:
# V J usage in all clones
v_j_usage_all = clones_in_samples.groupby(['V_sub', 'J_sub'])['cdr3pep'].count()
v_j_usage_all = v_j_usage_all.sort_values(ascending=False)
v_j_usage_all.head(25)

V_sub     J_sub  
TRBV29-1  TRBJ2-1    93302
          TRBJ1-1    91354
          TRBJ1-2    68546
          TRBJ2-3    64754
          TRBJ2-7    61844
TRBV19    TRBJ1-5    45512
TRBV29-1  TRBJ2-5    43544
TRBV19    TRBJ2-1    42754
TRBV18    TRBJ1-1    41951
          TRBJ2-1    35930
TRBV29-1  TRBJ2-2    33548
TRBV19    TRBJ1-2    32905
TRBV6-5   TRBJ2-1    32248
          TRBJ2-7    28117
          TRBJ1-1    28076
TRBV18    TRBJ2-7    27214
TRBV27    TRBJ2-7    25128
          TRBJ2-1    24831
TRBV6-5   TRBJ1-2    24086
TRBV27    TRBJ1-1    22940
TRBV18    TRBJ2-3    22654
TRBV29-1  TRBJ1-5    22162
TRBV18    TRBJ1-5    22011
          TRBJ1-2    21806
TRBV6-5   TRBJ2-3    21734
Name: cdr3pep, dtype: int64

In [175]:
# V J usage in shared clones
v_j_usage = shared_clones.groupby(['V_sub', 'J_sub'])['cdr3pep'].count()
v_j_usage = v_j_usage.sort_values(ascending=False)
v_j_usage.head(25)

V_sub     J_sub  
TRBV29-1  TRBJ1-1    13815
          TRBJ2-1    10435
          TRBJ1-2     8829
          TRBJ2-3     8785
          TRBJ2-7     8136
TRBV18    TRBJ1-1     6139
TRBV29-1  TRBJ2-5     5854
TRBV19    TRBJ1-5     4696
TRBV29-1  TRBJ2-2     4455
TRBV18    TRBJ2-1     4286
TRBV29-1  TRBJ1-5     4089
TRBV19    TRBJ2-1     3842
TRBV18    TRBJ2-7     3414
          TRBJ1-5     3354
          TRBJ2-3     3191
TRBV19    TRBJ1-2     2720
TRBV6-5   TRBJ1-1     2676
          TRBJ2-1     2619
          TRBJ2-7     2529
TRBV18    TRBJ1-2     2521
TRBV29-1  TRBJ1-4     2498
TRBV27    TRBJ1-1     2443
          TRBJ2-7     2297
          TRBJ1-5     2167
TRBV6-5   TRBJ1-5     2160
Name: cdr3pep, dtype: int64