# Persistent clonotype and sequence fraction subsampling

In this notebook we will calculate the percentage of shared clonotypes and sequences versus sampling depth. Subsample sizes range from 1,000 to 1,000,000 sequences per biological replicate.

The [`abutils`](https://www.github.com/briney/abutils) Python package is required for this notebook, and can be installed by running `pip install abutils`.

*NOTE: this notebook requires the use of the Unix command line tool `shuf`. Thus, it requires a Unix-based operating system to run correctly (MacOS and most flavors of Linux should be fine). Running this notebook on Windows 10 may be possible using the [Windows Subsystem for Linux](https://docs.microsoft.com/en-us/windows/wsl/about) but we have not tested this.*

In [10]:
from __future__ import print_function, division

from collections import Counter
import os
import subprocess as sp
import sys
import tempfile

import scipy.stats as st
import numpy as np

import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

from abutils.utils.pipeline import make_dir, list_files

## Make directories

In [15]:
# Create one output directory per analysis (clonotypes, sequences).
for output_dir in ('./data/persistent-clonotypes/', './data/persistent-sequences/'):
    make_dir(output_dir)

## Subsampling clonotypes

In [6]:
# Pooled clonotype files for donor D103 at the two timepoints being compared.
# Each line of a file is treated as one clonotype.
# NOTE(review): the file names suggest each pool is deduplicated; the
# "count == 2" persistence test below relies on that -- confirm upstream.
infile1='./data/dedup_year-merged_vj-aa/D103-2016/D103-2016_dedup_pool_vj-aa.txt'
infile2='./data/dedup_year-merged_vj-aa/D103-2021/D103-2021_dedup_pool_vj-aa.txt'

# Maps str(subsample size) -> list holding one persistent-clonotype count per replicate.
persistent_number={}
for subsample_count in [1000, 10000, 100000, 200000, 400000, 600000, 800000, 1000000]:
    print(subsample_count)
    persistent_number[str(subsample_count)]=[]
    for i in range(10):
        # Draw an independent random subsample from each timepoint with `shuf`.
        # `shuf -n N` emits at most N lines, so a file shorter than
        # subsample_count yields a smaller sample.
        # An argument list (no shell) passes the path to shuf verbatim, and
        # check=True raises CalledProcessError if shuf is missing or fails
        # instead of silently counting an empty sample.
        shuf1 = sp.run(['shuf', '-n', str(subsample_count), infile1],
                       stdout=sp.PIPE, stderr=sp.PIPE, encoding='utf8', check=True)
        shuf2 = sp.run(['shuf', '-n', str(subsample_count), infile2],
                       stdout=sp.PIPE, stderr=sp.PIPE, encoding='utf8', check=True)

        # Drop the empty string that follows the final newline.
        tp1_clonotype_subsample=shuf1.stdout.split('\n')[:-1]
        tp2_clonotype_subsample=shuf2.stdout.split('\n')[:-1]

        # A clonotype counted exactly twice across the combined subsamples is
        # taken to be present at both timepoints.
        clone_counts=Counter(tp1_clonotype_subsample+tp2_clonotype_subsample)

        persistent_clones=[c for c in clone_counts if clone_counts[c]==2]
        persistent_number[str(subsample_count)].append(len(persistent_clones))

        # Written inside the replicate loop so that every (subsample size,
        # replicate) pair gets its own file rather than only the last one.
        with open(f'./data/persistent-clonotypes/D103-{str(subsample_count)}subsample-{str(i)}', 'w') as file:
            file.write('\n'.join(persistent_clones))
    

1000
10000
100000
200000
400000
600000
800000
1000000


In [13]:
# Summarise the replicates at each subsample depth: mean percentage of
# persistent clonotypes plus a two-sided 95% Student-t confidence interval.
pdata=[]
for size_label, replicate_counts in persistent_number.items():
    depth = int(size_label)
    # Fraction of the requested subsample size that was persistent.
    fractions = [count/depth for count in replicate_counts]
    mean_fraction = np.mean(fractions)
    lower_bound, upper_bound = st.t.interval(
        0.95,
        df=len(fractions)-1,
        loc=mean_fraction,
        scale=st.sem(fractions),
    )
    # Values are stored as percentages.
    pdata.append({
        'size': size_label,
        'average': mean_fraction*100,
        'upper': upper_bound*100,
        'lower': lower_bound*100,
    })
pdf=pd.DataFrame(pdata)

In [None]:
# Persist the D103 clonotype summary table (one row per subsample size).
pdf.to_csv(path_or_buf='./data/persistent-clonotypes/D103-df.csv')

In [None]:
# Pooled clonotype files for donor 327059 at the two timepoints being compared.
# Each line of a file is treated as one clonotype.
# NOTE(review): the file names suggest each pool is deduplicated; the
# "count == 2" persistence test below relies on that -- confirm upstream.
infile1='./data/dedup_year-merged_vj-aa/327059-2016/327059-2016_dedup_pool_vj-aa.txt'
infile2='./data/dedup_year-merged_vj-aa/327059-2020/327059-2020_dedup_pool_vj-aa.txt'

# Maps str(subsample size) -> list holding one persistent-clonotype count per replicate.
persistent_number={}
for subsample_count in [1000, 10000, 100000, 200000, 400000, 600000, 800000, 1000000]:
    print(subsample_count)
    persistent_number[str(subsample_count)]=[]
    for i in range(10):
        # Draw an independent random subsample from each timepoint with `shuf`.
        # `shuf -n N` emits at most N lines, so a file shorter than
        # subsample_count yields a smaller sample.
        # An argument list (no shell) passes the path to shuf verbatim, and
        # check=True raises CalledProcessError if shuf is missing or fails
        # instead of silently counting an empty sample.
        shuf1 = sp.run(['shuf', '-n', str(subsample_count), infile1],
                       stdout=sp.PIPE, stderr=sp.PIPE, encoding='utf8', check=True)
        shuf2 = sp.run(['shuf', '-n', str(subsample_count), infile2],
                       stdout=sp.PIPE, stderr=sp.PIPE, encoding='utf8', check=True)

        # Drop the empty string that follows the final newline.
        tp1_clonotype_subsample=shuf1.stdout.split('\n')[:-1]
        tp2_clonotype_subsample=shuf2.stdout.split('\n')[:-1]

        # A clonotype counted exactly twice across the combined subsamples is
        # taken to be present at both timepoints.
        clone_counts=Counter(tp1_clonotype_subsample+tp2_clonotype_subsample)

        persistent_clones=[c for c in clone_counts if clone_counts[c]==2]
        persistent_number[str(subsample_count)].append(len(persistent_clones))

        # Written inside the replicate loop so that every (subsample size,
        # replicate) pair gets its own file rather than only the last one.
        with open(f'./data/persistent-clonotypes/327059-{str(subsample_count)}subsample-{str(i)}', 'w') as file:
            file.write('\n'.join(persistent_clones))

In [None]:
# Summarise the replicates at each subsample depth: mean percentage of
# persistent clonotypes plus a two-sided 95% Student-t confidence interval.
pdata=[]
for i in persistent_number.keys():
    data=persistent_number[i]
    # Fraction of the requested subsample size that was persistent.
    data=[_i/int(i) for _i in data]
    # The confidence level is passed positionally: newer SciPy releases renamed
    # the keyword from `alpha` to `confidence` and dropped `alpha`, so the
    # positional form is the one that works across versions.
    intervals=st.t.interval(0.95, df=len(data)-1, loc=np.mean(data), scale=st.sem(data))
    # Values are stored as percentages.
    pdata.append({'size':i, 
                  'average': np.mean(data)*100, 
                  'upper': intervals[1]*100, 
                  'lower': intervals[0]*100})
pdf=pd.DataFrame(pdata)

In [None]:
# Persist the 327059 clonotype summary table (one row per subsample size).
pdf.to_csv(path_or_buf='./data/persistent-clonotypes/327059-df.csv')

## Subsampling sequences

In [6]:
# Pooled nucleotide-sequence files for donor D103 at the two timepoints being
# compared. Each line of a file is treated as one sequence.
# NOTE(review): the file names suggest each pool is deduplicated; the
# "count == 2" persistence test below relies on that -- confirm upstream.
infile1='./data/dedup_year-merged_nt-seq/D103-2016/D103-2016_dedup_pool_nt-seq.txt'
infile2='./data/dedup_year-merged_nt-seq/D103-2021/D103-2021_dedup_pool_nt-seq.txt'

# Maps str(subsample size) -> list holding one persistent-sequence count per replicate.
persistent_number={}
for subsample_count in [1000, 10000, 100000, 200000, 400000, 600000, 800000, 1000000]:
    print(subsample_count)
    persistent_number[str(subsample_count)]=[]
    for i in range(10):
        # Draw an independent random subsample from each timepoint with `shuf`.
        # `shuf -n N` emits at most N lines, so a file shorter than
        # subsample_count yields a smaller sample.
        # An argument list (no shell) passes the path to shuf verbatim, and
        # check=True raises CalledProcessError if shuf is missing or fails
        # instead of silently counting an empty sample.
        shuf1 = sp.run(['shuf', '-n', str(subsample_count), infile1],
                       stdout=sp.PIPE, stderr=sp.PIPE, encoding='utf8', check=True)
        shuf2 = sp.run(['shuf', '-n', str(subsample_count), infile2],
                       stdout=sp.PIPE, stderr=sp.PIPE, encoding='utf8', check=True)

        # Variable names are kept from the clonotype cells; here the entries
        # are sequences. Drop the empty string that follows the final newline.
        tp1_clonotype_subsample=shuf1.stdout.split('\n')[:-1]
        tp2_clonotype_subsample=shuf2.stdout.split('\n')[:-1]

        # A sequence counted exactly twice across the combined subsamples is
        # taken to be present at both timepoints.
        clone_counts=Counter(tp1_clonotype_subsample+tp2_clonotype_subsample)

        persistent_clones=[c for c in clone_counts if clone_counts[c]==2]
        persistent_number[str(subsample_count)].append(len(persistent_clones))

        # Written inside the replicate loop so that every (subsample size,
        # replicate) pair gets its own file, and into the sequences directory
        # so the clonotype outputs of the same name are not overwritten.
        with open(f'./data/persistent-sequences/D103-{str(subsample_count)}subsample-{str(i)}', 'w') as file:
            file.write('\n'.join(persistent_clones))
    

1000
10000
100000
200000
400000
600000
800000
1000000


In [13]:
# Summarise the replicates at each subsample depth: mean percentage of
# persistent sequences plus a two-sided 95% Student-t confidence interval.
pdata=[]
for size_label, replicate_counts in persistent_number.items():
    depth = int(size_label)
    # Fraction of the requested subsample size that was persistent.
    fractions = [count/depth for count in replicate_counts]
    mean_fraction = np.mean(fractions)
    lower_bound, upper_bound = st.t.interval(
        0.95,
        df=len(fractions)-1,
        loc=mean_fraction,
        scale=st.sem(fractions),
    )
    # Values are stored as percentages.
    pdata.append({
        'size': size_label,
        'average': mean_fraction*100,
        'upper': upper_bound*100,
        'lower': lower_bound*100,
    })
pdf=pd.DataFrame(pdata)

In [None]:
# Persist the D103 sequence summary table (one row per subsample size).
pdf.to_csv(path_or_buf='./data/persistent-sequences/D103-df.csv')

In [None]:
# Pooled nucleotide-sequence files for donor 327059 at the two timepoints being
# compared. Each line of a file is treated as one sequence.
# NOTE(review): the file names suggest each pool is deduplicated; the
# "count == 2" persistence test below relies on that -- confirm upstream.
infile1='./data/dedup_year-merged_nt-seq/327059-2016/327059-2016_dedup_pool_nt-seq.txt'
infile2='./data/dedup_year-merged_nt-seq/327059-2020/327059-2020_dedup_pool_nt-seq.txt'

# Maps str(subsample size) -> list holding one persistent-sequence count per replicate.
persistent_number={}
for subsample_count in [1000, 10000, 100000, 200000, 400000, 600000, 800000, 1000000]:
    print(subsample_count)
    persistent_number[str(subsample_count)]=[]
    for i in range(10):
        # Draw an independent random subsample from each timepoint with `shuf`.
        # `shuf -n N` emits at most N lines, so a file shorter than
        # subsample_count yields a smaller sample.
        # An argument list (no shell) passes the path to shuf verbatim, and
        # check=True raises CalledProcessError if shuf is missing or fails
        # instead of silently counting an empty sample.
        shuf1 = sp.run(['shuf', '-n', str(subsample_count), infile1],
                       stdout=sp.PIPE, stderr=sp.PIPE, encoding='utf8', check=True)
        shuf2 = sp.run(['shuf', '-n', str(subsample_count), infile2],
                       stdout=sp.PIPE, stderr=sp.PIPE, encoding='utf8', check=True)

        # Variable names are kept from the clonotype cells; here the entries
        # are sequences. Drop the empty string that follows the final newline.
        tp1_clonotype_subsample=shuf1.stdout.split('\n')[:-1]
        tp2_clonotype_subsample=shuf2.stdout.split('\n')[:-1]

        # A sequence counted exactly twice across the combined subsamples is
        # taken to be present at both timepoints.
        clone_counts=Counter(tp1_clonotype_subsample+tp2_clonotype_subsample)

        persistent_clones=[c for c in clone_counts if clone_counts[c]==2]
        persistent_number[str(subsample_count)].append(len(persistent_clones))

        # Written inside the replicate loop so that every (subsample size,
        # replicate) pair gets its own file, and into the sequences directory
        # so the clonotype outputs of the same name are not overwritten.
        with open(f'./data/persistent-sequences/327059-{str(subsample_count)}subsample-{str(i)}', 'w') as file:
            file.write('\n'.join(persistent_clones))

In [None]:
# Summarise the replicates at each subsample depth: mean percentage of
# persistent sequences plus a two-sided 95% Student-t confidence interval.
pdata=[]
for i in persistent_number.keys():
    data=persistent_number[i]
    # Fraction of the requested subsample size that was persistent.
    data=[_i/int(i) for _i in data]
    # The confidence level is passed positionally: newer SciPy releases renamed
    # the keyword from `alpha` to `confidence` and dropped `alpha`, so the
    # positional form is the one that works across versions.
    intervals=st.t.interval(0.95, df=len(data)-1, loc=np.mean(data), scale=st.sem(data))
    # Values are stored as percentages.
    pdata.append({'size':i, 
                  'average': np.mean(data)*100, 
                  'upper': intervals[1]*100, 
                  'lower': intervals[0]*100})
pdf=pd.DataFrame(pdata)

In [None]:
# Persist the 327059 sequence summary table (one row per subsample size).
pdf.to_csv(path_or_buf='./data/persistent-sequences/327059-df.csv')