In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from random import shuffle
from skbio.diversity.alpha import michaelis_menten_fit

In [2]:
# Number of repeats, global variable. Passed as num_repeats to
# michaelis_menten_fit and as repeats to saturationFigure further down.
rep = 10

In [3]:
# Input clone table; the PDF and Excel output names are derived from its name.
cloneFile = "cdr3-clones-GC-IGH_MOUSE-after-reassignment.csv"
pdfFile = cloneFile.replace("cdr3-", "saturation-").replace(".csv", ".pdf")
excelFile = pdfFile.replace(".pdf", ".xlsx")

# The input is read as tab-separated, despite the .csv extension.
df = pd.read_csv(cloneFile, sep="\t")
df.head()

Unnamed: 0,Sample,MID,cdr3pep,freq,uniq_umis,V_sub,J_sub,sum_sites,avg_sites,read_perc,umi_perc
0,MSmm-01_S1,nomatch,CARGGLYWYFDVWGTGTTLT,59,1,IGHV5-17,IGHJ1,1,0.016949,0.28751,0.009177
1,MSmm-01_S1,nomatch,CARRAVVATPYWYFDVWGTGTLLT,43,1,IGHV5-17,IGHJ1,0,0.0,0.209541,0.009177
2,MSmm-01_S1,nomatch,CKTHDGYYSYWYFDVWGTGTLLT,40,1,IGHV5-2,IGHJ1,0,0.0,0.194922,0.009177
3,MSmm-01_S1,nomatch,CARRAVVATPYWYFDVWGTGTPLT,38,1,IGHV5-17+IGHV5-4,IGHJ1,0,0.0,0.185176,0.009177
4,MSmm-01_S1,nomatch,YWYFDVWGTGTLLT,37,1,IGHV5-17+IGHV5-4,IGHJ1,0,0.0,0.180303,0.009177


In [4]:
# Distinct sample names, in order of first appearance in the table.
samples = [*df["Sample"].unique()]
samples

['MSmm-01_S1', 'MSmm-02_S2', 'MSmm-03_S3', 'MSmm-04_S4', 'MSmm-05_S5']

# Calculate the Michaelis-Menten fit to the rarefaction curve of observed clones (the "OTUs" in scikit-bio's terminology)
http://scikit-bio.org/docs/0.2.2/generated/generated/skbio.diversity.alpha.michaelis_menten_fit.html#skbio.diversity.alpha.michaelis_menten_fit

In [5]:
# Open the Excel workbook; later cells add one sheet per sample before saving.
writer = pd.ExcelWriter(excelFile, engine='xlsxwriter')

# One Michaelis-Menten fit per sample, computed from that sample's 'freq' counts.
S_max_values = [
    michaelis_menten_fit(
        df.loc[df["Sample"] == name, 'freq'].to_list(),
        num_repeats=rep,
    )
    for name in samples
]

# Summary table: one row per sample, written to its own sheet.
df_rarefaction = pd.DataFrame({'Sample': samples, 'Smax': S_max_values})
df_rarefaction.to_excel(writer, sheet_name="Rarefaction")

# Make rarefaction curves

In [6]:
def _cumulativeUniqueCounts(reads):
    """Return the number of distinct items in every prefix of ``reads``.

    Element ``x`` of the result equals ``len(set(reads[:x]))`` for ``x`` from
    0 up to and including ``len(reads)``, computed in a single pass.
    """
    seen = set()
    counts = [0]
    for read in reads:
        seen.add(read)
        counts.append(len(seen))
    return counts


def saturationFigure(pdf, df, sample, repeats=1):
    """Plot saturation curves for one sample and return the plotted data.

    Every row of ``df`` belonging to ``sample`` is treated as one clone
    (clone definition is: V, J, CDR3pep) and is expanded into ``freq``
    individual reads. For each repeat the reads are shuffled and the number
    of unique clones is counted after 0, 1, ..., N reads, where N is the
    total number of reads of the sample.

    Note: the original comment described this as sampling with replacement
    (to discuss). Shuffling and taking prefixes uses every read at most once
    per repeat, i.e. it is sampling without replacement.

    Parameters
    ----------
    pdf : object with a ``savefig(figure)`` method (e.g. ``PdfPages``)
        Receives one page containing the curves of all repeats.
    df : pandas.DataFrame
        Clone table with at least the columns ``Sample`` and ``freq``.
    sample : str
        Value of the ``Sample`` column to select.
    repeats : int, default 1
        Number of independent shuffles (curves) to draw.

    Returns
    -------
    pandas.DataFrame
        Column ``fraction`` holds the number of reads drawn (0..N inclusive);
        columns ``0 .. repeats-1`` hold the unique-clone counts per repeat.
    """
    # Get data for one sample
    df_s = df[df["Sample"] == sample]

    # Create a list with one entry per read; the row index identifies the clone.
    # extend() avoids copying the whole list for every row.
    reads = list()
    for index, freq in zip(df_s.index, df_s['freq']):
        reads.extend([index] * freq)

    # Go from 0 to N (inclusive) so the curve ends at the full sample.
    fraction = list(range(len(reads) + 1))
    data = pd.DataFrame({'fraction': fraction})

    # Use an explicit figure so nothing is drawn onto a stale current figure.
    fig, ax = plt.subplots()
    for i in range(repeats):
        # A fresh random order of the reads gives one curve.
        shuffle(reads)
        clones = _cumulativeUniqueCounts(reads)

        ax.plot(fraction, clones)
        data[i] = clones

    # Add title, labels and save the figure
    ax.set_title("Sample: " + sample + ", repeats=" + str(repeats))
    ax.set_xlabel("fraction of sample")
    ax.set_ylabel("unique clones")
    pdf.savefig(fig)
    plt.close(fig)

    return data

In [7]:
# info multipage pdf: https://matplotlib.org/3.1.1/gallery/misc/multipage_pdf.html
# Each sample contributes one PDF page and one Excel sheet named after it.
with PdfPages(pdfFile) as pdf:
    for sample in samples:
        saturationFigure(pdf, df, sample, repeats=rep).to_excel(
            writer, sheet_name=sample
        )
print("Wrote", pdfFile, "to disk")

Wrote saturation-clones-GC-IGH_MOUSE-after-reassignment.pdf to disk


In [8]:
# Finalize the workbook. ExcelWriter.save() was deprecated in pandas 1.5 and
# removed in pandas 2.0; close() writes the file and releases the handle.
writer.close()
print("Wrote", excelFile, "to disk")

Wrote saturation-clones-GC-IGH_MOUSE-after-reassignment.xlsx to disk
