In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from random import shuffle

In [2]:
# Input clone table; the file is tab-separated despite its .csv extension.
cloneFile = "cdr3-clones-Vasculitis-IGH_HUMAN-after-reassignment.csv"

# Output names are derived from the input name: "cdr3-" becomes "saturation-"
# and the extension is swapped for .pdf / .xlsx respectively.
pdfFile = cloneFile.replace("cdr3-", "saturation-").replace(".csv", ".pdf")
excelFile = pdfFile.replace(".pdf", ".xlsx")

# Load the table and preview the first rows as the cell output.
df = pd.read_csv(cloneFile, sep="\t")
df.head()

Unnamed: 0,Sample,MID,cdr3pep,freq,uniq_umis,V_sub,J_sub,sum_sites,avg_sites,read_perc,umi_perc
0,VC015V3-IgG-LIN_S23,ACGTACGT,CVRYTRIKDFWSGPIDWGQGALVT,141,84,IGHV1-8,IGHJ4,139,0.985816,51.086957,39.069767
1,VC015V3-IgG-LIN_S23,ACGTACGT,CAREDTMIQGGLDYWGQGTLVT,30,29,IGHV3-7,IGHJ4,0,0.0,10.869565,13.488372
2,VC015V3-IgG-LIN_S23,ACGTACGT,CARRPALSGFDRPVYFDSWGQGTLVT,29,28,IGHV5-51,IGHJ4,0,0.0,10.507246,13.023256
3,VC015V3-IgG-LIN_S23,ACGTACGT,CARHDAAAAEHFAYWGQGTLVT,10,10,IGHV4-39,IGHJ4,0,0.0,3.623188,4.651163
4,VC015V3-IgG-LIN_S23,ACGTACGT,CAKEGVRGYFPDWGQGTLVT,9,9,IGHV3-23,IGHJ4,1,0.111111,3.26087,4.186047


In [3]:
# Distinct values of the "Sample" column, collected into a plain list.
samples = [name for name in df["Sample"].unique()]
samples

['VC015V3-IgG-LIN_S23', 'VC018V2-IgG-LIN_S22']

In [22]:
def saturationFigure(pdf, df, sample, repeats=1, seed=None):
    """Plot saturation (rarefaction) curves for one sample and return the data.

    All reads of ``sample`` are put in a random order, and for every prefix of
    0, 1, ..., N reads (N = total number of reads) the number of distinct
    clones seen so far is counted. This is repeated ``repeats`` times; every
    repeat adds one curve to the figure, and the finished figure is appended
    to ``pdf`` as a new page.

    Notes
    -----
    * A clone is identified by its row label in ``df``: every row counts as
      one clone that contributes ``freq`` reads. Presumably the rows are
      already unique per (V, J, CDR3 peptide) within a sample -- TODO verify
      against whatever produces the input table.
    * Taking prefixes of a shuffled read list is subsampling WITHOUT
      replacement.

    Parameters
    ----------
    pdf : object with a ``savefig(figure)`` method (e.g. ``PdfPages``)
        Receives the finished figure.
    df : pandas.DataFrame
        Clone table; the columns "Sample" and "freq" are read.
    sample : str
        Value of the "Sample" column to select.
    repeats : int, default 1
        Number of independent shuffles, i.e. curves.
    seed : int or None, default None
        When given, the shuffles are drawn from a private generator seeded
        with this value so the result is reproducible. When None, the
        module-level ``shuffle`` (global random state) is used.

    Returns
    -------
    pandas.DataFrame
        N + 1 rows. Column "fraction" holds the number of reads sampled
        (0..N; it is a read count, the name is kept for compatibility), and
        columns ``0 .. repeats-1`` hold the number of unique clones per repeat.
    """
    # Local import: the file itself only imports `shuffle` from `random`.
    from random import Random

    # Get data for one sample
    df_s = df[df["Sample"] == sample]

    # Create a list with one entry (the clone's row label) per read
    reads = [
        clone
        for clone, freq in zip(df_s.index, df_s["freq"])
        for _ in range(freq)
    ]

    # Include the end point N so the curve reaches the complete sample.
    fractions = list(range(len(reads) + 1))
    data = pd.DataFrame({'fraction': fractions})

    shuffle_reads = shuffle if seed is None else Random(seed).shuffle

    # Explicit figure, so curves never land on a figure left open elsewhere.
    fig, ax = plt.subplots()
    for i in range(repeats):
        shuffle_reads(reads)

        # Running count of distinct clones: clones[x] is the number of
        # unique clones among the first x reads (clones[0] == 0).
        seen = set()
        clones = [0]
        for read in reads:
            seen.add(read)
            clones.append(len(seen))

        ax.plot(fractions, clones)
        data[i] = clones

    # Add title, labels and save the figure
    ax.set_title(f"Sample: {sample}, repeats={repeats}")
    ax.set_xlabel("fraction of sample")
    ax.set_ylabel("unique clones")
    pdf.savefig(fig)
    plt.close(fig)

    return data

In [23]:
# info multipage pdf: https://matplotlib.org/3.1.1/gallery/misc/multipage_pdf.html
# Both outputs are opened as context managers so they are finalised and closed
# even if a sample fails. ExcelWriter.save() was removed in pandas 2.0; leaving
# the `with` block writes the workbook instead.
with PdfPages(pdfFile) as pdf, pd.ExcelWriter(excelFile, engine='xlsxwriter') as writer:
    for sample in samples:
        # One PDF page and one worksheet per sample.
        df_tmp = saturationFigure(pdf, df, sample, repeats=10)
        df_tmp.to_excel(writer, sheet_name=sample)
print("Wrote", pdfFile, "to disk")
print("Wrote", excelFile, "to disk")

Wrote saturation-clones-Vasculitis-IGH_HUMAN-after-reassignment.pdf to disk
Wrote saturation-clones-Vasculitis-IGH_HUMAN-after-reassignment.xlsx to disk
