In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from random import sample 

In [2]:
# Input: clone table (tab-separated, despite the .csv extension)
cloneFile = "cdr3-clones-GC-IGH_HUMAN-after-reassignment.csv"

# Output names are derived from the input name:
#   cdr3-*.csv -> saturation-*.pdf (figures) and saturation-*.xlsx (tables)
pdfFile = cloneFile.replace("cdr3-", "saturation-").replace(".csv", ".pdf")
excelFile = pdfFile.replace(".pdf", ".xlsx")

df = pd.read_csv(cloneFile, sep="\t")
df.head()

Unnamed: 0,Sample,MID,cdr3pep,freq,uniq_umis,V_sub,J_sub,sum_sites,avg_sites,read_perc,umi_perc
0,MS-200109-1_S1,nomatch,CAHGMARPRQWDTVVDFDYWGQGTLVT,595,1,IGHV2-5,IGHJ4,38,0.063866,0.844655,0.006668
1,MS-200109-1_S1,nomatch,CARSLDPLDFQHWGQGTLVT,310,1,IGHV1-18,"IGHJ1,IGHJ4",29,0.093548,0.440072,0.006668
2,MS-200109-1_S1,nomatch,CARGRGSGSWNFDYWGQGTLVT,262,1,IGHV1-8,IGHJ4,199,0.759542,0.371932,0.006668
3,MS-200109-1_S1,nomatch,CAHSSVVIVLHAFDIWGQGTMVT,240,1,IGHV2-5,IGHJ3,15,0.0625,0.340701,0.006668
4,MS-200109-1_S1,nomatch,CARPSSSYSSSLDYWGQGTLVT,220,1,IGHV5-51,IGHJ4,19,0.086364,0.312309,0.006668


In [3]:
# Distinct values of the "Sample" column; one saturation curve is made per entry
samples = [name for name in df["Sample"].unique()]
samples

['MS-200109-1_S1',
 'MS-200109-2_S2',
 'MS-200109-3_S3',
 'MS-200109-4_S4',
 'MS-200109-5_S5',
 'MS-200109-6_S6',
 'MS-200109-7_S7',
 'MS-200109-8_S8']

In [4]:
def saturationFigure(pdf, df, sample, random_state=None):
    """Plot and tabulate a clone saturation curve for one sample.

    The rows of ``df`` belonging to ``sample`` are expanded into one entry per
    read (each row's index label repeated ``freq`` times).  That read list is
    then randomly subsampled at fractions 0.0, 0.1, ..., 1.0 and the number of
    distinct row labels in each subsample is counted.  The result is drawn as
    a scatter plot, saved as a new page of ``pdf``, and returned as a table.

    Each row of ``df`` is counted as one clone (identified by its index
    label).  The intended clone definition is V, J, CDR3pep, so this assumes
    the table holds one row per such combination within a sample -- TODO
    confirm against the code that produces the table.

    NOTE(review): an earlier comment here stated that this is implemented as
    sampling *with* replacement.  ``sample()`` is called without
    ``replace=True``, and the pandas default is ``replace=False``, so the reads
    are actually drawn *without* replacement.  Still to be discussed which of
    the two is wanted; the behaviour is left unchanged.

    Parameters
    ----------
    pdf : object with a ``savefig(figure)`` method
        Typically a ``matplotlib.backends.backend_pdf.PdfPages`` instance; the
        figure is appended to it as a new page.
    df : pandas.DataFrame
        Clone table with at least the columns ``"Sample"`` and ``"freq"``
        (integer read count per row).
    sample : str
        Value of the ``"Sample"`` column to select; also used in the title.
    random_state : int, numpy.random.RandomState or None, optional
        Seed (or generator) for the subsampling.  ``None`` (the default) keeps
        the previous behaviour of drawing from pandas' default random source;
        passing a seed makes the curve reproducible.

    Returns
    -------
    pandas.DataFrame
        Columns ``fraction`` and ``unique_clones``, one row per fraction.

    Raises
    ------
    ValueError
        If ``freq`` contains negative values (raised by ``numpy.repeat``).
    """
    # Get data for one sample
    df_s = df[df["Sample"] == sample]

    # One entry per read.  A single vectorised repeat replaces growing a
    # Python list row by row, which re-copied the whole list on every row.
    reads = pd.Series(
        np.repeat(df_s.index.to_numpy(), df_s["freq"].to_numpy()),
        name="clones",
    )

    # Use ONE generator for all fractions, so that every fraction gets its own
    # independent draw while the curve as a whole is reproducible from a seed.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Subsample the reads at 0 %, 10 %, ..., 100 % and count distinct clones.
    # linspace gives exactly 11 points including both end points.
    fraction = list(np.linspace(0.0, 1.0, 11))
    clones = [
        reads.sample(frac=x, random_state=rng).nunique(dropna=False)
        for x in fraction
    ]

    # Make figure on an explicit Figure/Axes and always release it, even when
    # saving the page fails.
    fig, ax = plt.subplots()
    try:
        ax.scatter(fraction, clones)
        ax.set_title("Sample: " + sample)
        ax.set_xlabel("fraction of sample")
        ax.set_ylabel("unique clones")
        pdf.savefig(fig)
    finally:
        plt.close(fig)

    # Put data in dataframe and return it
    data = pd.DataFrame({'fraction': fraction, 'unique_clones': clones})
    return data

In [5]:
# info multipage pdf: https://matplotlib.org/3.1.1/gallery/misc/multipage_pdf.html
# One PDF page and one Excel sheet per sample.
# Both outputs are opened as context managers so they are finalised and closed
# even if a sample fails half-way; this also replaces ExcelWriter.save(),
# which was deprecated in pandas 1.5 and removed in pandas 2.0.
with PdfPages(pdfFile) as pdf:
    with pd.ExcelWriter(excelFile, engine='xlsxwriter') as writer:
        for sample in samples:
            df_tmp = saturationFigure(pdf, df, sample)
            # The sample name doubles as the sheet name
            df_tmp.to_excel(writer, sheet_name=sample)
print("Wrote", pdfFile, "to disk")
print("Wrote", excelFile, "to disk")

Wrote saturation-clones-GC-IGH_HUMAN-after-reassignment.pdf to disk
Wrote saturation-clones-GC-IGH_HUMAN-after-reassignment.xlsx to disk
