In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from random import shuffle
from skbio.diversity.alpha import michaelis_menten_fit



ValueError: numpy.ufunc size changed, may indicate binary incompatibility. Expected 216 from C header, got 192 from PyObject

In [3]:
# Number of repeats (global): passed as `num_repeats` to michaelis_menten_fit
# and as `repeats` to saturationFigure in the cells below.
rep = 10

In [4]:
# Input clone table; the output file names are derived from the input name.
cloneFile = "cdr3-clones-GC-IGH_HUMAN-after-reassignment.csv"

# "cdr3-*.csv" -> "saturation-*.pdf" -> "saturation-*.xlsx"
pdfFile = cloneFile.replace("cdr3-", "saturation-").replace(".csv", ".pdf")
excelFile = pdfFile.replace(".pdf", ".xlsx")

# The file is read as tab-separated despite its .csv extension.
df = pd.read_csv(cloneFile, sep="\t")
df.head()

ImportError: cannot import name 'is_url'

     Sample      MID                          cdr3pep  freq  uniq_umis  \
0  MS-76_S1  nomatch       CARDPLGHWYDDSGTGGYWGQGTLVT  7268          1   
1  MS-76_S1  nomatch        CAPCTGFGSGWSDAFDIWGQGTMVT  5458          1   
2  MS-76_S1  nomatch     CARDVGGDTSWYNPGPGMDVWGQGTTVT  4455          1   
3  MS-76_S1  nomatch             CARHQLEVYFDYWGQGTLVT  3571          1   
4  MS-76_S1  nomatch  CARGQIVRFRDFTTRDKLTWFDSWGQGTLVT  2256          1   

      V_sub        J_sub  sum_sites  avg_sites  read_perc  umi_perc  
0   IGHV1-2  IGHJ4,IGHJ1        750   0.103192   7.159110  0.054855  
1   IGHV2-5        IGHJ3          6   0.001099   5.376228  0.054855  
2   IGHV1-2        IGHJ6        444   0.099663   4.388255  0.054855  
3  IGHV5-51        IGHJ4         32   0.008961   3.517499  0.054855  
4   IGHV1-8        IGHJ5        624   0.276596   2.222200  0.054855  

In [4]:
# Distinct sample names, in order of first appearance in the table.
samples = [*df["Sample"].unique()]
samples

['VC015V3-IgG-LIN_S23', 'VC018V2-IgG-LIN_S22']

# Calculate Michaelis-Menten fit to rarefaction curve of observed OTUs
http://scikit-bio.org/docs/0.2.2/generated/generated/skbio.diversity.alpha.michaelis_menten_fit.html#skbio.diversity.alpha.michaelis_menten_fit

In [5]:
# Workbook that collects the rarefaction summary here and, in later cells,
# one sheet of curve data per sample.
writer = pd.ExcelWriter(excelFile, engine='xlsxwriter')

# One Michaelis-Menten fit per sample, computed from that sample's clone
# frequencies and kept in the same order as `samples`.
S_max_values = [
    michaelis_menten_fit(
        df.loc[df["Sample"] == name, 'freq'].to_list(),
        num_repeats=rep,
    )
    for name in samples
]

df_rarefaction = pd.DataFrame({'Sample': samples, 'Smax': S_max_values})
df_rarefaction.to_excel(writer, sheet_name="Rarefaction")

# Make rarefaction curves

In [6]:
def saturationFigure(pdf, df, sample, repeats=1):
    """Plot saturation curves for one sample and return the plotted data.

    Every row of ``df`` belonging to ``sample`` is expanded into ``freq``
    reads labelled with the row index. For each repeat the reads are shuffled
    and the number of distinct labels among the first ``x`` reads is recorded
    for ``x = 0 .. N-1`` (``N`` = total number of reads). All repeats are
    drawn in one figure, which is appended to ``pdf``.

    Parameters
    ----------
    pdf : object with a ``savefig(figure)`` method
        The caller passes a matplotlib ``PdfPages`` instance.
    df : pandas.DataFrame
        Must contain the columns ``"Sample"`` and ``"freq"``.
    sample : str
        Value of the ``"Sample"`` column to select.
    repeats : int, optional
        Number of independent shuffles (curves) to draw.

    Returns
    -------
    pandas.DataFrame
        Column ``'fraction'`` holds ``0 .. N-1``; columns ``0 .. repeats-1``
        hold the unique-clone counts of each repeat.
    """
    # Clones are identified by row index, i.e. one row is treated as one
    # clone (assumed to be one V, J, CDR3pep combination -- TODO confirm).
    # NOTE: shuffling the reads and taking prefixes is sampling WITHOUT
    # replacement; the original note called it sampling with replacement
    # (to discuss).

    # Get data for one sample
    df_s = df[df["Sample"] == sample]

    # Create a list with one entry per read. extend() keeps this linear in
    # the number of reads instead of re-copying the list for every row.
    reads = list()
    for clone_id, freq in zip(df_s.index, df_s['freq']):
        reads.extend([clone_id] * int(freq))

    # x axis: number of reads sampled. The range stops at N-1, so the point
    # with all N reads is not part of the curve (kept to preserve the output).
    fraction = list(range(len(reads)))
    data = pd.DataFrame({'fraction': fraction})

    # Draw on a dedicated figure so nothing leaks from or into other plots.
    fig, ax = plt.subplots()
    for i in range(repeats):
        shuffle(reads)

        # Running count of distinct clones: clones[x] is the number of unique
        # clones among the first x reads, i.e. len(set(reads[:x])), computed
        # in one pass instead of rebuilding the set for every prefix.
        seen = set()
        clones = list()
        for read in reads:
            clones.append(len(seen))
            seen.add(read)

        ax.plot(fraction, clones)

        # Add to data
        data[i] = clones

    # Add title, labels and save the figure
    ax.set_title("Sample: " + sample + ", repeats=" + str(repeats))
    ax.set_xlabel("fraction of sample")
    ax.set_ylabel("unique clones")
    pdf.savefig(fig)
    plt.close(fig)

    return data

In [7]:
# Draw one saturation figure per sample into a single multipage PDF and write
# the curve data of each sample to its own sheet of the Excel workbook.
# info multipage pdf: https://matplotlib.org/3.1.1/gallery/misc/multipage_pdf.html
with PdfPages(pdfFile) as pdf:
    for sample in samples:
        saturationFigure(pdf, df, sample, repeats=rep).to_excel(
            writer, sheet_name=sample
        )
print("Wrote", pdfFile, "to disk")

Wrote saturation-clones-Vasculitis-IGH_HUMAN-after-reassignment.pdf to disk


In [8]:
# Flush all sheets to disk. ExcelWriter.save() is deprecated since pandas 1.5
# and removed in pandas 2.0; close() saves the workbook on old and new versions.
writer.close()
print("Wrote", excelFile, "to disk")

Wrote saturation-clones-Vasculitis-IGH_HUMAN-after-reassignment.xlsx to disk
