In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
outfile = "runTenXTest-IGHKL-HUMAN-shared-clones.xlsx"

## Read data

In [3]:
myfiles = [x for x in os.listdir(".") if "airr" in x]
myfiles

['sc5p_v2_hs_B_1k_multi_5gex_b_vdj_b_airr_rearrangement.tsv']

In [4]:
cols = ['Sample', 'cdr3pep', 'freq']
df = pd.read_csv(myfiles[0], sep="\t")
df["Sample"] = df["cell_id"] # Consider each cell as a different sample
df["cdr3pep"] = df["junction_aa"] # The CDR3 peptide sequence
df["freq"] = df["consensus_count"] # use the frequency
df = df[cols]
df.head()

Unnamed: 0,Sample,cdr3pep,freq
0,AAACCTGAGGGCTCTC-1,CARGDSSGWRGGNWFDPW,889
1,AAACCTGAGGGCTCTC-1,CQSYDSSLSDVF,5214
2,AAACCTGGTAAGGATT-1,CQQYYDTPRTF,2099
3,AAACCTGGTAAGGATT-1,CAMGYCINNNCYEGWFDPW,1410
4,AAACCTGGTAATAGCA-1,CARAKRWGYSSSWCDYW,3982


In [5]:
for myfile in myfiles[1:]:
    df_tmp = pd.read_csv(myfile, sep="\t")
    df_tmp["Sample"] = df_tmp["cell_id"] # Consider each cell as a different sample
    df_tmp["cdr3pep"] = df_tmp["junction_aa"] # The CDR3 peptide sequence
    df_tmp["freq"] = df_tmp["consensus_count"] # use the frequency
    df_tmp = df_tmp[cols]
    df = df.append(df_tmp)
df.tail()

Unnamed: 0,Sample,cdr3pep,freq
2232,TTTGGTTCATCTCGCT-1,CARARRNFGLVIDYW,470
2233,TTTGGTTCATTAGGCT-1,CAVWDDSLSGRVF,2142
2234,TTTGGTTCATTAGGCT-1,CARDEVAGREFW,191
2235,TTTGTCAAGTGTTTGC-1,CARKNYYDSSGYYPDAFDIW,1291
2236,TTTGTCAAGTGTTTGC-1,CQQSYSTPPNTF,2655


In [6]:
# Use the Cell id as Sample and Patient
df["Patient"] = df["Sample"]

In [7]:
df = df.groupby(["Patient", "Sample", "cdr3pep"]).agg({'freq': sum})
df = df.reset_index()
df.head()

Unnamed: 0,Patient,Sample,cdr3pep,freq
0,AAACCTGAGGGCTCTC-1,AAACCTGAGGGCTCTC-1,CARGDSSGWRGGNWFDPW,889
1,AAACCTGAGGGCTCTC-1,AAACCTGAGGGCTCTC-1,CQSYDSSLSDVF,5214
2,AAACCTGGTAAGGATT-1,AAACCTGGTAAGGATT-1,CAMGYCINNNCYEGWFDPW,1410
3,AAACCTGGTAAGGATT-1,AAACCTGGTAAGGATT-1,CQQYYDTPRTF,2099
4,AAACCTGGTAATAGCA-1,AAACCTGGTAATAGCA-1,CAAWDDSLNGGVF,3120


In [8]:
df_total = df.groupby("Sample").agg({"freq": sum})
df_total = df_total.reset_index()
df_total = df_total.rename(columns={'freq': 'totalreads'})
df_total.head()

Unnamed: 0,Sample,totalreads
0,AAACCTGAGGGCTCTC-1,6103
1,AAACCTGGTAAGGATT-1,3509
2,AAACCTGGTAATAGCA-1,7102
3,AAACCTGGTACGCACC-1,6578
4,AAACCTGTCCAACCAA-1,11092


In [9]:
df = pd.merge(df, df_total, on="Sample")
df.head()

Unnamed: 0,Patient,Sample,cdr3pep,freq,totalreads
0,AAACCTGAGGGCTCTC-1,AAACCTGAGGGCTCTC-1,CARGDSSGWRGGNWFDPW,889,6103
1,AAACCTGAGGGCTCTC-1,AAACCTGAGGGCTCTC-1,CQSYDSSLSDVF,5214,6103
2,AAACCTGGTAAGGATT-1,AAACCTGGTAAGGATT-1,CAMGYCINNNCYEGWFDPW,1410,3509
3,AAACCTGGTAAGGATT-1,AAACCTGGTAAGGATT-1,CQQYYDTPRTF,2099,3509
4,AAACCTGGTAATAGCA-1,AAACCTGGTAATAGCA-1,CAAWDDSLNGGVF,3120,7102


In [10]:
df.tail()

Unnamed: 0,Patient,Sample,cdr3pep,freq,totalreads
2232,TTTGGTTCATCTCGCT-1,TTTGGTTCATCTCGCT-1,CLLYYGGADYVF,3530,4000
2233,TTTGGTTCATTAGGCT-1,TTTGGTTCATTAGGCT-1,CARDEVAGREFW,191,2333
2234,TTTGGTTCATTAGGCT-1,TTTGGTTCATTAGGCT-1,CAVWDDSLSGRVF,2142,2333
2235,TTTGTCAAGTGTTTGC-1,TTTGTCAAGTGTTTGC-1,CARKNYYDSSGYYPDAFDIW,1291,3946
2236,TTTGTCAAGTGTTTGC-1,TTTGTCAAGTGTTTGC-1,CQQSYSTPPNTF,2655,3946


In [11]:
df['perc'] = 100 * df['freq'] / df['totalreads']
df.head()

Unnamed: 0,Patient,Sample,cdr3pep,freq,totalreads,perc
0,AAACCTGAGGGCTCTC-1,AAACCTGAGGGCTCTC-1,CARGDSSGWRGGNWFDPW,889,6103,14.566607
1,AAACCTGAGGGCTCTC-1,AAACCTGAGGGCTCTC-1,CQSYDSSLSDVF,5214,6103,85.433393
2,AAACCTGGTAAGGATT-1,AAACCTGGTAAGGATT-1,CAMGYCINNNCYEGWFDPW,1410,3509,40.182388
3,AAACCTGGTAAGGATT-1,AAACCTGGTAAGGATT-1,CQQYYDTPRTF,2099,3509,59.817612
4,AAACCTGGTAATAGCA-1,AAACCTGGTAATAGCA-1,CAAWDDSLNGGVF,3120,7102,43.931287


## Determine overlap (shared clones) and the direction

In [12]:
ids = list(df['Sample'].unique())
ids[:10]

['AAACCTGAGGGCTCTC-1',
 'AAACCTGGTAAGGATT-1',
 'AAACCTGGTAATAGCA-1',
 'AAACCTGGTACGCACC-1',
 'AAACCTGTCCAACCAA-1',
 'AAACCTGTCTATGTGG-1',
 'AAACGGGCACACAGAG-1',
 'AAACGGGTCCAGTAGT-1',
 'AAACGGGTCGCAGGCT-1',
 'AAAGATGAGATCTGCT-1']

In [13]:
def directionShared(df, idA, idB):
    df_A = df[df['Sample'] == idA]
    df_B = df[df['Sample'] == idB]
    df_pair = pd.merge(df_A, df_B, on="cdr3pep")
    sum_A = df_pair["perc_x"].sum()
    sum_B = df_pair["perc_y"].sum()
    if sum_B > sum_A:
        idA, idB = idB, idA
        sum_A, sum_B = sum_B, sum_A
    return(idA, idB, sum_A, sum_B)
directionShared(df, "DO-004V3-P5_S123", "DO-004V3-P8_S127")

('DO-004V3-P5_S123', 'DO-004V3-P8_S127', 0.0, 0.0)

In [14]:
sources = list()
targets = list()
impacts_source = list()
impacts_target = list()
for i in range(len(ids)-1):
    for j in range(i+1, len(ids)):
        idA = ids[i]
        idB = ids[j]
        (source, target, impact_source, impact_target) = directionShared(df, idA, idB)
        sources.append(source)
        targets.append(target)
        impacts_source.append(impact_source)
        impacts_target.append(impact_target)
df_shared = pd.DataFrame({'source': sources, 'target': targets, 'impact_source': impacts_source, 'impact_target': impacts_target})
df_shared.head()

Unnamed: 0,source,target,impact_source,impact_target
0,AAACCTGAGGGCTCTC-1,AAACCTGGTAAGGATT-1,0.0,0.0
1,AAACCTGAGGGCTCTC-1,AAACCTGGTAATAGCA-1,0.0,0.0
2,AAACCTGAGGGCTCTC-1,AAACCTGGTACGCACC-1,0.0,0.0
3,AAACCTGAGGGCTCTC-1,AAACCTGTCCAACCAA-1,0.0,0.0
4,AAACCTGAGGGCTCTC-1,AAACCTGTCTATGTGG-1,0.0,0.0


In [16]:
# Add patient columns for the source and the target
cols = ["Patient", "Sample"]
df_sample_patient = df[cols]
df_shared = pd.merge(df_shared, df_sample_patient, left_on="source", right_on="Sample").rename(columns={"Patient": "patient_source", "Sample": "sample_source"})
df_shared = pd.merge(df_shared, df_sample_patient, left_on="target", right_on="Sample").rename(columns={"Patient": "patient_target", "Sample": "sample_target"})
cols = ["source", "target", "impact_source", "impact_target", "patient_source", "patient_target"]
df_shared = df_shared[cols]
df_shared.head()

Unnamed: 0,source,target,impact_source,impact_target,patient_source,patient_target
0,AAACCTGAGGGCTCTC-1,AAACCTGGTAAGGATT-1,0.0,0.0,AAACCTGAGGGCTCTC-1,AAACCTGGTAAGGATT-1
1,AAACCTGAGGGCTCTC-1,AAACCTGGTAAGGATT-1,0.0,0.0,AAACCTGAGGGCTCTC-1,AAACCTGGTAAGGATT-1
2,AAACCTGAGGGCTCTC-1,AAACCTGGTAAGGATT-1,0.0,0.0,AAACCTGAGGGCTCTC-1,AAACCTGGTAAGGATT-1
3,AAACCTGAGGGCTCTC-1,AAACCTGGTAAGGATT-1,0.0,0.0,AAACCTGAGGGCTCTC-1,AAACCTGGTAAGGATT-1
4,AAACCTGAGGGCTCTC-1,AAACCTGGTAATAGCA-1,0.0,0.0,AAACCTGAGGGCTCTC-1,AAACCTGGTAATAGCA-1


In [17]:
# sort on impact on the target and then on impact of the source
df_shared = df_shared.sort_values(by=["impact_target", "impact_source"], ascending=False)
df_shared = df_shared.reset_index()
df_shared.head()

Unnamed: 0,index,source,target,impact_source,impact_target,patient_source,patient_target
0,962359,AGATTGCCATCTCCCA-1,GCGCAACCAAGAAAGG-1,100.0,100.0,AGATTGCCATCTCCCA-1,GCGCAACCAAGAAAGG-1
1,962360,AGATTGCCATCTCCCA-1,GCGCAACCAAGAAAGG-1,100.0,100.0,AGATTGCCATCTCCCA-1,GCGCAACCAAGAAAGG-1
2,962361,AGATTGCCATCTCCCA-1,GCGCAACCAAGAAAGG-1,100.0,100.0,AGATTGCCATCTCCCA-1,GCGCAACCAAGAAAGG-1
3,962362,AGATTGCCATCTCCCA-1,GCGCAACCAAGAAAGG-1,100.0,100.0,AGATTGCCATCTCCCA-1,GCGCAACCAAGAAAGG-1
4,2059270,ACGCCGACAGTACACT-1,TGAGAGGGTCAGAGGT-1,100.0,100.0,ACGCCGACAGTACACT-1,TGAGAGGGTCAGAGGT-1


In [18]:
same_pt = list()
for i in range(len(df_shared)):
    if df_shared['patient_source'][i] == df_shared['patient_target'][i]:
        same_pt.append("same")
    else:
        same_pt.append("different")
df_shared["Same patient"] = same_pt
df_shared.head()

Unnamed: 0,index,source,target,impact_source,impact_target,patient_source,patient_target,Same patient
0,962359,AGATTGCCATCTCCCA-1,GCGCAACCAAGAAAGG-1,100.0,100.0,AGATTGCCATCTCCCA-1,GCGCAACCAAGAAAGG-1,different
1,962360,AGATTGCCATCTCCCA-1,GCGCAACCAAGAAAGG-1,100.0,100.0,AGATTGCCATCTCCCA-1,GCGCAACCAAGAAAGG-1,different
2,962361,AGATTGCCATCTCCCA-1,GCGCAACCAAGAAAGG-1,100.0,100.0,AGATTGCCATCTCCCA-1,GCGCAACCAAGAAAGG-1,different
3,962362,AGATTGCCATCTCCCA-1,GCGCAACCAAGAAAGG-1,100.0,100.0,AGATTGCCATCTCCCA-1,GCGCAACCAAGAAAGG-1,different
4,2059270,ACGCCGACAGTACACT-1,TGAGAGGGTCAGAGGT-1,100.0,100.0,ACGCCGACAGTACACT-1,TGAGAGGGTCAGAGGT-1,different


In [None]:
df_shared.to_excel(outfile)
print("Wrote", outfile, "to disk")

## Visualize it

In [None]:
df_shared = df_shared[(df_shared["impact_source"] != 0) & (df_shared["impact_target"] != 0)]
df_shared = df_shared.reset_index()
df_shared.head()

In [None]:
def makeScatter(x,y,txt,color,xlabel,ylabel,plotfile):
    fig, ax = plt.subplots(figsize=(15,15))
    ax.scatter(x, y, color=color)

    for i, txt in enumerate(txt):
        ax.annotate(txt, (x[i], y[i]))
        
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    
    # Add red dotted line
    ax.plot([-1, 100], [-1, 100], c='w', linestyle=":")
    
    # Save figure to file
    fig.savefig(plotfile)
    print("Wrote", plotfile, "to disk")

In [None]:
f = lambda x,y: x + "->" + y
df_shared['txt'] = [x for x in map(f, df_shared['source'], df_shared['target'])]
df_shared['color'] = "blue"
df_shared.loc[df_shared["Same patient"] == "same", 'color'] = "purple"
df_shared.head()

In [None]:
plotfile = outfile.replace(".xlsx", "-all.pdf")
makeScatter(df_shared["impact_source"], df_shared["impact_target"], df_shared['txt'], df_shared['color'], "impact source", "impact target", plotfile)

In [None]:
# Only show things that are different
df_shared_filter = df_shared.copy()
df_shared_filter = df_shared_filter[df_shared_filter["Same patient"] == "different"]
cols = ["source", "target", "impact_source", "impact_target", "Same patient", "txt", "color"]
df_shared_filter = df_shared_filter[cols]
df_shared_filter = df_shared_filter.reset_index()
df_shared_filter.head()

In [None]:
plotfile = outfile.replace(".xlsx", "-different-ids.pdf")
makeScatter(df_shared_filter["impact_source"], df_shared_filter["impact_target"], df_shared_filter['txt'], df_shared_filter['color'], "impact source", "impact target", plotfile)

## Calculate common/public clones

In [None]:
df.head()

In [None]:
concat = lambda x: ", ".join(list(set(x)))
df_clone_occurrence = df.groupby('cdr3pep').agg({'Patient': [pd.Series.nunique, concat], 'Sample': [pd.Series.nunique, concat]})

In [None]:
df_clone_occurrence = df_clone_occurrence.rename(columns={'<lambda>': 'names'})
df_clone_occurrence.head()

In [None]:
# Get percentage columns for each sample
df_pivot = df.pivot_table(index='cdr3pep', columns=['Patient','Sample'], values='perc', aggfunc='sum')
df_pivot = df_pivot.reset_index()
df_pivot.head()

In [None]:
df_clone_occurrence = pd.merge(df_clone_occurrence, df_pivot, on='cdr3pep')
df_clone_occurrence = df_clone_occurrence.sort_values(by=[('Patient','nunique'), ('Sample','nunique')], ascending=False)
df_clone_occurrence = df_clone_occurrence.reset_index()
df_clone_occurrence.head()

In [None]:
# Only keep entries that are in 2 or more Samples
print("All cdr3's:", len(df_clone_occurrence))
df_clone_occurrence = df_clone_occurrence[df_clone_occurrence[('Sample','nunique')] > 1]
print("cdr3's in > 1 sample:", len(df_clone_occurrence))

In [None]:
occurrence_file = outfile.replace(".xlsx", "-occurrence.xlsx")
df_clone_occurrence.to_excel(occurrence_file)
print("Wrote", occurrence_file, "to disk")