In [1]:
import pandas as pd
import holoviews as hv
from holoviews import opts, dim
from bokeh.sampledata.les_mis import data
from sklearn.metrics.pairwise import pairwise_distances

# Load both HoloViews plotting backends. The matplotlib call runs last, so it
# is presumably the backend used for rendering below — TODO confirm.
hv.extension('bokeh')
hv.extension('matplotlib')
# Enlarge rendered output (HoloViews documents `size` as a percentage).
hv.output(size=400)

## Try with mock data

In [2]:
# Mock repertoire: one (sample, clone, frequency) record per row.
mock_rows = [
    ('S1', 'd', 10), ('S1', 'e', 5),
    ('S2', 'e', 20), ('S2', 'a', 2),
    ('S3', 'f', 5), ('S3', 'g', 1),
    ('S4', 'f', 9), ('S4', 'a', 3), ('S4', 'd', 1),
    ('S5', 'a', 30), ('S5', 'b', 10), ('S5', 'h', 2),
    ('S6', 'c', 30), ('S6', 'b', 20), ('S6', 'a', 10),
]
# Unzip the records into the three parallel column lists.
samples, cdr3, freq = (list(column) for column in zip(*mock_rows))
df = pd.DataFrame({'Sample': samples, 'cdr3pep': cdr3, 'freq': freq})
df.head()

Unnamed: 0,Sample,cdr3pep,freq
0,S1,d,10
1,S1,e,5
2,S2,e,20
3,S2,a,2
4,S3,f,5


In [3]:
# Clone-by-sample frequency matrix; combinations that never occur become 0.
# pivot_table aggregates duplicate (cdr3pep, Sample) pairs with its default
# aggregation (mean).
df_pivot = (
    df.pivot_table(index='cdr3pep', columns='Sample', values='freq')
      .fillna(0)
)
df_pivot.head()

Sample,S1,S2,S3,S4,S5,S6
cdr3pep,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
a,0.0,2.0,0.0,3.0,30.0,10.0
b,0.0,0.0,0.0,0.0,10.0,20.0
c,0.0,0.0,0.0,0.0,0.0,30.0
d,10.0,0.0,0.0,1.0,0.0,0.0
e,5.0,20.0,0.0,0.0,0.0,0.0


In [4]:
# Bray-Curtis distance between samples (the rows of the transposed pivot),
# turned into a similarity as 1 - distance and labelled by sample.
bc_distance = pairwise_distances(df_pivot.T, metric="braycurtis")
sample_labels = df_pivot.columns
sim = pd.DataFrame(1 - bc_distance, index=sample_labels, columns=sample_labels)
sim.head()

Sample,S1,S2,S3,S4,S5,S6
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
S1,1.0,0.27027,0.0,0.071429,0.0,0.0
S2,0.27027,1.0,0.0,0.114286,0.0625,0.04878
S3,0.0,0.0,1.0,0.526316,0.0,0.0
S4,0.071429,0.114286,0.526316,1.0,0.109091,0.082192
S5,0.0,0.0625,0.0,0.109091,1.0,0.392157


In [5]:
# Express the similarity as a percentage.
# NOTE: this rebinds `sim`, so running the cell again scales the values a
# second time; rebuild `sim` from the previous cell before re-running.
sim = sim.mul(100)
sim.head()

Sample,S1,S2,S3,S4,S5,S6
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
S1,100.0,27.027027,0.0,7.142857,0.0,0.0
S2,27.027027,100.0,0.0,11.428571,6.25,4.878049
S3,0.0,0.0,100.0,52.631579,0.0,0.0
S4,7.142857,11.428571,52.631579,100.0,10.909091,8.219178
S5,0.0,6.25,0.0,10.909091,100.0,39.215686


In [6]:
# Nodes: one per sample, in the column order of the similarity matrix.
samples = sim.columns.tolist()
# Hard-coded group label per sample; the six entries must line up with `samples`.
group = ['a', 'a', 'b', 'b', 'c', 'c']
inx = list(range(len(samples)))
nodes = pd.DataFrame({'index': inx, 'group': group, 'name': samples})
nodes.head()

Unnamed: 0,index,group,name
0,0,a,S1
1,1,a,S2
2,2,b,S3
3,3,b,S4
4,4,c,S5


In [7]:
# Links: one per unordered pair of nodes (i < j), weighted by the similarity
# percentage truncated to an integer.
node_ids = nodes['index']
node_names = nodes['name']
pairs = [
    (first, second)
    for first in range(len(nodes) - 1)
    for second in range(first + 1, len(nodes))
]
sources = [node_ids[first] for first, _ in pairs]
targets = [node_ids[second] for _, second in pairs]
values = [int(sim[node_names[first]][node_names[second]]) for first, second in pairs]
links = pd.DataFrame({'source': sources, 'target': targets, 'value': values})
links.head()

Unnamed: 0,source,target,value
0,0,1,27
1,0,2,0
2,0,3,7
3,0,4,0
4,0,5,0


In [8]:
# Wrap the node table in a HoloViews Dataset keyed on the 'index' column.
# NOTE(review): this rebinds `nodes` from a DataFrame to an hv.Dataset, so the
# link-building cell above sees a different type if it is re-run after this
# one; rebuild the node DataFrame first.
nodes = hv.Dataset(nodes, 'index')
nodes.data.head()

Unnamed: 0,index,group,name
0,0,a,S1
1,1,a,S2
2,2,b,S3
3,3,b,S4
4,4,c,S5


In [9]:
# Build the chord diagram from the (links, nodes) pair, then keep only links
# whose 'value' falls in the range starting at 5.
chord = hv.Chord((links, nodes))
chord = chord.select(value=(5, None))

# Colour each edge by its source node and each node by its own index;
# label the nodes with the sample name.
chord_style = opts.Chord(
    cmap='Category20',
    edge_cmap='Category20',
    edge_color=dim('source').str(),
    labels='name',
    node_color=dim('index').str(),
)
chord.opts(chord_style)

## With real dataset

In [10]:
# Load the clone table; it is read as tab-separated despite the .csv extension.
# This rebinds `df`, replacing the mock frame built at the top of the notebook.
df = pd.read_csv("cdr3-clones-GC-IGH_HUMAN-after-reassignment.csv", sep="\t")
df.head()

Unnamed: 0,Sample,MID,cdr3pep,freq,uniq_umis,V_sub,J_sub,sum_sites,avg_sites,read_perc,umi_perc
0,MS-200109-1_S1,nomatch,CAHGMARPRQWDTVVDFDYWGQGTLVT,595,1,IGHV2-5,IGHJ4,38,0.063866,0.844655,0.006668
1,MS-200109-1_S1,nomatch,CARSLDPLDFQHWGQGTLVT,310,1,IGHV1-18,"IGHJ1,IGHJ4",29,0.093548,0.440072,0.006668
2,MS-200109-1_S1,nomatch,CARGRGSGSWNFDYWGQGTLVT,262,1,IGHV1-8,IGHJ4,199,0.759542,0.371932,0.006668
3,MS-200109-1_S1,nomatch,CAHSSVVIVLHAFDIWGQGTMVT,240,1,IGHV2-5,IGHJ3,15,0.0625,0.340701,0.006668
4,MS-200109-1_S1,nomatch,CARPSSSYSSSLDYWGQGTLVT,220,1,IGHV5-51,IGHJ4,19,0.086364,0.312309,0.006668


In [11]:
# Distinct sample names, in order of first appearance in the table.
samples = df["Sample"].unique().tolist()
samples

['MS-200109-1_S1',
 'MS-200109-2_S2',
 'MS-200109-3_S3',
 'MS-200109-4_S4',
 'MS-200109-5_S5',
 'MS-200109-6_S6',
 'MS-200109-7_S7',
 'MS-200109-8_S8']

In [12]:
def circos_vj(sample, df, min_clones=5):
    """Draw the V-J gene pairing chord diagram of one sample and save it as PNG.

    Nodes are the distinct ``V_sub`` and ``J_sub`` values found in the sample.
    Each link joins a V node to a J node; its value is the number of rows
    (clones) carrying that combination, not the summed ``freq``.

    Parameters
    ----------
    sample : str
        Value of the ``Sample`` column to plot. Also used as the base name of
        the output file (``sample + '.png'``).
    df : pandas.DataFrame
        Clone table with at least the columns ``Sample``, ``V_sub`` and
        ``J_sub``.
    min_clones : int, optional
        Lower bound of the value range passed to ``Chord.select``; links with
        fewer clones are filtered out. Defaults to 5, the threshold that was
        previously hard-coded.

    Returns
    -------
    None
        The function works by side effect: it saves the figure with
        ``hv.save`` and prints a confirmation line.
    """
    # Select sample from dataframe
    df_selection = df[df["Sample"] == sample]

    # Distinct V and J labels. groupby sorts the labels and skips missing
    # values, the same rule that decides which V-J pairs become links below.
    v_names = list(df_selection.groupby("V_sub").size().index)
    j_names = list(df_selection.groupby("J_sub").size().index)

    # Node table: V nodes first, then J nodes, numbered consecutively.
    nodes_df = pd.DataFrame({
        "index": range(len(v_names) + len(j_names)),
        "name": v_names + j_names,
        "group": ["V"] * len(v_names) + ["J"] * len(j_names),
    })
    nodes = hv.Dataset(nodes_df, 'index')

    # Check how many times a V-J combination was found (not frequency, but in terms of clones)
    links = (
        df_selection.groupby(by=["V_sub", "J_sub"])
        .size()
        .reset_index(name="value")
        .rename(columns={"V_sub": "source", "J_sub": "target"})
    )

    # Convert source and target labels to node numbers. V and J are resolved
    # through separate dictionaries, so a label that occurs in both columns
    # still maps to exactly one node on each side, and every lookup is a
    # constant-time dict access instead of a scan over the node table.
    v_index = {name: pos for pos, name in enumerate(v_names)}
    j_index = {name: len(v_names) + pos for pos, name in enumerate(j_names)}
    links["source"] = links["source"].map(v_index)
    links["target"] = links["target"].map(j_index)

    # Make the Circos figure
    chord = hv.Chord((links, nodes)).select(value=(min_clones, None))
    chord.opts(
        opts.Chord(cmap='Category20', edge_cmap='Category20', edge_color=dim('source').str(),
                   labels='name', node_color=dim('index').str()))
    out_file = sample + '.png'
    hv.save(chord, out_file, fmt='png')
    print("Wrote", out_file, "to disk")

In [13]:
# Render one V-J chord diagram per sample; circos_vj saves '<sample>.png'
# and prints a confirmation line for each.
for sample in samples:
    circos_vj(sample, df)

Wrote MS-200109-1_S1.png to disk
Wrote MS-200109-2_S2.png to disk
Wrote MS-200109-3_S3.png to disk
Wrote MS-200109-4_S4.png to disk
Wrote MS-200109-5_S5.png to disk
Wrote MS-200109-6_S6.png to disk
Wrote MS-200109-7_S7.png to disk
Wrote MS-200109-8_S8.png to disk
