In [9]:
import pandas as pd
import numpy as np
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px

# Load and clean up data

In [26]:
# Directory that holds both input CSVs for this run.
cn_calls_dir = '/Users/haochen/Desktop/Tapestri_analysis/Tapestri_data_batch2/analysis/copy_number/NB_EM-PS_homdel-CN_calls/RA17_22-homdel_with_normal/nclones=10'

# Unique CN clone profiles; the first CSV column becomes the row index.
cn_profiles_f = cn_calls_dir + '/RA17_22.unique_cn_clone_profiles.csv'
cn_profiles_df = pd.read_csv(cn_profiles_f, index_col=0)

# Per-cell clone assignment; the first two CSV columns become a two-level row index.
cn_clone_assignment_f = cn_calls_dir + '/RA17_22.sample_sc_clone_assignment.updated.csv'
cn_clone_assignment_df = pd.read_csv(cn_clone_assignment_f, index_col=(0,1))

In [3]:
# Count rows per distinct combination of the frame's columns; the output below shows
# a single `clone_id` column, so this is the number of cells assigned to each clone.
cn_clone_assignment_df.value_counts()

clone_id
0           9890
5           2781
2           1991
7           1500
3            788
4            410
1              1
6              1
dtype: int64

## Remove clones that are too small

In [5]:
# Number of cells assigned to each clone, and the total number of cells.
clone_prev_vc = cn_clone_assignment_df.value_counts()
total_cells = clone_prev_vc.sum()

# Minimum fraction of all cells a clone must contain to be retained.
threshold = 0.01
min_cells_per_clone = threshold * total_cells

# `get_level_values(0)` extracts the clone ids from the value_counts index;
# the boolean mask keeps only clones at or above the cell-count cutoff.
clones_to_keep = sorted(clone_prev_vc.index.get_level_values(0)[clone_prev_vc >= min_cells_per_clone].tolist())
clones_to_keep

[0, 2, 3, 4, 5, 7]

In [8]:
# Renumber the retained clones consecutively (0, 1, 2, ...) following the
# order of `clones_to_keep`.
clone_rename_map = {orig_clone: new_id for new_id, orig_clone in enumerate(clones_to_keep)}

# Select the profiles of the retained clones and relabel their index with the new ids.
unique_cn_clone_profiles_df = cn_profiles_df.loc[clones_to_keep].rename(index=clone_rename_map)

# Draw CN clone genome-wide profile

In [10]:
amp_gene_map_f = '/Users/haochen/Desktop/Tapestri_analysis/copy_number/tap_cn_calling/reference/panel3559_analysis.gene_cytoband.formatted.manual_adjusted.txt'

# sep=None with the python engine lets pandas detect the delimiter itself.
amp_gene_map_df = pd.read_csv(
    amp_gene_map_f,
    index_col='amplicon_number',
    sep=None,
    engine='python',
)

CHROM_COL_NAME = 'chr'
GENE_COL_NAME = 'gene_name'

# --> 0. need to make sure the amplicons are unique
# Build a numeric chromosome column so amplicons sort in genomic order:
# X is mapped to 23, then a leading 'chr' prefix is removed. An anchored
# regex is used because str.strip('chr') strips the *characters* c/h/r from
# both ends rather than the literal prefix.
rename_chrX_map = {'X': '23', 'chrX': '23', }
amp_gene_map_df['chr_num'] = (
    amp_gene_map_df[CHROM_COL_NAME]
    .map(lambda x: rename_chrX_map.get(x, x))
    .str.replace(r'^chr', '', regex=True)
    .astype(int)
)

# Order amplicons by genomic position and keep the first of any that share
# the same (chromosome, insert_start).
amp_gene_map_df = (
    amp_gene_map_df
    .sort_values(by=['chr_num', 'insert_start'])
    .drop_duplicates(subset=['chr_num', 'insert_start'], keep='first')
)
print(f'after removing duplicated amplicons, {amp_gene_map_df.shape[0]} amplicons remain')

unique_amplicon_order = amp_gene_map_df.index.values
gene_names = amp_gene_map_df.loc[unique_amplicon_order, GENE_COL_NAME].values

# plot gene names covered by at least 3 amplicons
gene_amplicon_counts = pd.Series(gene_names).value_counts()
gene_names_to_plot = gene_amplicon_counts[gene_amplicon_counts >= 3].index

after removing duplicated amplicons, 596 amplicons remain


In [11]:
# Clone ids as they appear in the profile table's index.
cluster_labels = unique_cn_clone_profiles_df.index.values

# Each clone id is used as a position into plotly's Set3 qualitative palette,
# so the ids must be valid integer indices for that list.
set3_colors = np.array(px.colors.qualitative.Set3)
cn_clone_palette = {label: color for label, color in zip(cluster_labels, set3_colors[cluster_labels])}
cluster_colors = [cn_clone_palette[label] for label in cluster_labels]

###########  ----- CN cluster profiles -----  ################
# 2x2 grid with shared axes: a thin first row and a thin first column
# (1/25 of the figure each) next to the large main panel.
subplot_grid = dict(
    rows=2, cols=2,
    shared_yaxes=True, shared_xaxes=True,
    horizontal_spacing=0.01,
    vertical_spacing=0.01,
    column_widths=[1 / 25, 24 / 25],
    row_heights=[1 / 25, 24 / 25],
)
fig = make_subplots(**subplot_grid)

# One row position per clone profile.
n_clone_rows = unique_cn_clone_profiles_df.shape[0]
clone_row_positions = np.arange(n_clone_rows)

# Single-column strip (row 2, col 1) coloured by clone id; hover shows the id.
labs = go.Heatmap(
    z=cluster_labels,
    y=clone_row_positions,
    x=[0] * n_clone_rows,
    customdata=cluster_labels,
    colorscale=cluster_colors,
    hovertemplate='label: %{customdata}<extra></extra>',
    showlegend=False,
    showscale=False,
)
fig.add_trace(labs, row=2, col=1)

# @HZ 12/20/2022: both ticktext and tickvals are needed to manually draw the ticklabels, otherwise it won't show up
fig.layout.yaxis3.ticktext = cluster_labels
fig.layout.yaxis3.tickvals = np.arange(n_clone_rows)

# Draw gene names
# For every amplicon (in plotting order) get the id of its gene, plus each
# gene's first position and amplicon count (used further down for tick placement).
un_genes, idx_genes, inv_genes, cnts_genes = np.unique(gene_names, return_index=True, return_inverse=True, return_counts=True)

# Build a 0/1 band that flips each time the gene changes between neighbouring
# amplicons, so adjacent genes get alternating shades.
gene_col_binary = [0]
for prev_gene, cur_gene in zip(inv_genes[:-1], inv_genes[1:]):
    last_shade = gene_col_binary[-1]
    gene_col_binary.append(last_shade if cur_gene == prev_gene else 1 - last_shade)

# Thin one-row heatmap above the main panel: 0 -> black, 1 -> grey.
# NOTE(review): assumes the profile columns are in the same order (and number)
# as `gene_names` -- confirm against the CN-calling output.
gene_names_subplot = go.Heatmap(
    z = gene_col_binary,
    x = unique_cn_clone_profiles_df.columns,
    y = [0] * unique_cn_clone_profiles_df.shape[1],
    colorscale = [[0, 'rgb(0,0,0)'], [1, 'rgb(144,144,144)']],
    showlegend=False,
    showscale=False,
)
fig.add_trace(gene_names_subplot, row=1, col=2)

# Draw main heatmap
# The hover template prints %{customdata} in its <extra> box, so every cell
# needs a customdata entry: give each row its clone label, repeated across
# all columns to match the shape of z.
n_profile_cols = unique_cn_clone_profiles_df.shape[1]
clone_label_grid = [[label] * n_profile_cols for label in cluster_labels]

vals = go.Heatmap(
    z=unique_cn_clone_profiles_df,
    y=np.arange(unique_cn_clone_profiles_df.shape[0]),
    x=unique_cn_clone_profiles_df.columns,
    customdata=clone_label_grid,
    # colours come from the figure-level shared colour axis
    coloraxis='coloraxis',
    hovertemplate='%{z:.2f}<br>%{x}<extra>%{customdata}</extra>',
    showlegend=False,
    showscale=False
)
fig.add_trace(vals, row=2, col=2)

# draw gene names
# One tick per gene, placed at (first amplicon position + half its amplicon count).
genes_ticks = (idx_genes + cnts_genes / 2).astype(int)

# Only genes listed in `gene_names_to_plot` get a visible label; the rest get
# an empty string so the tick is still drawn.
gene_tick_labels = [
    gene if gene in gene_names_to_plot else ""
    for gene in gene_names[genes_ticks]
]

fig.layout.xaxis2.ticktext = gene_tick_labels
fig.layout.xaxis2.tickvals = genes_ticks
fig.layout.xaxis2.tickfont = {'size': 8}
fig.update_layout(
    xaxis2=dict(ticklen=4, side='top', tickangle=-60, showticklabels=True)
)

# draw chromosome numbers
# Chromosome label of every amplicon, in plotting order.
chromosome_ordered = amp_gene_map_df.loc[unique_amplicon_order, CHROM_COL_NAME].values

# First position and amplicon count of each chromosome; the tick sits at
# (first position + half the count).
un, ind, cnts = np.unique(chromosome_ordered, return_index=True, return_counts=True)
ticks = (ind + cnts / 2).astype(int)

fig.update_layout(
    xaxis4=dict(
        ticktext=chromosome_ordered[ticks],
        tickvals=ticks,
        tickangle=-45,
        showticklabels=True,
    )
)

# Thin vertical separator just before the first amplicon of each chromosome.
for chrom_start in ind:
    fig.add_vline(chrom_start - 0.5, line_color='lightcyan', line_width=1, row=2, col=2)
    
######################################################
# update color schemes
# Discrete colour scale: the colour axis spans [-1, num_vals-2] and is cut
# into num_vals equal bands, one flat colour per band.
num_vals = 7
band_colors = [
    'rgb(163, 163, 163)',  # NA
    'rgb(0,0,0)',          # 0
    'rgb(7, 95, 237)',     # 1
    'rgb(146, 170, 209)',  # 2
    'rgb(237, 156, 57)',   # 3
    'rgb(242, 29, 22)',    # 4
    'rgb(202, 82, 250)',   # 5+
]

# Each band contributes two stops with the same colour (its lower and upper
# edge), which yields hard steps instead of a gradient.
colorscale = []
for band_idx, band_color in enumerate(band_colors):
    colorscale.append((band_idx / num_vals, band_color))
    colorscale.append(((band_idx + 1) / num_vals, band_color))

# Tick labels: 'NA', then 0..num_vals-3, then the last value with a '+' suffix.
colorbar_ticktext = ['NA'] + [str(v) for v in range(num_vals - 1)]
colorbar_ticktext[-1] = colorbar_ticktext[-1] + '+'

# Tick positions at the centre of each band on the [-1, num_vals-2] axis.
colorbar_tickvals = [(num_vals-1)/(num_vals*2) * (2*i + 1) - 1 for i in range(num_vals)]

fig.update_layout(
    coloraxis=dict(
        colorscale=colorscale,
        colorbar=dict(
            tickvals=colorbar_tickvals,
            ticktext=colorbar_ticktext,
            title=dict(
                font=dict(size=10),
                text='total_copy_number',
            ),
        ),
        cmax=num_vals-2,
        cmin=-1,
    ),
    font=dict(family='Arial'),
)

In [13]:
# Export the clone-profile heatmap as a wide (1600x400) PDF.
cn_clone_profiles_pdf = '/Users/haochen/Desktop/Tapestri_analysis/Tapestri_data_batch2/analysis/copy_number/NB_EM-PS_homdel-CN_calls/RA17_22-homdel_with_normal/nclones=10/RA17_22.cn_clone_profiles.updated.pdf'
fig.write_image(cn_clone_profiles_pdf, width=1600, height=400)

# Draw CN clone per-sample composition plot

In [22]:
# Display the per-cell clone assignment table (indexed by sample and cell barcode,
# with a single `clone_id` column, as shown in the output below).
cn_clone_assignment_df

Unnamed: 0_level_0,cell_barcode,clone_id
sample,Unnamed: 1_level_1,Unnamed: 2_level_1
lung met (R),cell_206,5
lung met (R),cell_405,7
lung met (R),cell_663,5
lung met (R),cell_906,7
lung met (R),cell_443,2
...,...,...
RA18_18-11_1,cell_742,0
RA18_18-11_1,cell_824,0
RA18_18-11_1,cell_885,0
RA18_18-11_1,cell_539,0


In [55]:
# Sample id -> organ-site label used in the figures.
sample_organ_site_map = {
    'RA17_22-06_2': 'lung met (L)', 
    'RA17_22-27_2': 'peritoneum met', 
    'RA17_22-32_1': 'pancreas S2',
    'RA17_22-35_1': 'pancreas S5', 
    'RA17_22-39_6': 'liver met', 
    'RA17_22-42_2': 'retroperitoneum met',
    'RA17_22-11_1': 'diaphragm met',
    'RA17_22-33_2': 'pancreas S3',
    'RA17_22-04_1': 'lung met (R)',
    'RA18_18-11_1': 'unmatched normal control',
}

cn_clone_assignment_df_updated = cn_clone_assignment_df.reset_index()
# Relabel samples, but leave any label that is not a key of the map untouched.
# A plain `.map(dict)` would turn such labels into NaN, and the table can
# already contain organ-site names (e.g. 'lung met (R)').
cn_clone_assignment_df_updated['sample'] = cn_clone_assignment_df_updated['sample'].map(
    lambda sample_name: sample_organ_site_map.get(sample_name, sample_name)
)

# Clones that were filtered out for being too small are folded into clone 0,
# every retained clone maps to itself.
# NOTE(review): this presumes clone 0 is the normal clone and is itself in
# `clones_to_keep`; otherwise step 2 below yields NaN for the folded cells.
clones_to_remove = set(cn_clone_assignment_df_updated['clone_id'].unique()) - set(clones_to_keep)
clone_swap_map = {i: 0 for i in clones_to_remove}
clone_swap_map.update({i: i for i in clones_to_keep})
cn_clone_assignment_df_updated['clone_id'] = cn_clone_assignment_df_updated['clone_id'].map(clone_swap_map) # 1. swap out the smaller clones to normal
cn_clone_assignment_df_updated['clone_id'] = cn_clone_assignment_df_updated['clone_id'].map(clone_rename_map) # 2. rename for consistency

In [56]:
# Display the relabelled table: one row per cell with its sample label and renumbered clone id.
cn_clone_assignment_df_updated

Unnamed: 0,sample,cell_barcode,clone_id
0,lung met (R),cell_206,4
1,lung met (R),cell_405,5
2,lung met (R),cell_663,4
3,lung met (R),cell_906,5
4,lung met (R),cell_443,1
...,...,...,...
17357,unmatched normal control,cell_742,0
17358,unmatched normal control,cell_824,0
17359,unmatched normal control,cell_885,0
17360,unmatched normal control,cell_539,0


In [57]:
# Stacked bar per sample; `barnorm='percent'` rescales every bar to 100 so the
# segments show the share of each clone within that sample.
fig = px.histogram(
    cn_clone_assignment_df_updated,
    x='sample',
    color='clone_id',
    color_discrete_map=cn_clone_palette,
    barnorm='percent',
    category_orders={"clone_id": sorted(cn_clone_palette)},
)

fig.update_layout(
    legend=dict(title='cluster ID', traceorder='normal'),
    xaxis=dict(title='sample name'),
    yaxis=dict(title='percent of cells in each sample'),
    width=600,
    height=600,
)
fig.update_xaxes(
    tickangle=60,
    tickfont=dict(family='Arial', color='black'),
)

# Save the composition plot as a 600x600 PDF.
composition_pdf = '/Users/haochen/Desktop/Tapestri_analysis/Tapestri_data_batch2/analysis/copy_number/NB_EM-PS_homdel-CN_calls/RA17_22-homdel_with_normal/nclones=10/RA17_22.sample_CN-cluster_composition.updated.pdf'
fig.write_image(composition_pdf, width=600, height=600)

# Draw pie charts

In [58]:
# Index the relabelled assignments by sample so that each sample's cells can be selected with `.loc`.
# NOTE(review): a later cell rebinds this same name to a table read from a TP6 CSV -- confirm which input the pie charts should use.
cn_clone_assignment_df_updated_for_pie = cn_clone_assignment_df_updated.set_index('sample')

In [11]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path


# load clone assignment df
# Reads a per-cell clone assignment CSV (the path points at a TP6 run) with the
# first CSV column as the index; the pie-chart loop uses that index as the sample key.
# NOTE(review): this rebinds `cn_clone_assignment_df_updated_for_pie`, replacing the
# frame built from the RA17_22 data earlier in the notebook.

f = '/Users/haochen/Desktop/Tapestri_analysis/Tapestri_data_batch2/copy_number/NB_EM-PS_homdel-CN_calls/TP6-homdel_with_normal_from_scratch/TP6.sample_sc_clone_assignment.updated.csv'
cn_clone_assignment_df_updated_for_pie = pd.read_csv(f, index_col=0)

In [13]:
# Pie charts are written to a `pie/` folder next to the input CSV.
wd = Path(f).parent / 'pie'
wd.mkdir(exist_ok=True, parents=True)

cell_assignment_dfs = {}

# Sorted unique clone ids across all samples (np.unique already returns them sorted).
unique_cluster_ids_sorted = np.unique(cn_clone_assignment_df_updated_for_pie['clone_id'])

# Each clone id indexes into plotly's Set3 qualitative palette, so every clone
# keeps the same colour in every pie.
cn_clone_palette = dict(zip(unique_cluster_ids_sorted, np.array(px.colors.qualitative.Set3)[unique_cluster_ids_sorted]))

for sample_i in cn_clone_assignment_df_updated_for_pie.index.unique():
    # The list selector `[[sample_i]]` always yields a Series of clone ids, even
    # when the sample has a single cell (a scalar label would collapse that case).
    # Counts are sorted by clone id so labels, values and colours share one order.
    clone_counts = (
        cn_clone_assignment_df_updated_for_pie.loc[[sample_i], 'clone_id']
        .value_counts()
        .sort_index()
    )

    # Labels are taken from the counts' index: the previous `.unique().sort()`
    # sorted in place and returned None, so the pie received no labels at all.
    fig = go.Figure(data=[go.Pie(
        labels=clone_counts.index.to_numpy(),
        values=clone_counts.to_numpy(),
        hole=.3
    )])
    colors = [cn_clone_palette[clone_i] for clone_i in clone_counts.index]
    fig.update_traces(hoverinfo='label+percent', textinfo='value', textfont_size=20,
                    marker=dict(colors=colors, line=dict(color='#000000', width=2)))
    # Transparent background and no legend, so the pie can be overlaid on other artwork.
    fig.update_layout(
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        showlegend=False,
    )
    fig.write_image(str(wd /f"{sample_i}_clone_compo_pie.png"), scale=2)
