# Sample Contamination
This notebook contains developmental code for the analysis of contaminants in samples using barcode read data. Our current objective is to generate visualizations that can be used to:
* display barcode reads across samples
* identify potential contamination within samples
* display barcode reads across contaminated samples
* display relative frequency of unknown reads across samples

In [1]:
import numpy as np
import pandas as pd
from contaminant_utils import *
from path import Path

In [2]:
pd.__version__

'1.0.5'

In [3]:
sample_pths = !find /home/gk/analysis/ -name "barcode_counts" -type d
sample_pths = [Path(p)/'illumina' for p in sample_pths]

In [10]:
sample_pths[5]

Path('/home/gk/analysis/2020.06.14.hCoV19/barcode_counts/illumina')

In [4]:
data_pth = sample_pths[4]

input_pths = data_pth.listdir()
# print(input_pths)
num_samples = len(input_pths)
# print(f"There are {num_samples} total samples in this study")

In [5]:
# grab data from each sample
# %debug
ans = load_all_data(input_pths)
# generate paired reads ("<forward>-<reverse>") with a vectorised string concat
# instead of a Python-level call per row
ans['paired_read'] = ans['forward_barcode'] + '-' + ans['reverse_barcode']
# compute log of read counts; the +1 maps a count of 0 to log(1) = 0 instead of -inf
ans['log_count'] = np.log(ans['paired_read_count'] + 1)

# print(ans.shape)

In [6]:
# generate heatmap matrix of raw read counts: one row per sample, one column per
# paired read (`paired_read_count` is pivoted here, not `log_count`)
hmap = (ans.pivot_table(index=["sample"], columns=["paired_read"], values="paired_read_count")
           .replace([np.inf, -np.inf], np.nan)
           .fillna(0))


# drop unknown-unknown reads only; errors='ignore' keeps the cell working for a run
# that has no unknown-unknown column
hmap = hmap.drop(columns='unknown-unknown', errors='ignore')


# grab read counts
counts = hmap.values

# normalize counts per sample (to address visualization issue) - IGNORE IF USING LOG
summed_counts = counts.sum(axis=1)[:, np.newaxis]
# normalize counts [OPTIONAL]; rows that sum to 0 are divided by 1 to avoid 0/0
counts = counts / np.where(summed_counts > 0, summed_counts, 1)


# prepare data for identifying potential contaminants:
# per sample, the distinct forward and reverse barcodes
flag = (ans.groupby('sample')
           .agg(uniq_forward_bcodes = ('forward_barcode', get_unique_barcodes),
                uniq_reverse_bcodes = ('reverse_barcode', get_unique_barcodes)))

# create boolean column that identifies potential contamination
flag['contamination'] = flag.apply(is_contaminant, axis=1)
# getting only samples with potential contamination, not used later but useful to look at
# contaminants = flag[flag['contamination']==True]
# boolean contamination flag per sample, in the same row order as `hmap` / `counts`
contamination = hmap.join(flag, how='inner')['contamination'].astype(bool)
# numeric version of the flag for plotting: flagged samples get the largest value in
# `counts`, all others the smallest, so the extra column stays within the count range
contaminants_flag = pd.Series(np.where(contamination, counts.max(), counts.min()),
                              index=contamination.index, name='contamination')
# add contaminant flag column to the read counts; go through the underlying array
# because multi-dimensional indexing of a Series (`s[:, np.newaxis]`) is deprecated
data = np.hstack((counts, contaminants_flag.to_numpy()[:, np.newaxis]))
# list of all sample IDs
x = hmap.index.values
# list of all paired reads and an extra flag column for contamination
y = hmap.columns.tolist() + ['contamination']

# heatmap of barcode read counts for all samples
general_hmap = generate_heatmap(data, x, y)

# choosing only samples with potential contaminants; rows are selected with the boolean
# flag itself rather than by comparing the numeric flag column against 0
contaminated_rows = contamination.to_numpy()
contaminated_data = counts[contaminated_rows]
# paired reads with at least one non-zero count among the flagged samples
nonzero_reads = ~np.all(contaminated_data == 0, axis=0)
# list of paired reads observed in the potentially contaminated samples
cont_y = hmap.columns.values[nonzero_reads].tolist()
# data (read counts) for contaminated samples only
contaminated_data = contaminated_data[:, nonzero_reads]
# list of samples with potential contamination only
cont_x = hmap.index.values[contaminated_rows].tolist()
# heatmap of barcode read counts for contaminated samples only
cont_hmap = generate_heatmap(contaminated_data, cont_x, cont_y)

# def generate_table(ans, contaminated_samples):
#     return (ans.loc[(ans['sample'].isin(contaminated_samples)) & (ans['paired_read_count'] > 0)]
#                  .set_index(['sample', 'paired_read'])
#                  .sort_index()[['paired_read_count']])

cont_table = generate_table(ans, cont_x)

In [7]:
tmp = generate_html(general_hmap, cont_hmap, cont_table.to_html())

In [8]:
save_html(tmp, "test3.html")

In [42]:
cont_table

{'SEARCH-2017-SAN': 'BF3-unknown\t5.0\nunknown-BR13\t4.0\nunknown-BR20\t1.0',
 'SEARCH-2070-SAN': 'BF6-BR9\t671.0\nBF6-unknown\t10562.0\nBF7-unknown\t1.0\nunknown-BR9\t7441.0',
 'SEARCH-2086-SAN': 'BF7-unknown\t2.0\nunknown-BR11\t4.0\nunknown-BR17\t1.0',
 'SEARCH-2109-SAN': 'BF6-BR14\t1.0\nBF6-BR20\t1.0\nBF6-unknown\t38.0\nBF7-unknown\t1.0\nunknown-BR14\t27.0',
 'SEARCH-2110-SAN': 'BF7-BR14\t1717.0\nBF7-unknown\t54188.0\nunknown-BR14\t45765.0\nunknown-BR17\t1.0',
 'SEARCH-2112-SAN': 'BF1-BR15\t22.0\nBF1-unknown\t402.0\nunknown-BR14\t1.0\nunknown-BR15\t130.0',
 'SEARCH-2125-SAN': 'BF6-BR16\t296.0\nBF6-unknown\t2465.0\nBF7-unknown\t1.0\nunknown-BR16\t3408.0',
 'SEARCH-2135-SAN': 'BF7-unknown\t1.0\nBF8-BR17\t230.0\nBF8-unknown\t3671.0\nunknown-BR17\t2158.0',
 'SEARCH-2141-SAN': 'BF6-BR16\t320.0\nBF6-BR18\t7.0\nBF6-unknown\t3068.0\nunknown-BR16\t4142.0\nunknown-BR18\t72.0',
 'SEARCH-2142-SAN': 'BF6-unknown\t1.0\nBF7-BR18\t2129.0\nBF7-unknown\t23859.0\nunknown-BR18\t31074.0',
 'SEARCH-2150-

In [40]:
for x,y in tmp.items():
    print(str(x)+'\t'+str(y))

BF3-unknown	5.0
unknown-BR13	4.0
unknown-BR20	1.0


In [24]:
cont_table.split('\n')

['paired_read',
 'BF3-unknown     5.0',
 'unknown-BR13    4.0',
 'unknown-BR20    1.0',
 'Name: SEARCH-2017-SAN, dtype: float64paired_read',
 'BF6-BR9          671.0',
 'BF6-unknown    10562.0',
 'BF7-unknown        1.0',
 'unknown-BR9     7441.0',
 'Name: SEARCH-2070-SAN, dtype: float64paired_read',
 'BF7-unknown     2.0',
 'unknown-BR11    4.0',
 'unknown-BR17    1.0',
 'Name: SEARCH-2086-SAN, dtype: float64paired_read',
 'BF6-BR14         1.0',
 'BF6-BR20         1.0',
 'BF6-unknown     38.0',
 'BF7-unknown      1.0',
 'unknown-BR14    27.0',
 'Name: SEARCH-2109-SAN, dtype: float64paired_read',
 'BF7-BR14         1717.0',
 'BF7-unknown     54188.0',
 'unknown-BR14    45765.0',
 'unknown-BR17        1.0',
 'Name: SEARCH-2110-SAN, dtype: float64paired_read',
 'BF1-BR15         22.0',
 'BF1-unknown     402.0',
 'unknown-BR14      1.0',
 'unknown-BR15    130.0',
 'Name: SEARCH-2112-SAN, dtype: float64paired_read',
 'BF6-BR16         296.0',
 'BF6-unknown     2465.0',
 'BF7-unknown      

In [28]:
general_hmap.show()

In [29]:
cont_hmap.show()

In [272]:
# identify list of sample runs
# fetch list of sample barcode counts for a given run
# load dataframe containing barcode counts for the run
# generate `general_hmap`
# identify contaminants
# generate `contaminants_hmap`
# generate `contaminants_table`
# generate html string containing results
# save as html file 

In [273]:
# Draft of the HTML report template, kept fully commented out: it references
# `contaminants_hmap` and `contaminants_table`, which this notebook never defines,
# and with only its first line commented the cell was a SyntaxError.
# html_string = '''
# <html>
#     <head>
#         <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.1/css/bootstrap.min.css">
#         <style>body{ margin:0 100; background:whitesmoke; }</style>
#     </head>
#     <body>
#         <h1>Contaminant Analysis Report</h1>
#
#         <!-- *** Section 1 *** --->
#         <h2>Section 1: General Heatmap of Barcode Read Counts</h2>
#         <iframe width="1000" height="550" frameborder="0" seamless="seamless" scrolling="no" \
# src="''' + general_hmap + '''.embed?width=800&height=550"></iframe>
#         <p>The above figure shows a heatmap of the barcode read counts of all samples for the given run 
#         (i.e. sequencing experiment). There is a "contamination column" that indicates whether or not a given 
#         sample has potential contamination.</p>
#
#         <!-- *** Section 2 *** --->
#         <h2>Section 2: Heatmap of Barcode Read Counts of Potentially Contaminated Samples Only</h2>
#         <iframe width="1000" height="1000" frameborder="0" seamless="seamless" scrolling="no" \
# src="''' + contaminants_hmap + '''.embed?width=1000&height=1000"></iframe>
#         <p>The above figure shows a heatmap of the read counts of samples that have been identified as 
#         potentially contaminated. This means that their barcode counts contain more than one distinct 
#         forward and/or reverse barcodes.</p>
#         <h3>Contaminants Table: Barcode Read Counts of Samples with Potential Contamination</h3>
#         ''' + contaminants_table + '''
#     </body>
# </html>'''

# Progress Diary
* create script to generate contaminants analysis reports, automatically
    * report in html format and includes
    * heatmap of overall read counts for a given run
    * heatmap of read counts of samples with potential contamination
    * a table of results from contaminant analysis
* integrate contaminants script into `pipeline_consensus`
    * edit `Snakefile` to include contaminant analysis as a final step
    * validate using a test run on samples provided by Karthik
    * generate contaminant analysis reports on all historical sequencing results 
* how to visualize for two or more samples? [solved]
* we want to compare contaminants across samples [solved]
    * y-axis is samples, x-axis is barcode pairs (keep unk-unk pairs separate)
* how to flag potential for contamination? [solved]
* use PCA to visualize clusters and their similarities [TODO]
* refactor code to utilize Pandas functionality [complete]
* Making plot display intuitive
    * 1st opt: remove unk-unk [complete]
    * 2nd opt: transform logarithmically [complete]
    * 3rd opt: re-arrange axes to have main counts along the diagonal [TODO]
* create table that flags samples with more than one forward OR reverse read [complete]
* we want unk-unk to get a sense of the relationship between sample and frequency of unknowns [need feedback]
* look at unk-unk counts per sample on a separate plot? [need feedback]
* ct value is a measure of the concentration of RNA in the sample prior to processing (the higher the ct value, the lower the concentration) [TBD w/ MZ and KG]
    * assumption: sample with high RNA conc corresponds with low barcode counts
    * can be used to calibrate how many barcodes you'll find

In [274]:
sample_pths = !find /home/gk/analysis/ -name "barcode_counts" -type d
sample_pths = [Path(p)/'illumina' for p in sample_pths]

In [275]:
tmp = sample_pths[0]

In [276]:
tmp

Path('/home/gk/analysis/2020.06.28.hCoV19/barcode_counts/illumina')

In [277]:
file_pths = !find /home/gk/analysis/ -name "barcode_counts" -type d | xargs -n 1 bash -c 'ls -d -1 $0/illumina/'*

In [278]:
len(file_pths)

1840

In [279]:
# file_pths

In [5]:
def load_data(input_file: Path):
    """Read one tab-separated barcode-count file into a DataFrame of strings.

    The first non-blank line is taken as the header and every following
    non-blank line as one data row. Values are not converted; all fields stay
    strings.

    Parameters
    ----------
    input_file : path of the file to read (anything accepted by `open`).

    Returns
    -------
    (df, header) : the DataFrame built from the rows, and the list of column
        names taken from the header line.

    Raises
    ------
    ValueError : if the file contains no non-blank line, i.e. no header.
    """
    with open(input_file, 'r') as f:
        # Strip only the line terminator before splitting on tabs. Dropping the
        # last split field unconditionally would lose the count of a final line
        # that has no trailing newline. Blank lines are skipped so they do not
        # become malformed rows.
        rows = [line.rstrip('\r\n').split('\t') for line in f if line.strip()]
    if not rows:
        raise ValueError(f"{input_file} is empty: no header row found")
    header, data = rows[0], rows[1:]
    df = pd.DataFrame(data=data, columns=header)
    return df, header

def prepare_data(df: pd.DataFrame, header: list):
    """Aggregate one sample's raw barcode counts into a full forward x reverse table.

    Parameters
    ----------
    df : raw rows with string columns `forward_barcode`, `reverse_barcode` and a
        count column named by the last entry of `header`. The frame is not
        modified.
    header : column names of `df`; the last entry names the count column. The
        aggregation itself reads the `paired_read_count` column.

    Returns
    -------
    DataFrame indexed by (`forward_barcode`, `reverse_barcode`) with a
    `paired_read_count` column. It has one row for every combination of the
    forward and reverse barcodes present in the input; combinations that were
    never observed get a count of 0.
    """
    count_col = header[-1]
    # work on a copy so the caller's frame keeps its original values and dtypes
    df = df.copy()
    # convert read counts to integer
    df[count_col] = df[count_col].astype(int)
    # consolidate barcodes and their reverse complements: keep only the text before
    # the first underscore (e.g. "BF1_rc" -> "BF1"), vectorised over each column
    for barcode_col in ('forward_barcode', 'reverse_barcode'):
        df[barcode_col] = df[barcode_col].str.split('_').str[0]
    # merge barcodes and their rcs and sum their read counts
    df = (df.groupby(['forward_barcode', 'reverse_barcode'])
            .agg({'paired_read_count': 'sum'})
            .reset_index())
    # create df with all possible paired combos of forward and reverse barcodes:
    # a constant join key turns the merge into a cross product
    forward_bcodes = df[['forward_barcode']].drop_duplicates().assign(key=0)
    reverse_bcodes = df[['reverse_barcode']].drop_duplicates().assign(key=0)
    all_pairs = forward_bcodes.merge(reverse_bcodes, how='outer', on='key').drop(columns='key')
    # attach the observed counts; pairs without a count become 0
    all_pairs = (all_pairs.merge(df, on=['forward_barcode', 'reverse_barcode'], how='left')
                          .fillna(0)
                          .set_index(['forward_barcode', 'reverse_barcode']))
    return all_pairs

def _consolidate_reverse_complements(x: str):
    x['forward_barcode'] = x['forward_barcode'].split('_')[0]
    x['reverse_barcode'] = x['reverse_barcode'].split('_')[0]
    return x

In [281]:
data_pth = sample_pths[4]

input_pths = data_pth.listdir()
# print(input_pths)
num_samples = len(input_pths)
print(f"There are {num_samples} total samples in this study")

There are 278 total samples in this study


In [282]:
# TEST
df = prepare_data(*load_data(input_pths[4]))
df

Unnamed: 0_level_0,Unnamed: 1_level_0,paired_read_count
forward_barcode,reverse_barcode,Unnamed: 2_level_1
BF7,unknown,22.0
BF7,BR15,0.0
unknown,unknown,224.0
unknown,BR15,14.0


In [6]:
def load_all_data(input_pths: list):
    """Load barcode read data from the given list of sample file paths.

    Each path is read with `load_data` and aggregated with `prepare_data`. The
    resulting rows get a `sample` column, set to the part of the file's basename
    before the first underscore. Files that fail to load are skipped with a
    warning instead of aborting the whole run.

    Parameters
    ----------
    input_pths : paths of the per-sample count files; each must provide a
        `basename()` method.

    Returns
    -------
    DataFrame with the rows of all loaded samples stacked (each sample keeps its
    own 0-based row index), or an empty DataFrame if nothing could be loaded.
    """
    import warnings  # local import: this cell does not control the notebook's import cell

    frames = []
    for sample_pth in input_pths:
        try:
            df = prepare_data(*load_data(sample_pth))
        except Exception as err:
            # best effort: keep going, but say which file was dropped and why
            warnings.warn(f"skipping {sample_pth}: {err!r}")
            continue
        df = (df.reset_index()#[['forward_barcode', 'reverse_barcode']]
                .drop_duplicates())
        df['sample'] = sample_pth.basename().split('_')[0]
        frames.append(df)
    if not frames:
        # pd.concat raises on an empty list; keep the empty-frame result
        return pd.DataFrame()
    # a single concat at the end instead of re-copying the accumulated frame per file
    return pd.concat(frames, axis=0)

In [284]:
# grab data from each sample
# %debug
ans = load_all_data(input_pths)
# generate paired reads ("<forward>-<reverse>") with a vectorised string concat
# instead of a Python-level call per row
ans['paired_read'] = ans['forward_barcode'] + '-' + ans['reverse_barcode']
# compute log of read counts; the +1 maps a count of 0 to log(1) = 0 instead of -inf
ans['log_count'] = np.log(ans['paired_read_count'] + 1)

print(ans.shape)
ans.sort_values('sample').head(8)

(771, 6)


Unnamed: 0,forward_barcode,reverse_barcode,paired_read_count,sample,paired_read,log_count
0,BF2,BR9,1.0,SEARCH-1993-SAN,BF2-BR9,0.693147
1,BF2,unknown,12.0,SEARCH-1993-SAN,BF2-unknown,2.564949
2,unknown,BR9,5.0,SEARCH-1993-SAN,unknown-BR9,1.791759
3,unknown,unknown,92.0,SEARCH-1993-SAN,unknown-unknown,4.532599
0,BF3,BR9,8.0,SEARCH-1994-SAN,BF3-BR9,2.197225
1,BF3,unknown,156.0,SEARCH-1994-SAN,BF3-unknown,5.056246
2,unknown,BR9,103.0,SEARCH-1994-SAN,unknown-BR9,4.644391
3,unknown,unknown,1112.0,SEARCH-1994-SAN,unknown-unknown,7.014814


In [285]:
# ans

In [286]:
# generate heatmap matrix of raw read counts: one row per sample, one column per
# paired read (`paired_read_count` is pivoted here, not `log_count`);
# sample/read combinations with no value are filled with 0
hmap = (ans.pivot_table(index=["sample"], columns=["paired_read"], values="paired_read_count")
           .replace([np.inf, -np.inf], np.nan)
           .fillna(0))

In [287]:
# drop all reads with any unknowns
# hmap = hmap.drop(columns=[col for col in ans.columns if 'unknown' in col.split('-')])

In [288]:
# drop unknown-unknown reads only
hmap = hmap.drop(columns='unknown-unknown')

In [289]:
hmap.head()

paired_read,BF1-BR10,BF1-BR11,BF1-BR12,BF1-BR13,BF1-BR14,BF1-BR15,BF1-BR16,BF1-BR17,BF1-BR18,BF1-BR19,...,unknown-BR12,unknown-BR13,unknown-BR14,unknown-BR15,unknown-BR16,unknown-BR17,unknown-BR18,unknown-BR19,unknown-BR20,unknown-BR9
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SEARCH-1993-SAN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
SEARCH-1994-SAN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,103.0
SEARCH-1995-SAN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,410.0
SEARCH-1996-SAN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.0
SEARCH-1997-SAN,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [290]:
# cont_y

In [291]:
# print(str(hmap.loc[y, hmap.loc[y]!=0]))

In [292]:
counts = hmap.values

# normalize counts per sample (to address visualization issue) - IGNORE IF USING LOG
summed_counts = counts.sum(axis=1)[:, np.newaxis]
# normalize counts [OPTIONAL]
counts = counts / np.where(summed_counts > 0, summed_counts, 1)

### creating table to identify potential contamination
**ASSUMPTION: a given sample has only one forward and one reverse barcode, at most**

In [294]:
def get_unique_barcodes(x):
    """Return the set of distinct values in `x`, leaving out the 'unknown' placeholder."""
    return set(x.unique()) - {'unknown'}


def is_contaminant(x):
    """Flag a sample whose forward or reverse barcode collection has more than one entry.

    `x` is a row (or mapping) with `uniq_forward_bcodes` and `uniq_reverse_bcodes`.
    """
    for barcode_column in ('uniq_forward_bcodes', 'uniq_reverse_bcodes'):
        if len(x[barcode_column]) > 1:
            return True
    return False

# one row per sample: the distinct forward and reverse barcodes seen for it
# ('unknown' is excluded by get_unique_barcodes)
flag = (ans.groupby('sample')
           .agg(uniq_forward_bcodes = ('forward_barcode', get_unique_barcodes),
                uniq_reverse_bcodes = ('reverse_barcode', get_unique_barcodes)))

# create boolean column that identifies potential contamination
# (more than one distinct forward or reverse barcode, see is_contaminant)
flag['contamination'] = flag.apply(is_contaminant, axis=1)
# getting only samples with potential contamination, not used but useful to look at
contaminants = flag[flag['contamination']==True]
# ans = ans.merge(flag, on='sample', how='left')

In [295]:
# TEST - look at samples with potential contamination
# ans.loc[ans['contamination']==True]

In [296]:
# merge with original data to include the contamination flags
contaminants_flag = (hmap.join(flag, how='inner')['contamination']
                         .apply(lambda x: np.where(x==True, counts.max(), counts.min())))

In [297]:
# TEST
flag.head()

Unnamed: 0_level_0,uniq_forward_bcodes,uniq_reverse_bcodes,contamination
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SEARCH-1993-SAN,{BF2},{BR9},False
SEARCH-1994-SAN,{BF3},{BR9},False
SEARCH-1995-SAN,{BF5},{BR9},False
SEARCH-1996-SAN,{BF6},{BR9},False
SEARCH-1997-SAN,{BF1},{BR10},False


### gathering data to generate heatmaps

In [298]:
# append the contamination flag as an extra last column of the count matrix;
# np.asarray is used because multi-dimensional indexing of a Series
# (`s[:, np.newaxis]`) is deprecated
data = np.hstack((counts, np.asarray(contaminants_flag)[:, np.newaxis]))

In [299]:
data.shape, counts.shape, contaminants_flag.shape

((215, 101), (215, 100), (215,))

In [300]:
y = hmap.index.values
x = hmap.columns.tolist() + ['contamination']

In [323]:
len(y), len(x), data.shape

(215, 101, (215, 101))

## Heat Map of Paired Read Counts across all Samples

In [322]:
import plotly.graph_objs as go

heatmap = go.Heatmap(z=data.T, x=y, y=x)
plot = [heatmap]
fig = go.Figure(data = plot)
fig.show()

In [303]:
# import dash
# import dash_core_components as dcc
# import dash_html_components as html

In [304]:
# app = dash.Dash()
# app.layout = html.Div([
#     dcc.Graph(figure=fig)
# ])

# app.run_server(debug=True, use_reloader=False)  # Turn off reloader if inside Jupyter

In [326]:
contaminated_data.shape

(16, 30)

## Heat Map of Paired Read Counts across Samples with Potential Contamination

In [311]:
# choosing only samples with potential contaminants: keep rows whose flag (last column
# of `data`) is non-zero, then drop that flag column
contaminated_data = data[data[:, -1] !=0][:, :-1]
# list of paired reads (columns) with a non-zero count in at least one flagged sample
cont_x = hmap.columns.values[~np.all(contaminated_data == 0, axis=0)].tolist()
# keep only those paired-read columns in the data as well
contaminated_data = contaminated_data[:, ~np.all(contaminated_data == 0, axis=0)]

# list of samples (rows) with potential contamination only
cont_y = hmap.index.values[data[:, -1] !=0].tolist()
# cont_y

In [315]:
len(cont_x), len(cont_y)

(30, 16)

In [316]:
contaminated_data.shape

(16, 30)

In [245]:
# TEST
# one text block per potentially contaminated sample, listing its non-zero paired reads.
# Blocks are joined with a newline so the end of one sample's block does not run into
# the start of the next, and the generator variable does not overwrite the global `y`.
output = '\n'.join(str(hmap.loc[sample, hmap.loc[sample] != 0]) for sample in cont_y)

In [246]:
sample_name = data_pth.split('/')[4]

In [317]:
with open(f"{sample_name}_contaminants.txt", 'w') as f:
    f.write(output)

In [318]:
# print(output)

In [319]:
# cont_y

In [321]:
# import plotly.plotly as py
import plotly.graph_objs as go

heatmap = go.Heatmap(z=contaminated_data.T, x=cont_y, y=cont_x)
plot = [heatmap]
fig = go.Figure(data = plot)
fig.show()

## Visualizing Unk-Unk reads

In [25]:
unk_counts = (ans.pivot_table(index=["sample"], columns=["paired_read"], values="log_count")
                 .replace([np.inf, -np.inf], np.nan)
                 .fillna(0)
                 #.astype(int)
                 .apply(lambda x: np.where(x.sum() >= 1, x['unknown-unknown'] / x.sum(), 0), axis=1))

In [26]:
unk_counts.head()

sample
1729    0.000000
1731    0.666667
1732    0.326862
1733    0.368087
1736    0.375789
dtype: float64

In [27]:
y = unk_counts.index.values
x = unk_counts.values

In [28]:
# TEST
x[:, np.newaxis].shape

(236, 1)

In [None]:
#TODO: sort by count, variable height of plot
#TODO: look at overall number of barcode reads across samples
#TODO: make this more formal, more easy to read
# this is used to understand level of spiking required
# run coming next week, would be good to analyze contamination beforehand
#IDEA: run historical runs to identify contaminants

In [29]:
fig = go.Figure(go.Bar(
            x=x,
            y=y,
            orientation='h'))

fig.show()

In [30]:
fig = px.imshow(x[:, np.newaxis],
                labels=dict(x="Paired Read", y="Sample ID", color="Count"),
                x=['unknown-unknown'],
                y=y.tolist()
               )
fig.update_xaxes(side="top")

# TODO
* refactor code to utilize Pandas functionality [complete]
* 1st opt: remove unk-unk [complete]
* 2nd opt: transform logarithmically [complete]
* create table that flags samples with more than one forward OR reverse read [complete]
* we want unk-unk to get a sense of the relationship between sample and frequency of unknowns [need feedback]
* look at unk-unk counts per sample on a separate plot? [need feedback]
* ct value is a measure of the concentration of RNA in the sample prior to processing (the higher the ct value, the lower the concentration) [TBD]
    * assumption: sample with high RNA conc corresponds with low barcode counts
    * can be used to calibrate how many barcodes you'll find
* go to analysis folder in KGs
* get comfortable using bash to look at different runs 
* use this to build an automated script 
* outputs
    * csv with contaminant flagging 
    * html report with plots 

In [35]:
def get_paired_reads(df: pd.DataFrame):
    """Number the distinct (forward, reverse) barcode pairs of `df`.

    Pairs are numbered from 0 in order of first appearance in the
    `forward_barcode` / `reverse_barcode` columns.

    Returns
    -------
    (read2idx, idx2read) : dict mapping each pair tuple to its number, and the
        inverse dict mapping each number back to its pair tuple.
    """
    uniq_paired_reads = df[['forward_barcode', 'reverse_barcode']].drop_duplicates().values
    idx2read = {i: get_read(paired_read) for i, paired_read in enumerate(uniq_paired_reads)}
    # the pairs are already unique, so inverting the mapping loses nothing
    read2idx = {read: i for i, read in idx2read.items()}
    return read2idx, idx2read


def get_read(paired_read: np.ndarray):
    """Turn a two-element row (forward, reverse) into a hashable tuple."""
    return (paired_read[0], paired_read[1])

In [36]:
read2idx, idx2read = get_paired_reads(ans)

In [37]:
def get_samples(df: pd.DataFrame):
    """Number the distinct values of the `sample` column of `df`.

    Samples are numbered from 0 in order of first appearance.

    Returns
    -------
    (sample2idx, idx2sample) : dict mapping each sample id to its number, and the
        inverse dict mapping each number back to the sample id.
    """
    uniq_samples = df['sample'].unique()

    sample2idx = {}
    idx2sample = {}

    for i, sample in enumerate(uniq_samples):
        sample2idx[sample] = i
        # store the id itself (not a one-element array wrapping it) so that
        # idx2sample is the exact inverse of sample2idx
        idx2sample[i] = sample
    return sample2idx, idx2sample

In [38]:
sample2idx, idx2sample = get_samples(ans)

In [39]:
len(read2idx), len(sample2idx)

(105, 236)

In [18]:
# `ff.create_` is not a figure-factory function; the annotated heatmap factory (called
# with the same keyword arguments further down in this notebook) is the intended call
fig = ff.create_annotated_heatmap(counts, x=x, y=y, colorscale='Viridis', showscale=True)

In [19]:
fig.write_html("test_hmap.html")

In [24]:
# %debug
# fig = generate_heatmap(input_pths[0])

In [4]:
with open(input_pths[0], 'r') as f:
    tmp_input = f.readlines()
print(tmp_input)

['forward_barcode\treverse_barcode\tpaired_read_count\n', 'BF1\tBR15\t595\n', 'BF1_rc\tunknown\t3991\n', 'BF1\tunknown\t16972\n', 'unknown\tBR15\t7229\n', 'unknown\tBR15_rc\t3615\n', 'unknown\tunknown\t149989\n']


In [5]:
header = re.split('\t|\n', tmp_input[0])[:-1]
data = []
for input_element in tmp_input[1:]:
    data.append(re.split('\t|\n', input_element)[:-1])

In [6]:
print(header)
print(data)

['forward_barcode', 'reverse_barcode', 'paired_read_count']
[['BF1', 'BR15', '595'], ['BF1_rc', 'unknown', '3991'], ['BF1', 'unknown', '16972'], ['unknown', 'BR15', '7229'], ['unknown', 'BR15_rc', '3615'], ['unknown', 'unknown', '149989']]


In [7]:
df = pd.DataFrame(data=data, columns=header)
df['paired_read_count'] = df['paired_read_count'].astype(int)
df.head(6)

Unnamed: 0,forward_barcode,reverse_barcode,paired_read_count
0,BF1,BR15,595
1,BF1_rc,unknown,3991
2,BF1,unknown,16972
3,unknown,BR15,7229
4,unknown,BR15_rc,3615
5,unknown,unknown,149989


In [8]:
def _consolidate_reverse_complements(x: str):
    x['forward_barcode'] = x['forward_barcode'].split('_')[0]
    x['reverse_barcode'] = x['reverse_barcode'].split('_')[0]
    return x

In [9]:
df = df.apply(_consolidate_reverse_complements, axis=1)
print(df)

  forward_barcode reverse_barcode  paired_read_count
0             BF1            BR15                595
1             BF1         unknown               3991
2             BF1         unknown              16972
3         unknown            BR15               7229
4         unknown            BR15               3615
5         unknown         unknown             149989


In [10]:
df = (df.groupby(['forward_barcode', 'reverse_barcode'])
        .agg({'paired_read_count': 'sum'})
        .reset_index())
print(df)

  forward_barcode reverse_barcode  paired_read_count
0             BF1            BR15                595
1             BF1         unknown              20963
2         unknown            BR15              10844
3         unknown         unknown             149989


In [86]:
for row in df.values:
    print(row)

['BF1' 'BR15' 595]
['BF1' 'unknown' 20963]
['unknown' 'BR15' 10844]
['unknown' 'unknown' 149989]


In [26]:
forward_bcodes = df[['forward_barcode']].drop_duplicates()
reverse_bcodes = df[['reverse_barcode']].drop_duplicates()
forward_bcodes['key'] = 0
reverse_bcodes['key'] = 0
all_pairs = forward_bcodes.merge(reverse_bcodes, how='outer', on='key').drop(columns='key')
all_pairs = all_pairs.merge(df, on=['forward_barcode', 'reverse_barcode'], how='left').fillna(0)
# all_pairs

In [29]:
all_pairs = all_pairs.set_index(['forward_barcode', 'reverse_barcode'])

In [57]:
# getting count values in the right shape (dimensions)
m,n = len(all_pairs.index.levels[0]), len(all_pairs.index.levels[1])
counts = all_pairs.values.reshape(m,n)

In [59]:
counts

array([[   595,  20963],
       [ 10844, 149989]])

In [60]:
x = all_pairs.reset_index()['forward_barcode'].unique()
y = all_pairs.reset_index()['reverse_barcode'].unique()
z = counts

In [61]:
z

array([[   595,  20963],
       [ 10844, 149989]])

In [62]:
len(x)

2

In [63]:
len(z[0])

2

In [66]:
x.tolist()

['BF1', 'unknown']

In [69]:
fig = ff.create_annotated_heatmap(z, x=x.tolist(), y=y.tolist(), colorscale='Viridis', showscale=True)
fig.show()