# DBS-Pro Analysis Report

## Data processing
### Load data

In [None]:
%matplotlib inline
# Imports
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Data import: read the tab-delimited table "data.tsv" into a DataFrame
df = pd.read_csv("data.tsv", delimiter="\t")
# Show the first rows as a quick sanity check
df.head()

### Filter data

In [None]:
# Filter for read count: keep only rows whose ReadCount exceeds the threshold.
readcount_thres = 1
df_filt = df[df["ReadCount"] > readcount_thres]

# Filter on umi count if requested: keep only barcodes that have more than
# `umicount_thres` UMI entries. A threshold of 0 disables this filter.
umicount_thres = 0
if umicount_thres > 0:
    # Count the UMI entries once per barcode and map the counts back onto the
    # rows. This keeps the same rows as groupby(...).filter(lambda ...), but
    # avoids calling a Python function for every single barcode.
    umis_per_barcode = df_filt.groupby("Barcode")["UMI"].count()
    df_filt = df_filt[df_filt["Barcode"].map(umis_per_barcode) > umicount_thres]

### Merge data

In [None]:
# Merge to long format: one row per (Barcode, Target) pair holding the
# number of UMI entries recorded for that pair.
df_counts = (
    df_filt
    .groupby(["Barcode", "Target"], as_index=False)["UMI"]
    .count()
)
df_counts.head()

In [None]:
# Get count matrix: barcodes as rows, targets as columns, UMI counts as
# values. Barcode/target pairs that were never observed are filled with 0.
count_matrix = (
    df_counts
    .pivot(index="Barcode", columns="Target", values="UMI")
    .fillna(0)
)
count_matrix.head()

## Results

### Stats

In [None]:
# Number of rows left after filtering
print(f"Total DBS-ABC-UMI combos, {df_filt.shape[0]}")
# Number of distinct barcodes left after filtering
print(f"Total DBS count, {df_filt['Barcode'].nunique()}")

### UMI counts

In [None]:
# Sum the UMI counts per barcode, order the barcodes from highest to lowest
# total, and plot the totals against their position in that ordering.
ax = (
    df_counts
    .groupby("Barcode", as_index=False)["UMI"]
    .sum()
    .sort_values(by="UMI", ascending=False)
    .reset_index()
    .plot(y="UMI")
)
ax.set_ylabel("UMIs per barcode")
ax.set_xlabel("Barcode rank")

### Box plots

In [None]:
# One box per target showing the spread of the UMI counts in df_counts
g = sns.boxplot(x="Target", y="UMI", data=df_counts)
g.set(ylabel="UMI count", title="UMI count")

### Paired plots

In [None]:
def pair_heatmap(df, title=None):
    """Draw a seaborn PairGrid of histograms for the columns of *df*.

    Diagonal panels show a 20-bin 1D histogram; off-diagonal panels show a
    20-bin 2D histogram coloured on a logarithmic scale.

    Based on https://stackoverflow.com/questions/43924280/pair-plot-with-heat-maps-possibly-logarithmic

    Parameters
    ----------
    df : pandas.DataFrame
        Data passed straight to ``sns.PairGrid``.
    title : str, optional
        Figure title. When omitted, no title is drawn and no head room is
        reserved for one.
    """
    from matplotlib.colors import LogNorm

    # NOTE: sns.set changes the plotting style globally, not only for this figure.
    sns.set(style="white")
    g = sns.PairGrid(df)
    g.map_diag(plt.hist, bins=20)

    # Set title. Skipped when no title is given, so that a literal "None" is
    # never rendered and the top margin is only shrunk when it is needed.
    if title is not None:
        g.fig.subplots_adjust(top=0.9)
        g.fig.suptitle(title)

    def pairgrid_heatmap(x, y, **kws):
        # Expects a "color" keyword from the grid and turns it into a
        # light-to-dark colormap for the 2D histogram.
        cmap = sns.light_palette(kws.pop("color"), as_cmap=True)
        # cmin=1: bins without any observation are left blank.
        plt.hist2d(x, y, cmap=cmap, cmin=1, **kws)

    # NOTE(review): a single LogNorm instance is shared by all off-diagonal
    # panels, so the colour limits are presumably fixed by the first panel
    # drawn - confirm this shared scale is intended.
    g.map_offdiag(pairgrid_heatmap, bins=20, norm=LogNorm())

In [None]:
# Pairwise comparison of the per-barcode UMI counts between targets
pair_heatmap(df=count_matrix, title="UMI count")