In [1]:
import pandas as pd
import numpy as np
import scanpy as sc
import sys
import os
import re
import matplotlib.pyplot as plt
import seaborn as sns
import io

# SAMTOOLS COVERAGE

In [2]:
# Directory containing the tab-separated coverage reports, one file per
# sample / barcode / reference / alignment-method combination.
dpath = "/scratch/indikar_root/indikar1/cstansbu/scpc_test/reports/coverage/"

# Map the raw report headers to human-readable column names.
names = {
    '#rname' : 'Chromosome',
    'startpos' : 'Start',
    'endpos' : 'End',
    'numreads' : 'Reads',
    'covbases' : 'Bases Covered',
    'coverage' : 'Coverage (%)',
    'meandepth' : 'Mean Depth',
    'meanbaseq' : 'Mean Base Quality',
    'meanmapq' : 'Mean Mapping Quality',
}

# Per-file frames are collected here and concatenated once after the loop,
# so that `df` only ever refers to a DataFrame.
frames = []

# sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
# row order of `df` reproducible between runs.
for f in sorted(os.listdir(dpath)):
    fpath = os.path.join(dpath, f)
    if not os.path.isfile(fpath):
        # Sub-directories cannot be parsed as reports.
        continue

    # Metadata is encoded positionally in the file name:
    #   characters 0-1 -> sample, characters 2-4 -> barcode,
    #   2nd dot-separated field -> reference, 3rd -> alignment method.
    # NOTE(review): the fixed-width slices assume a 2-character sample id and
    # a 3-character barcode - confirm against the naming scheme.
    parts = f.split(".")
    sample = f[:2]
    cell = f[2:5]
    ref = parts[1]
    align_method = parts[2]

    # Read the sequence name as text: an inferred integer dtype would never
    # match the string labels used in the chromosome filter below.
    tmp = pd.read_csv(fpath,
                      sep='\t',
                      dtype={'#rname': str},
                     )
    tmp = tmp.rename(columns=names)
    tmp['Sample'] = sample
    tmp['Barcode'] = cell
    tmp['Reference'] = ref
    tmp['Align Method'] = align_method
    frames.append(tmp)

# ignore_index=True gives every row a unique label; without it each file
# contributes its own 0..n-1 index and the labels repeat across files.
df = pd.concat(frames, ignore_index=True)

# remove unplaced contigs
# NOTE(review): this whitelist also drops any Y / mitochondrial rows if the
# references contain them - confirm that is intended.
chroms = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', '16', '17', '18', '19', 'X']

df = df[df['Chromosome'].isin(chroms)]

# Wide table: one row per sample / barcode / reference / chromosome span,
# with every remaining metric split into one column per alignment method.
# pivot_table aggregates with its default function when rows collide.
pdf = pd.pivot_table(df, index=['Sample',
                               'Barcode',
                               'Reference',
                               'Chromosome',
                               'Start',
                               'End'],
                   columns='Align Method').reset_index()

df.head()

Unnamed: 0,Chromosome,Start,End,Reads,Bases Covered,Coverage (%),Mean Depth,Mean Base Quality,Mean Mapping Quality,Sample,Barcode,Reference,Align Method
0,1,1,202555645,91754,234512,0.115777,0.122412,17.6,49.0,o1,b17,129S1_SvImJ,raw
1,2,1,188650360,100545,228749,0.121256,0.112919,18.7,46.6,o1,b17,129S1_SvImJ,raw
2,3,1,164074439,72576,192909,0.117574,0.099833,18.0,44.8,o1,b17,129S1_SvImJ,raw
3,4,1,160637212,62689,181534,0.113009,0.091981,18.7,47.7,o1,b17,129S1_SvImJ,raw
4,5,1,157702868,101229,204053,0.129391,0.156047,17.6,47.5,o1,b17,129S1_SvImJ,raw


In [5]:
# Destination workbook for the pivoted coverage summary.
outpath = "/scratch/indikar_root/indikar1/cstansbu/scpc_test/analysis/excel_reports/coverage_summary.xlsx"

# The writer does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(outpath), exist_ok=True)

# Excel rejects worksheet names longer than 31 characters.
MAX_SHEET_NAME_LEN = 31

with pd.ExcelWriter(outpath, engine='xlsxwriter') as writer:

    # save each (sample, reference) pair to a new tab
    for sample_names, group in pdf.groupby(['Sample', 'Reference']):
        sample, ref = sample_names
        sheet_name = f"Sample_{sample}_ref_{ref}"[:MAX_SHEET_NAME_LEN]
        # index=True is deliberate: `pdf` has two-level column labels, and
        # pandas does not support writing those with index=False.
        group.to_excel(writer, sheet_name=sheet_name, index=True)

print(f'saved: {outpath}')

saved: /scratch/indikar_root/indikar1/cstansbu/scpc_test/analysis/excel_reports/coverage_summary.xlsx


In [4]:
# NOTE(review): `break` is not valid outside a loop, so this cell always fails
# with the SyntaxError recorded in its output. Presumably it is a deliberate
# stop so that "Run All" halts before the plotting cells below - confirm, and
# consider replacing it with a clearly worded explicit stop.
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

# Histograms

In [None]:
# Show the column labels available in `df` (useful when picking metrics to plot).
df.columns

In [None]:
# Log-scaled histogram of each coverage metric across all rows of `df`.
plot_cols = [
    'Bases Covered',
    'Coverage (%)',
    'Mean Depth',
    'Mean Base Quality',
    'Mean Mapping Quality',
]

plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 3.5, 2.5

for c in plot_cols:
    # Values <= 0 have no position on a log axis; depending on the seaborn
    # version they either break the bin computation or distort the bins, so
    # only strictly positive rows are plotted. The index is reset so the
    # plotted frame has unique row labels.
    positive = df.loc[df[c] > 0].reset_index(drop=True)
    if positive.empty:
        print(f'skipping {c}: no positive values to plot on a log scale')
        continue

    # Explicit figure/axes instead of the implicit pyplot state.
    fig, ax = plt.subplots()
    sns.histplot(data=positive,
                 x=c,
                 bins=31,
                 log_scale=True,
                 ax=ax)

    ax.set_title(c)
    sns.despine(ax=ax)
    plt.show()
    # Release the figure so repeated runs do not accumulate open figures.
    plt.close(fig)

In [None]:
# Log-scaled histogram of each coverage metric, coloured by alignment method.
plot_cols = [
    'Bases Covered',
    'Coverage (%)',
    'Mean Depth',
    'Mean Base Quality',
    'Mean Mapping Quality',
]

plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 3.5, 2.5

for c in plot_cols:
    # Keep strictly positive values only: zeros cannot be drawn on a log axis
    # and, depending on the seaborn version, either raise during binning or
    # distort the bin range. Resetting the index guarantees unique row labels,
    # which some seaborn versions require when `hue` is used.
    positive = df.loc[df[c] > 0].reset_index(drop=True)
    if positive.empty:
        print(f'skipping {c}: no positive values to plot on a log scale')
        continue

    fig, ax = plt.subplots()
    sns.histplot(data=positive,
                 x=c,
                 bins=31,
                 hue='Align Method',
                 log_scale=True,
                 ax=ax)

    ax.set_title(c)
    sns.despine(ax=ax)
    plt.show()
    # Free the figure once it has been rendered.
    plt.close(fig)

In [None]:
# Log-scaled histogram of each coverage metric, coloured by sample.
plot_cols = [
    'Bases Covered',
    'Coverage (%)',
    'Mean Depth',
    'Mean Base Quality',
    'Mean Mapping Quality',
]

plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 3.5, 2.5

for c in plot_cols:
    # A log-scaled x axis cannot show values <= 0 (they raise or skew the
    # bins depending on the seaborn version), so those rows are left out.
    # reset_index gives the subset unique row labels for the hue grouping.
    positive = df.loc[df[c] > 0].reset_index(drop=True)
    if positive.empty:
        print(f'skipping {c}: no positive values to plot on a log scale')
        continue

    fig, ax = plt.subplots()
    sns.histplot(data=positive,
                 x=c,
                 bins=31,
                 hue='Sample',
                 log_scale=True,
                 ax=ax)

    ax.set_title(c)
    sns.despine(ax=ax)
    plt.show()
    # Close explicitly so figures do not pile up in memory.
    plt.close(fig)

In [None]:
# Log-scaled histogram of each coverage metric, coloured by reference.
plot_cols = [
    'Bases Covered',
    'Coverage (%)',
    'Mean Depth',
    'Mean Base Quality',
    'Mean Mapping Quality',
]

plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 3.5, 2.5

for c in plot_cols:
    # Only strictly positive values can be placed on a log axis; zeros would
    # raise or distort the binning depending on the seaborn version.
    # The subset is re-indexed so every row label is unique.
    positive = df.loc[df[c] > 0].reset_index(drop=True)
    if positive.empty:
        print(f'skipping {c}: no positive values to plot on a log scale')
        continue

    fig, ax = plt.subplots()
    sns.histplot(data=positive,
                 x=c,
                 bins=31,
                 hue='Reference',
                 log_scale=True,
                 ax=ax)

    ax.set_title(c)
    sns.despine(ax=ax)
    plt.show()
    # Drop the rendered figure to keep memory use flat across the loop.
    plt.close(fig)

# Box plots by chromosome

In [None]:
# Box plots of every coverage metric per chromosome, split by alignment method.
plot_cols = ['Bases Covered',
             'Coverage (%)',
             'Mean Depth',
             'Mean Base Quality',
             'Mean Mapping Quality']

# Wide, short canvas so all chromosomes fit along the x axis.
plt.rcParams.update({'figure.dpi': 200,
                     'figure.figsize': (7, 2.5)})

# Arguments shared by every panel; only the y variable changes per metric.
box_kwargs = dict(data=df,
                  x='Chromosome',
                  hue='Align Method',
                  showfliers=False)

for metric in plot_cols:
    sns.boxplot(y=metric, **box_kwargs)
    plt.title(metric)
    sns.despine()
    plt.show()

In [None]:
# One box plot per coverage metric: chromosomes on the x axis, one box per
# reference within each chromosome, outliers hidden.
plot_cols = [
    'Bases Covered', 'Coverage (%)', 'Mean Depth',
    'Mean Base Quality', 'Mean Mapping Quality',
]

plt.rcParams['figure.figsize'] = 7, 2.5
plt.rcParams['figure.dpi'] = 200

for metric in plot_cols:
    # boxplot returns the axes it drew on; title that axes directly.
    ax = sns.boxplot(data=df, x='Chromosome', y=metric,
                     hue='Reference', showfliers=False)
    ax.set_title(metric)
    sns.despine()
    plt.show()

In [None]:
# Per-chromosome distribution of each coverage metric, grouped by sample.
plot_cols = ['Bases Covered', 'Coverage (%)', 'Mean Depth']
plot_cols += ['Mean Base Quality', 'Mean Mapping Quality']

for setting, value in (('figure.dpi', 200), ('figure.figsize', (7, 2.5))):
    plt.rcParams[setting] = value


def _draw_sample_boxplot(metric):
    """Draw and show one chromosome-by-sample box plot for `metric`."""
    sns.boxplot(data=df,
                x='Chromosome',
                y=metric,
                hue='Sample',
                showfliers=False)
    plt.title(metric)
    sns.despine()
    plt.show()


for metric in plot_cols:
    _draw_sample_boxplot(metric)