In [2]:
import os
import glob
import numpy as np
import pandas as pd

# Report summary

In [3]:
# Load the per-cell sample sheet; later cells iterate over its "Run" and "Cell" columns.
dat = pd.read_excel(io="data/StrandSeq.xlsx")

In [4]:
# total reads
#
# For every (Run, Cell) row of `dat`, take the leading integer of the first
# line of the filtered-mapping flagstat file. Rows whose file is missing or
# empty keep the default of 0.
# NOTE(review): the "TotalReads" column written here is reassigned by the later
# "# reads" cell (from the mark_duplicate flagstat) — confirm this cell is
# still needed.

total_reads_list = []
for run, cell in dat[["Run", "Cell"]].values:
    total_reads = 0
    path = "results/mapping/filtered/%s/%s.flagstat" % (run, cell)
    if os.path.exists(path):
        # Use a context manager so the handle is closed even though we stop
        # reading after the first line.
        with open(path) as handle:
            for line in handle:
                total_reads = int(line.split()[0])
                break
    total_reads_list.append(total_reads)
dat["TotalReads"] = total_reads_list

In [5]:
# uniq reads
#
# For every (Run, Cell) row of `dat`, take the leading integer of the first
# line containing "primary mapped" in the mark_duplicate flagstat file. Rows
# whose file is missing, or has no such line, keep the default of 0.
# NOTE(review): the "UniqReads" column written here is reassigned by the later
# "# reads" cell — confirm this cell is still needed.

uniq_reads_list = []
for run, cell in dat[["Run", "Cell"]].values:
    uniq_reads = 0
    path = "results/mapping/mark_duplicate/%s/%s.flagstat" % (run, cell)
    if os.path.exists(path):
        # Use a context manager so the handle is closed even when we break out
        # of the loop early.
        with open(path) as handle:
            for line in handle:
                if "primary mapped" in line:
                    uniq_reads = int(line.split()[0])
                    break
    uniq_reads_list.append(uniq_reads)
dat["UniqReads"] = uniq_reads_list

In [6]:
# reads
#
# For every (Run, Cell) row of `dat`, scan the mark_duplicate flagstat file:
#   * the line ending in "primary" supplies the total read count;
#   * the line ending in "primary duplicates" supplies the duplicate count,
#     which is subtracted from the total to give the unique read count.
# Rows whose file is missing keep NaN for both values. If the
# "primary duplicates" line were to appear before the "primary" line, the
# unique count would be NaN because the total is still NaN at that point.
# This cell overwrites any existing "TotalReads" / "UniqReads" columns.

total_read_list = []
uniq_read_list = []
for run, cell in dat[["Run", "Cell"]].values:
    total_read = np.nan
    uniq_read = np.nan
    path = "results/mapping/mark_duplicate/%s/%s.flagstat" % (run, cell)
    if os.path.exists(path):
        # Context manager guarantees the file handle is closed.
        with open(path) as handle:
            for line in handle:
                # Strip only the newline: slicing off the last character
                # unconditionally would corrupt a final line that has no
                # trailing newline.
                text = line.rstrip("\n")
                if text.endswith("primary"):
                    total_read = int(text.split()[0])
                if text.endswith("primary duplicates"):
                    uniq_read = total_read - int(text.split()[0])
    total_read_list.append(total_read)
    uniq_read_list.append(uniq_read)
dat["TotalReads"] = total_read_list
dat["UniqReads"] = uniq_read_list

In [7]:
# background
#
# Record the first "Background" entry of each cell's summary table; cells
# without a summary file are recorded as NaN.

background_list = []
for run, cell in dat[["Run", "Cell"]].values:
    summary_path = "results/stat/background/%s/%s_summary.tsv" % (run, cell)
    if not os.path.exists(summary_path):
        # No summary for this cell: keep the row, mark the value as missing.
        background_list.append(np.nan)
        continue
    summary = pd.read_csv(summary_path, sep="\t")
    background_list.append(summary["Background"].values[0])
dat["Background"] = background_list

In [8]:
# GC content
#
# Record the first "GC.Mean" and "GC.Median" entries of each cell's GC summary
# table; cells without a summary file get NaN for both.
mean_list = []
median_list = []
for run, cell in dat[["Run", "Cell"]].values:
    gc_path = "results/stat/gc_content/%s/%s_summary.tsv" % (run, cell)
    if os.path.exists(gc_path):
        gc_summary = pd.read_csv(gc_path, sep="\t")
        gc_mean = gc_summary["GC.Mean"].values[0]
        gc_median = gc_summary["GC.Median"].values[0]
    else:
        gc_mean = gc_median = np.nan
    mean_list.append(gc_mean)
    median_list.append(gc_median)
dat["GC.Mean"] = mean_list
dat["GC.Median"] = median_list

In [9]:
# spikiness
#
# Record the first "Spikiness" entry of each cell's table; cells without a
# table are recorded as NaN.

values = []
for run, cell in dat[["Run", "Cell"]].values:
    spikiness_path = "results/stat/spikiness/%s/%s.tsv" % (run, cell)
    if not os.path.exists(spikiness_path):
        values.append(np.nan)
        continue
    spikiness_table = pd.read_csv(spikiness_path, sep="\t")
    values.append(spikiness_table["Spikiness"].values[0])
dat["Spikiness"] = values

In [10]:
# genomic depth
#
# Record the "Depth" of the first row whose "Name" is "Overall" in each cell's
# depth table; cells without a table are recorded as NaN.

values = []
for run, cell in dat[["Run", "Cell"]].values:
    depth_path = "results/stat/depth/%s/%s.tsv" % (run, cell)
    if os.path.exists(depth_path):
        depth_table = pd.read_csv(depth_path, sep="\t")
        is_overall = depth_table["Name"] == "Overall"
        depth = depth_table.loc[is_overall, "Depth"].values[0]
    else:
        depth = np.nan
    values.append(depth)
dat["GenomicDepth"] = values

In [11]:
# genomic coverage
#
# Record the "Ratio" of the first row whose "Name" is "Overall" in each cell's
# coverage table; cells without a table are recorded as NaN.

values = []
for run, cell in dat[["Run", "Cell"]].values:
    coverage_path = "results/stat/coverage/%s/%s.tsv" % (run, cell)
    ratio = np.nan
    if os.path.exists(coverage_path):
        coverage_table = pd.read_csv(coverage_path, sep="\t")
        is_overall = coverage_table["Name"] == "Overall"
        ratio = coverage_table.loc[is_overall, "Ratio"].values[0]
    values.append(ratio)
dat["GenomicCoverage"] = values

In [12]:
# Write the assembled summary table to reports/StrandSeq_Summary.csv.
# The output directory is created in pure Python rather than through a
# `! mkdir -p` shell escape, so the cell also runs outside IPython and on
# platforms without a POSIX shell; exist_ok=True keeps re-runs from failing
# when the directory already exists.
os.makedirs("reports", exist_ok=True)
dat.to_csv("reports/StrandSeq_Summary.csv", index=False)