In [4]:
import os
import numpy as np
import pandas as pd

# Report summary

In [54]:
# Load the cell sheet. The cells below iterate over its "Run" and "Cell"
# columns to locate per-cell result files and append metric columns to `dat`.
dat = pd.read_excel("data/NanoStrandSeq.xls")

In [55]:
# Read counts per cell, parsed from the .flagstat report of each cell
# (presumably `samtools flagstat` output on the duplicate-marked BAM --
# TODO confirm).
#
#   TotalReads: leading integer of the line ending with "primary".
#   UniqReads:  TotalReads minus the leading integer of the line ending
#               with "primary duplicates".
#
# Cells without a flagstat file keep NaN in both columns.

total_read_list = []
uniq_read_list = []
for run, cell in dat[["Run", "Cell"]].values:
    total_read = np.nan
    uniq_read = np.nan
    path = "results/mapping/mark_duplicate/%s/%s.flagstat" % (run, cell)
    if os.path.exists(path):
        # The context manager closes the handle after each file; a bare
        # open() here would leak one descriptor per cell.
        with open(path) as handle:
            for line in handle:
                # Strip only the line terminator, so a final line without a
                # newline is not truncated and CRLF files still match.
                line = line.rstrip("\r\n")
                if line.endswith("primary"):
                    total_read = int(line.split()[0])
                elif line.endswith("primary duplicates"):
                    # Relies on the "primary" line appearing earlier in the
                    # file; otherwise total_read is still NaN and so is this.
                    uniq_read = total_read - int(line.split()[0])
    total_read_list.append(total_read)
    uniq_read_list.append(uniq_read)
dat["TotalReads"] = total_read_list
dat["UniqReads"] = uniq_read_list

In [56]:
# Fragment length statistics per cell.
#
# For every (Run, Cell) pair, take the first row of the per-cell length
# summary table; cells without a summary file get NaN in all three columns.

length_columns = ["Length.Median", "Length.Mean", "Length.Std"]
length_stats = {column: [] for column in length_columns}
for run, cell in dat[["Run", "Cell"]].values:
    summary_path = "results/stat/lengths/%s/%s_summary.tsv" % (run, cell)
    row_values = [np.nan] * len(length_columns)
    if os.path.exists(summary_path):
        summary = pd.read_csv(summary_path, sep="\t")
        row_values = [summary[column].values[0] for column in length_columns]
    for column, value in zip(length_columns, row_values):
        length_stats[column].append(value)
for column in length_columns:
    dat[column] = length_stats[column]

In [57]:
# Background value per cell, taken from the first row of the per-cell
# background summary table; NaN when the summary file does not exist.

background_values = []
for run, cell in dat[["Run", "Cell"]].values:
    summary_path = "results/stat/background/%s/%s_summary.tsv" % (run, cell)
    if not os.path.exists(summary_path):
        background_values.append(np.nan)
        continue
    summary = pd.read_csv(summary_path, sep="\t")
    background_values.append(summary["Background"].values[0])
dat["Background"] = background_values

In [58]:
# GC content statistics per cell.
#
# One row of (median, mean, std) is collected per (Run, Cell) pair from the
# first row of the per-cell summary table; missing files yield a NaN row.

gc_columns = ("GC.Median", "GC.Mean", "GC.Std")
gc_rows = []
for run, cell in dat[["Run", "Cell"]].values:
    summary_path = "results/stat/gc_content/%s/%s_summary.tsv" % (run, cell)
    if os.path.exists(summary_path):
        summary = pd.read_csv(summary_path, sep="\t")
        gc_rows.append([summary[column].values[0] for column in gc_columns])
    else:
        gc_rows.append([np.nan for _ in gc_columns])
# Split the collected rows back into one list per output column.
for position, column in enumerate(gc_columns):
    dat[column] = [row[position] for row in gc_rows]

In [59]:
# Spikiness per cell: first "Spikiness" entry of the per-cell table,
# or NaN when the table file is absent.

spikiness_paths = [
    "results/stat/spikiness/%s/%s.tsv" % (run, cell)
    for run, cell in dat[["Run", "Cell"]].values
]
dat["Spikiness"] = [
    pd.read_csv(tsv_path, sep="\t")["Spikiness"].values[0]
    if os.path.exists(tsv_path)
    else np.nan
    for tsv_path in spikiness_paths
]

In [60]:
# Genomic depth per cell: the "Depth" value of the first row whose "Name"
# is "Overall" in the per-cell depth table; NaN when the file is missing.

depth_values = []
for run, cell in dat[["Run", "Cell"]].values:
    depth_path = "results/stat/depth/%s/%s.tsv" % (run, cell)
    overall_depth = np.nan
    if os.path.exists(depth_path):
        depth_table = pd.read_csv(depth_path, sep="\t")
        is_overall = depth_table["Name"] == "Overall"
        overall_depth = depth_table.loc[is_overall, "Depth"].values[0]
    depth_values.append(overall_depth)
dat["GenomicDepth"] = depth_values

In [61]:
# Genomic coverage per cell: the "Ratio" value of the first "Overall" row
# in the per-cell coverage table; NaN when the file is missing.

coverage_values = []
for run, cell in dat[["Run", "Cell"]].values:
    coverage_path = "results/stat/coverage/%s/%s.tsv" % (run, cell)
    if not os.path.exists(coverage_path):
        coverage_values.append(np.nan)
        continue
    coverage_table = pd.read_csv(coverage_path, sep="\t")
    overall_rows = coverage_table[coverage_table["Name"] == "Overall"]
    coverage_values.append(overall_rows["Ratio"].values[0])
dat["GenomicCoverage"] = coverage_values

In [62]:
# Write `dat`, including the metric columns appended by the cells above,
# to an Excel file without the DataFrame index.
dat.to_excel("data/NanoStrandSeq_Summary.xlsx", index=False)

# Quality control of cells (Cell QC)

## 1. HG001

In [28]:
# Select HG001 cells that pass QC and save them to their own Excel file.
#
# A cell passes when it belongs to one of the listed runs, is not one of the
# listed control cells, has at least `min_uniq_reads` unique reads and a
# background below `max_background`. Rows with NaN in either metric fail the
# comparisons and are dropped.
#
# NOTE(review): this reads a dated snapshot
# ("NanoStrandSeq_Summary_20240301.xlsx"), not the
# "NanoStrandSeq_Summary.xlsx" file written by the report-summary section --
# confirm the snapshot is the intended input.
d = pd.read_excel("data/NanoStrandSeq_Summary_20240301.xlsx")

runs = [
    "20220418_S5CSTreat",
    "20220430_S5Merged",
    "20220610_Treat",
    "20220708_GM12878",
    "20220708_GM12878R2",
    "20220817_HG001",
    "20220915_P1",
    "20220916_P1",
    "20220916_P2",
]

control_cells = [
    "20220418_S5CSTreat.sc001",
    "20220418_S5CSTreat.sc002",
    "20220418_S5CSTreat.sc003",
    "20220418_S5CSTreat.sc004",
    "20220418_S5CSTreat.sc005",
    "20220418_S5CSTreat.sc006",
    "20220418_S5CSTreat.sc007",
    "20220418_S5CSTreat.sc008",
]

# QC thresholds for this sample.
min_uniq_reads = 80000
max_background = 0.05

# Vectorised membership tests replace the per-row Python list comprehensions.
passed_qc = (
    d["Run"].isin(runs)
    & ~d["Cell"].isin(control_cells)
    & (d["UniqReads"] >= min_uniq_reads)
    & (d["Background"] < max_background)
)
d = d[passed_qc]
d.to_excel("data/NanoStrandSeq_HG001_QC_Pass.xlsx", index=False)
print("Passed QC cells:", len(d))

Passed QC cells: 364


## 2. C57/DBA

In [29]:
# Select C57/DBA cells that pass QC and save them to their own Excel file.
#
# A cell passes when it belongs to one of the listed runs, has at least
# `min_uniq_reads` unique reads and a background below `max_background`.
# Rows with NaN in either metric fail the comparisons and are dropped.
#
# NOTE(review): this reads a dated snapshot
# ("NanoStrandSeq_Summary_20240301.xlsx"), not the
# "NanoStrandSeq_Summary.xlsx" file written by the report-summary section --
# confirm the snapshot is the intended input.
d = pd.read_excel("data/NanoStrandSeq_Summary_20240301.xlsx")

runs = [
    "20220524_MEF",
    "20220929_P1",
    "20220929_P2",
    "20221005_MEF",
    "20221006_MEF",
]

# QC thresholds for this sample.
min_uniq_reads = 70000
max_background = 0.05

# Vectorised membership test replaces the per-row Python list comprehension.
passed_qc = (
    d["Run"].isin(runs)
    & (d["UniqReads"] >= min_uniq_reads)
    & (d["Background"] < max_background)
)
d = d[passed_qc]
d.to_excel("data/NanoStrandSeq_C57DBA_QC_Pass.xlsx", index=False)
print("Passed QC cells:", len(d))

Passed QC cells: 206
