In [20]:
import os
import numpy as np
import pandas as pd

In [16]:
# Runs that are removed from the curated tables, grouped by the reason for
# exclusion. The groups are concatenated into one flat list of run names.
discarded_runs = (
    # R9.4 and test library structure
    [
        "20220130_Cell8",
        "20220316_Cell40",
        "20220413_Cell26",
        "20220506_Cell25",
        "20220520_Cell16",
        "20220525_Run1",
        "20220525_Run2",
        "20220601_Cell22",
        "20220607_Cell40",
        "20220615_Cell68",
    ]
    # test library structure
    + ["20220621_R10Test"]
    # without blastocyst cell
    + ["20220719_Embryo"]
    # R9.4
    + ["20220729_Embryo"]
    # bad chip
    + ["20221019_Blastocyst"]
    # human blastocyst
    + ["20220912_HumanBlastR1", "20220912_HumanBlastR2"]
    # others
    + [
        "20220818_mESCR3",
        "20220818_mESCR3M",
        "20220902_Blastocyst",
        "20220902_BlastocystM",
        "20220903_Blastocyst",
        "20220903_BlastocystM",
    ]
)

# Make data/NanoNASCseq.xlsx

In [17]:
# Read the complete cell table, drop every row whose "Run" is listed in
# discarded_runs, and write the remaining rows to the curated workbook.
d = pd.read_excel("data/NanoNASCseq_All.xlsx")
print("Input cells:", len(d))
d = d[~d["Run"].isin(discarded_runs)]
d.to_excel("data/NanoNASCseq.xlsx", index=False)
print("Output cells:", len(d))

Input cells: 5569
Output cells: 4555


# Make data/NanoNASCseq_All_Summary.xlsx

In [18]:
def divide(a, b):
    """Return ``a / b``, or ``None`` when ``b`` is ``None`` or equals zero."""
    if b is None:
        return None
    if b == 0:
        return None
    return a / b
# Complete per-cell table; the following cells append their summary columns to it.
dat = pd.read_excel("data/NanoNASCseq_All.xlsx")

In [21]:
# Cell reads and trimmed reads
# For each (Run, Cell) pair, take the first "Total" and "Pass" values from the
# cell's stats.tsv; cells without that file keep NaN in both columns.
cell_reads_list, trim_reads_list = [], []
for run_id, cell_id in dat[["Run", "Cell"]].values:
    stats_path = "results/demux/trimmed/%s/%s/stats.tsv" % (run_id, cell_id)
    if os.path.exists(stats_path):
        stats = pd.read_csv(stats_path, sep="\t")
        total, passed = stats["Total"].values[0], stats["Pass"].values[0]
    else:
        total, passed = np.nan, np.nan
    cell_reads_list.append(total)
    trim_reads_list.append(passed)

dat["Cell.Reads"] = cell_reads_list
dat["Trimmed.Reads"] = trim_reads_list
# Fraction of the cell's reads that passed trimming.
dat["Trimmed.Ratio"] = dat["Trimmed.Reads"] / dat["Cell.Reads"]

In [22]:
# Mapped reads
# For each (Run, Cell) pair, read the mapped-read count from the cell's
# .flagstat report; cells without a report keep NaN.
mapped_reads_list = []
for run, cell in dat[["Run", "Cell"]].values:
    mapped_reads = np.nan
    path = "results/mapping/minimap2/%s/%s.flagstat" % (run, cell)
    if os.path.exists(path):
        # Use a context manager so the handle is closed deterministically;
        # this loop opens one file per cell.
        with open(path) as handle:
            # The count is the first whitespace-separated field of the 8th
            # line (index 7).
            # NOTE(review): presumably samtools flagstat output, whose line
            # layout differs between versions - confirm index 7 is the
            # intended line for the version used.
            mapped_reads = int(handle.readlines()[7].split()[0])
    mapped_reads_list.append(mapped_reads)
dat["Mapped.Reads"] = mapped_reads_list
# Fraction of trimmed reads that were mapped.
dat["Mapped.Ratio"] = dat["Mapped.Reads"] / dat["Trimmed.Reads"]

In [23]:
# Mitochondrion reads ratio
# Each per-cell file holds "<chrom>\t<count>" lines. The ratio is the count on
# the "chrM" line divided by the sum of all counts; cells without a file keep
# NaN.
mito_ratio_list = []
for run, cell in dat[["Run", "Cell"]].values:
    mito_ratio = np.nan
    path = "results/mapping/chrom_reads/%s/%s.tsv" % (run, cell)
    if os.path.exists(path):
        try:
            total_reads, mito_reads = 0, 0
            # Use a context manager so the handle is closed even when a
            # malformed line raises part-way through the file.
            with open(path) as handle:
                for line in handle:
                    chrom, count = line.strip("\n").split("\t")
                    count = int(count)
                    total_reads += count
                    if chrom == "chrM":
                        mito_reads = count
            # divide() returns None when the total is zero.
            mito_ratio = divide(mito_reads, total_reads)
        except ValueError:
            # Malformed file (wrong column count or non-integer count): report
            # the path and leave this cell's ratio as NaN.
            print(path)
    mito_ratio_list.append(mito_ratio)
dat["Mito.Ratio"] = mito_ratio_list

In [24]:
# Filtered reads
# In each per-cell log, the line following the first line that starts with
# "Input" holds tab-separated values: the first is stored as Filtered.Reads,
# the second as FilteredClip.Reads. Cells without a log, or whose log has no
# "Input" line, keep NaN.
filtered_reads_list = []
filtered_clip_reads_list = []
for run, cell in dat[["Run", "Cell"]].values:
    filtered_reads = np.nan
    filtered_clip_reads = np.nan
    path = "results/mapping/stat_clip/%s/%s.log" % (run, cell)
    if os.path.exists(path):
        # Use a context manager so the handle is closed deterministically;
        # this loop opens one file per cell.
        with open(path) as handle:
            lines = handle.readlines()
        for i, line in enumerate(lines):
            if line.startswith("Input"):
                v1, v2 = lines[i + 1].split("\t")[:2]
                filtered_reads = int(v1)
                filtered_clip_reads = int(v2)
                break
    filtered_reads_list.append(filtered_reads)
    filtered_clip_reads_list.append(filtered_clip_reads)
dat["Filtered.Reads"] = filtered_reads_list
dat["Filtered.Ratio"] = dat["Filtered.Reads"] / dat["Mapped.Reads"]
dat["FilteredClip.Reads"] = filtered_clip_reads_list
dat["FilteredClip.Ratio"] = dat["FilteredClip.Reads"] / dat["Filtered.Reads"]

In [25]:
# Duplicate reads
# UMIs is the number of rows in the per-cell mark_duplicate table, and
# UMIs.2Reads is the number of rows with AllSize >= 2. Cells without a table
# keep NaN.
umis1_list, umis2_list = [], []
for run_id, cell_id in dat[["Run", "Cell"]].values:
    dup_path = "results/mapping/mark_duplicate/%s/%s.tsv" % (run_id, cell_id)
    n_umis = n_umis_multi = np.nan
    if os.path.exists(dup_path):
        umi_table = pd.read_csv(dup_path, sep="\t", header=0)
        n_umis = len(umi_table)
        n_umis_multi = int((umi_table["AllSize"] >= 2).sum())
    umis1_list.append(n_umis)
    umis2_list.append(n_umis_multi)

dat["UMIs"] = umis1_list
dat["UMIs.2Reads"] = umis2_list
# Reads beyond one per UMI are counted as duplicates.
dat["Duplicate.Reads"] = dat["FilteredClip.Reads"] - dat["UMIs"]
dat["Duplicate.Ratio"] = dat["Duplicate.Reads"] / dat["FilteredClip.Reads"]
dat["Unique.Reads"] = dat["UMIs"]

In [26]:
# Detected genes
# Count the distinct index labels with Total > 0 in the per-cell quantification
# table, skipping labels that contain "_". Cells without a table keep NaN.
genes_list = []
for run_id, cell_id in dat[["Run", "Cell"]].values:
    quant_path = "results/expression/quant_genes/min_read_1_min_tc_1/%s/%s.tsv" % (run_id, cell_id)
    if os.path.exists(quant_path):
        quant = pd.read_csv(quant_path, sep="\t", header=0, index_col=0)
        detected = quant[quant["Total"] > 0]
        n_genes = len({gene for gene in detected.index if "_" not in gene})
    else:
        n_genes = np.nan
    genes_list.append(n_genes)
dat["Genes"] = genes_list

In [27]:
# Detected isoforms
# structural_category values that are counted in the "Isoforms.Known" column.
sc_list = {
    'full-splice_match',
    'incomplete-splice_match',
    'novel_in_catalog',
    'novel_not_in_catalog',
}
isoforms1_list, isoforms2_list = [], []
for run_id, cell_id in dat[["Run", "Cell"]].values:
    n_assembled = np.nan  # all rows of the classification table
    n_in_categories = np.nan  # rows whose structural_category is in sc_list
    class_path = "results/assembly/sqanti3/%s/%s/%s_classification.txt" % (run_id, cell_id, cell_id)
    if os.path.exists(class_path):
        classification = pd.read_csv(class_path, sep="\t", header=0, index_col=0)
        n_assembled = len(classification)
        n_in_categories = int(classification["structural_category"].isin(sc_list).sum())
    isoforms1_list.append(n_assembled)
    isoforms2_list.append(n_in_categories)

dat["Isoforms.Assembled"] = isoforms1_list
dat["Isoforms.Known"] = isoforms2_list

In [28]:
# Mismatch ratios
# The twelve ordered pairs of distinct bases ("AC", "AG", ..., "TG").
mtypes = ["%s%s" % (ref, alt) for ref in "ACGT" for alt in "ACGT" if ref != alt]
# One list of per-cell values for each mismatch type.
ratios = {mtype: [] for mtype in mtypes}
for run_id, cell_id in dat[["Run", "Cell"]].values:
    ratio_path = "results/mismatch/ratio_consensus/%s/%s.tsv" % (run_id, cell_id)
    ratio_table = None
    if os.path.exists(ratio_path):
        ratio_table = pd.read_csv(ratio_path, sep="\t", index_col=0)
    for mtype in mtypes:
        # Cells without a ratio table get NaN for every mismatch type.
        value = np.nan if ratio_table is None else ratio_table.loc[mtype]["Ratio"]
        ratios[mtype].append(value)

for mtype, values in ratios.items():
    dat["%s.Ratio" % mtype] = values

In [29]:
# Pe and Pc
# The first row of the per-cell table must hold exactly three values; the first
# two are stored as Pe and Pc. Cells without a table keep NaN.
pe_list, pc_list = [], []
for run_id, cell_id in dat[["Run", "Cell"]].values:
    pc_path = "results/signal2noise/pc/%s/%s.tsv" % (run_id, cell_id)
    if os.path.exists(pc_path):
        # The third value is read but not used; the ratio is recomputed below.
        pe, pc, _third = pd.read_csv(pc_path, sep="\t").iloc[0]
    else:
        pe = pc = np.nan
    pe_list.append(pe)
    pc_list.append(pc)

dat["Pe"] = pe_list
dat["Pc"] = pc_list
dat["PcPe.Ratio"] = dat["Pc"] / dat["Pe"]

In [30]:
# Nascent UMIs
# Count the rows of the per-cell events table that have Size >= 2 and
# T-C >= 2. Cells without an events table keep NaN.
nascent_umis_list = []
for run_id, cell_id in dat[["Run", "Cell"]].values:
    events_path = "results/mismatch/ratio_consensus/%s/%s.events.tsv" % (run_id, cell_id)
    if os.path.exists(events_path):
        events = pd.read_csv(events_path, sep="\t", index_col=0)
        is_nascent = (events["Size"] >= 2) & (events["T-C"] >= 2)
        n_nascent = int(is_nascent.sum())
    else:
        n_nascent = np.nan
    nascent_umis_list.append(n_nascent)

dat["UMIs.2Reads.Nascent.2TCs"] = nascent_umis_list
dat["UMIs.2Reads.Nascent.2TCs.Ratio"] = dat["UMIs.2Reads.Nascent.2TCs"] / dat["UMIs.2Reads"]

In [32]:
# Nascent gene number
# From the min_read_2_min_tc_2 quantification table, count the distinct index
# labels (skipping labels that contain "_") with Total > 0, and among those
# the ones that also have Nascent > 0. Cells without a table keep NaN.
genes_list, nascent_genes_list = [], []
for run_id, cell_id in dat[["Run", "Cell"]].values:
    quant_path = "results/expression/quant_genes/min_read_2_min_tc_2/%s/%s.tsv" % (run_id, cell_id)
    n_genes = n_nascent_genes = np.nan
    if os.path.exists(quant_path):
        quant = pd.read_csv(quant_path, sep="\t", header=0, index_col=0)
        detected = quant[quant["Total"] > 0]
        nascent = detected[detected["Nascent"] > 0]
        n_genes = len({gene for gene in detected.index if "_" not in gene})
        n_nascent_genes = len({gene for gene in nascent.index if "_" not in gene})
    genes_list.append(n_genes)
    nascent_genes_list.append(n_nascent_genes)

dat["Genes.2Reads"] = genes_list
dat["Genes.2Reads.Nascent.2TCs"] = nascent_genes_list

In [33]:
# Save the complete table, including every column added above, without the index.
dat.to_excel("data/NanoNASCseq_All_Summary.xlsx", index=False)

# Make data/NanoNASCseq_Summary.xlsx

In [34]:
# Read the complete summary table, drop every row whose "Run" is listed in
# discarded_runs, and write the remaining rows to the curated summary workbook.
d = pd.read_excel("data/NanoNASCseq_All_Summary.xlsx")
print("Input cells:", len(d))
d = d[~d["Run"].isin(discarded_runs)]
d.to_excel("data/NanoNASCseq_Summary.xlsx", index=False)
print("Output cells:", len(d))

Input cells: 5569
Output cells: 4555
