In [1]:
import os
import numpy as np
import pandas as pd

# Report summary of cells

In [10]:
def divide(a, b):
    """Return ``a / b``, or ``None`` when ``b`` is ``None`` or compares equal to zero."""
    if b is None or b == 0:
        return None
    return a / b
    
# Load the table that every following cell extends; each of those cells
# iterates over its "Run" and "Cell" columns and adds new columns to it.
dat = pd.read_excel("data/FLAIRseq.xlsx")

In [11]:
# Cell reads and trimmed reads
#
# For every (Run, Cell) row, take the first row of the stats table: the
# "Total" value becomes Cell.Reads and the "Pass" value becomes Trimmed.Reads.
# Rows whose stats file does not exist keep NaN for both.

cell_reads_list = []
trim_reads_list = []
for run, cell in dat[["Run", "Cell"]].values:
    stats_path = "results/demux/trimmed/%s/%s/stats.tsv" % (run, cell)
    if not os.path.exists(stats_path):
        cell_reads_list.append(np.nan)
        trim_reads_list.append(np.nan)
        continue
    stats = pd.read_csv(stats_path, sep="\t")
    cell_reads_list.append(stats["Total"].iloc[0])
    trim_reads_list.append(stats["Pass"].iloc[0])

dat["Cell.Reads"] = cell_reads_list
dat["Trimmed.Reads"] = trim_reads_list
# Fraction of the cell's reads counted under "Pass".
dat["Trimmed.Ratio"] = dat["Trimmed.Reads"] / dat["Cell.Reads"]

In [12]:
# Mapped reads
#
# The count is the first whitespace-separated field on one fixed line of each
# .flagstat file. Rows whose file does not exist keep NaN.
# NOTE(review): which category sits on that line depends on the tool version
# that wrote the report -- confirm the index still points at the intended one.

FLAGSTAT_LINE_INDEX = 7  # 0-based index of the line the count is read from

mapped_reads_list = []
for run, cell in dat[["Run", "Cell"]].values:
    mapped_reads = np.nan
    path = "results/mapping/minimap2/%s/%s.flagstat" % (run, cell)
    if os.path.exists(path):
        # Use a context manager so the handle is closed immediately instead of
        # leaking one open file per cell until garbage collection.
        with open(path) as handle:
            mapped_reads = int(handle.readlines()[FLAGSTAT_LINE_INDEX].split()[0])
    mapped_reads_list.append(mapped_reads)
dat["Mapped.Reads"] = mapped_reads_list
# Relative to the trimmed read count added by the previous cell.
dat["Mapped.Ratio"] = dat["Mapped.Reads"] / dat["Trimmed.Reads"]

In [13]:
# Mitochondrion reads ratio
#
# Each input file is read as two tab-separated fields per line: a chromosome
# name and an integer count. The ratio is the "chrM" count divided by the sum
# of all counts. Rows whose file does not exist keep NaN.

mito_ratio_list = []
for run, cell in dat[["Run", "Cell"]].values:
    mito_ratio = np.nan
    path = "results/mapping/chrom_reads/%s/%s.tsv" % (run, cell)
    if os.path.exists(path):
        try:
            total_reads, mito_reads = 0, 0
            # Context manager closes the handle even when a malformed line
            # raises ValueError part-way through the file.
            with open(path) as handle:
                for line in handle:
                    chrom, count = line.strip("\n").split("\t")
                    count = int(count)
                    total_reads += count
                    if chrom == "chrM":
                        mito_reads = count
            # divide() yields None (not NaN) when the total is zero.
            mito_ratio = divide(mito_reads, total_reads)
        except ValueError:
            # A line without exactly two fields, or with a non-integer count:
            # report the offending file and leave the ratio as NaN.
            print(path)
    mito_ratio_list.append(mito_ratio)
dat["Mito.Ratio"] = mito_ratio_list

In [14]:
# Filtered reads
#
# In each log, locate the first line starting with "Input"; the line right
# after it is split on tabs and its first two fields are taken as the filtered
# and filtered-clip read counts. Rows whose log does not exist, or whose log
# has no "Input" line, keep NaN for both.

filtered_reads_list = []
filtered_clip_reads_list = []
for run, cell in dat[["Run", "Cell"]].values:
    filtered_reads = np.nan
    filtered_clip_reads = np.nan
    path = "results/mapping/stat_clip/%s/%s.log" % (run, cell)
    if os.path.exists(path):
        # Read inside a context manager so the handle is closed promptly
        # rather than leaking one open file per cell.
        with open(path) as handle:
            lines = handle.readlines()
        for i, line in enumerate(lines):
            if line.startswith("Input"):
                first_field, second_field = lines[i + 1].split("\t")[:2]
                filtered_reads = int(first_field)
                filtered_clip_reads = int(second_field)
                break
    filtered_reads_list.append(filtered_reads)
    filtered_clip_reads_list.append(filtered_clip_reads)
dat["Filtered.Reads"] = filtered_reads_list
dat["Filtered.Ratio"] = dat["Filtered.Reads"] / dat["Mapped.Reads"]
dat["FilteredClip.Reads"] = filtered_clip_reads_list
dat["FilteredClip.Ratio"] = dat["FilteredClip.Reads"] / dat["Filtered.Reads"]

In [15]:
# Duplicate reads
#
# The row count of each mark_duplicate table is used as the UMI count, and the
# rows with AllSize >= 2 as the count of UMIs backed by at least two reads.
# Rows whose table does not exist keep NaN for both.

umis1_list = []
umis2_list = []
for run, cell in dat[["Run", "Cell"]].values:
    dup_path = "results/mapping/mark_duplicate/%s/%s.tsv" % (run, cell)
    if os.path.exists(dup_path):
        umi_table = pd.read_csv(dup_path, sep="\t", header=0)
        multi_read = umi_table[umi_table["AllSize"] >= 2]
        umis1_list.append(len(umi_table))
        umis2_list.append(len(multi_read))
    else:
        umis1_list.append(np.nan)
        umis2_list.append(np.nan)

dat["UMIs"] = umis1_list
dat["UMIs.2Reads"] = umis2_list
# Reads beyond one per UMI are counted as duplicates.
dat["Duplicate.Reads"] = dat["FilteredClip.Reads"] - dat["UMIs"]
dat["Duplicate.Ratio"] = dat["Duplicate.Reads"] / dat["FilteredClip.Reads"]
# Unique.Reads is a copy of the UMIs column under a second name.
dat["Unique.Reads"] = dat["UMIs"]

In [16]:
# Detected genes
#
# Count the distinct index labels with Total > 0, skipping any label that
# contains an underscore. Rows whose table does not exist keep NaN.

genes_list = []
for run, cell in dat[["Run", "Cell"]].values:
    quant_path = "results/expression/quant_genes/min_read_1_min_tc_1/%s/%s.tsv" % (run, cell)
    if not os.path.exists(quant_path):
        genes_list.append(np.nan)
        continue
    quant = pd.read_csv(quant_path, sep="\t", header=0, index_col=0)
    detected = quant[quant["Total"] > 0]
    genes_list.append(len({label for label in detected.index if "_" not in label}))
dat["Genes"] = genes_list

In [17]:
# Detected isoforms
#
# Isoforms.Assembled is the row count of each classification table;
# Isoforms.Known is the number of rows whose structural_category is one of the
# categories in sc_list. Rows whose table does not exist keep NaN for both.

sc_list = {'full-splice_match', 'incomplete-splice_match', 'novel_in_catalog', 'novel_not_in_catalog'}
isoforms1_list = []  # number of assembled isoforms per cell
isoforms2_list = []  # number of isoforms in one of the sc_list categories per cell
for run, cell in dat[["Run", "Cell"]].values:
    class_path = "results/assembly/sqanti3/%s/%s/%s_classification.txt" % (run, cell, cell)
    if os.path.exists(class_path):
        classification = pd.read_csv(class_path, sep="\t", header=0, index_col=0)
        in_categories = classification["structural_category"].isin(sc_list)
        isoforms1_list.append(len(classification))
        isoforms2_list.append(len(classification[in_categories]))
    else:
        isoforms1_list.append(np.nan)
        isoforms2_list.append(np.nan)
dat["Isoforms.Assembled"] = isoforms1_list
dat["Isoforms.Known"] = isoforms2_list

In [18]:
# Mismatch ratios
#
# Build the 12 ordered pairs of distinct bases ("AC", "AG", ..., "TG") and, for
# each one, copy the "Ratio" value from the row of the per-cell table whose
# index equals that pair. Rows whose table does not exist get NaN for every pair.

mtypes = ["%s%s" % (first, second) for first in "ACGT" for second in "ACGT" if first != second]
ratios = {mtype: [] for mtype in mtypes}
for run, cell in dat[["Run", "Cell"]].values:
    ratio_path = "results/mismatch/ratio_consensus/%s/%s.tsv" % (run, cell)
    ratio_table = None
    if os.path.exists(ratio_path):
        ratio_table = pd.read_csv(ratio_path, sep="\t", index_col=0)
    for mtype in mtypes:
        if ratio_table is None:
            ratios[mtype].append(np.nan)
        else:
            ratios[mtype].append(ratio_table.loc[mtype]["Ratio"])
# One "<pair>.Ratio" column per mismatch type, added in mtypes order.
for mtype, values in ratios.items():
    dat["%s.Ratio" % mtype] = values

In [19]:
# Pe and Pc
#
# The first data row of each table is unpacked into exactly three values; the
# first two are stored as Pe and Pc, and the third is read but not used
# (Pc_Pe is recomputed from the columns below). Rows whose table does not
# exist keep NaN for both.

pe_list = []
pc_list = []
for run, cell in dat[["Run", "Cell"]].values:
    pc_path = "results/signal2noise/pc/%s/%s.tsv" % (run, cell)
    if os.path.exists(pc_path):
        pe, pc, pc_pe = pd.read_csv(pc_path, sep="\t").iloc[0]
    else:
        pe, pc = np.nan, np.nan
    pe_list.append(pe)
    pc_list.append(pc)
dat["Pe"] = pe_list
dat["Pc"] = pc_list
dat["Pc_Pe"] = dat["Pc"] / dat["Pe"]

In [20]:
# Nascent UMIs
#
# Count the rows of each events table with Size >= 2 and T-C >= 2.
# Rows whose table does not exist keep NaN.

nascent_umis_list = []
for run, cell in dat[["Run", "Cell"]].values:
    events_path = "results/mismatch/ratio_consensus/%s/%s.events.tsv" % (run, cell)
    if os.path.exists(events_path):
        events = pd.read_csv(events_path, sep="\t", index_col=0)
        size_ok = events["Size"] >= 2
        tc_ok = events["T-C"] >= 2
        nascent_umis_list.append(len(events[size_ok & tc_ok]))
    else:
        nascent_umis_list.append(np.nan)
dat["UMIs.2Reads.Nascent.2TCs"] = nascent_umis_list
# Relative to the UMIs.2Reads column added by the duplicate-reads cell.
dat["UMIs.2Reads.Nascent.2TCs.Ratio"] = dat["UMIs.2Reads.Nascent.2TCs"] / dat["UMIs.2Reads"]

In [21]:
# Nascent gene number
#
# From the min_read_2_min_tc_2 tables: count distinct index labels with
# Total > 0, and among those the ones that also have Nascent > 0. Labels
# containing an underscore are skipped in both counts. Rows whose table does
# not exist keep NaN for both.

genes_list = []
nascent_genes_list = []
for run, cell in dat[["Run", "Cell"]].values:
    quant_path = "results/expression/quant_genes/min_read_2_min_tc_2/%s/%s.tsv" % (run, cell)
    if not os.path.exists(quant_path):
        genes_list.append(np.nan)
        nascent_genes_list.append(np.nan)
        continue
    quant = pd.read_csv(quant_path, sep="\t", header=0, index_col=0)
    detected = quant[quant["Total"] > 0]
    with_nascent = detected[detected["Nascent"] > 0]
    genes_list.append(len({label for label in detected.index if "_" not in label}))
    nascent_genes_list.append(len({label for label in with_nascent.index if "_" not in label}))
dat["Genes.2Reads"] = genes_list
dat["Genes.2Reads.Nascent.2TCs"] = nascent_genes_list

In [22]:
# Write the accumulated table as a tab-separated file without the row index;
# the "Number of cells" cell reads this file back.
dat.to_csv("reports/FLAIRseq_Summary.tsv", index=False, sep="\t")

# Number of cells

In [2]:
# Reload the summary table, keep cells with at least 5000 UMIs, and report how
# many remain in each group.
d = pd.read_csv("reports/FLAIRseq_Summary.tsv", sep="\t")
# Disabled filter, kept for reference: d = d[np.isnan(d["ActD"])]
d = d[d["UMIs"] >= 5000]
df1 = d[d["Group"] == "K562"]
df2 = d[d["Group"] == "mESC"]
df3 = d[d["Group"] == "MouseBlastocyst"]
for label, group_cells in (("K562:", df1), ("mESC:", df2), ("Blastocyst:", df3)):
    print(label, len(group_cells))

# One table per group: a row for every (s4U, Time) combination giving the cell
# count, then the cells whose ActD value is NaN, then those where it is not.
rule = "-" * 40
for df in [df1, df2, df3]:
    print()
    print(rule)
    print("4sU\tTime\tCells\tActD(-)\tActD(+)")
    print(rule)
    for s4u, s4u_cells in df.groupby(by="s4U"):
        for time_value, time_cells in s4u_cells.groupby(by="Time"):
            actd_missing = np.isnan(time_cells["ActD"])
            print(s4u, time_value, len(time_cells), sum(actd_missing), sum(~actd_missing), sep="\t")

K562: 878
mESC: 255
Blastocyst: 2661

----------------------------------------
4sU	Time	Cells	ActD(-)	ActD(+)
----------------------------------------
0	3.0	339	190	149
50	0.25	32	32	0
50	0.5	51	51	0
50	1.0	41	41	0
50	2.0	44	44	0
50	3.0	301	179	122
100	3.0	19	19	0
200	3.0	11	11	0
400	3.0	17	17	0
500	3.0	23	23	0

----------------------------------------
4sU	Time	Cells	ActD(-)	ActD(+)
----------------------------------------
0	3.0	64	64	0
50	3.0	79	79	0
400	3.0	112	112	0

----------------------------------------
4sU	Time	Cells	ActD(-)	ActD(+)
----------------------------------------
0	3.0	153	143	10
100	3.0	10	10	0
200	3.0	42	42	0
300	3.0	34	34	0
400	3.0	2348	2245	103
500	3.0	41	41	0
600	3.0	33	33	0
