In [1]:
import os
from collections import Counter
import numpy as np
import pandas as pd

# Number of cells

In [2]:
# Tabulate how many NASC-seq cells from runs whose identifier starts with
# "2022" exist for every (s4U, Time) combination.
dat = pd.read_excel("data/NASCseq.xlsx")
is_nascseq = dat["Method"] == "NASC-seq"
dat = dat[is_nascseq]
# str() because the Run column is not guaranteed to be read back as text.
is_2022_run = [str(run).startswith("2022") for run in dat["Run"]]
dat = dat[is_2022_run]
counter = Counter(map(tuple, dat[["s4U", "Time"]].values))
print("s4U\tTime\tCount")
for condition, count in sorted(counter.items()):
    s4u, time = condition
    print("%s\t%s\t%s" % (s4u, time, count))

s4U	Time	Count
0.0	3.0	39
50.0	2.0	10
50.0	3.0	38
100.0	2.0	10
100.0	3.0	12
200.0	2.0	12
200.0	3.0	9
500.0	2.0	13


# Report summary

In [3]:
def str2int(s):
    """Parse an integer written with comma group separators, e.g. "1,234" -> 1234."""
    digits = "".join(s.split(","))
    return int(digits)
    
# Re-read the whole sheet: the Method/Run filtering applied to `dat` in the
# cell-count section is discarded here, so the summary is built on every row.
dat = pd.read_excel("data/NASCseq.xlsx")

In [4]:
# trimming
# For every (Run, Cell) row, read the cutadapt log and pull out the number of
# reads (or read pairs) processed and the number written after filtering.
total_counts = []    # total reads
trimmed_counts = []  # trimmed reads
for run, cell in dat[["Run", "Cell"]].values:
    n_total, n_trimmed = 0, 0  # remain 0 when the log or the line is absent
    log_path = "results/prepare/cutadapt/%s/%s.log" % (run, cell)
    if os.path.exists(log_path):
        with open(log_path) as handle:
            for text in handle:
                fields = text.strip("\n").split()
                # The same two statistics are matched under both their "reads" and "pairs" wording.
                if "Total reads processed:" in text or "Total read pairs processed:" in text:
                    n_total = str2int(fields[-1])
                if "Reads written (passing filters):" in text or "Pairs written (passing filters):" in text:
                    # The last field is skipped; the count is the one before it.
                    n_trimmed = str2int(fields[-2])
    total_counts.append(n_total)
    trimmed_counts.append(n_trimmed)
dat["Total.Reads"] = total_counts
dat["Trimmed.Reads"] = trimmed_counts
dat["Trimmed.Reads.Ratio"] = dat["Trimmed.Reads"].div(dat["Total.Reads"])

In [6]:
# mapping
# For every (Run, Cell) row, read the Log.final.out file under
# results/mapping/star and collect the input and uniquely mapped read counts.
input_counts = []   # reads
unique_counts = []  # uniq mapped
for run, cell in dat[["Run", "Cell"]].values:
    n_input = n_unique = 0  # remain 0 when the log or the line is absent
    log_path = "results/mapping/star/%s/%s/%s.Log.final.out" % (run, cell, cell)
    if os.path.exists(log_path):
        with open(log_path) as handle:
            for text in handle:
                if "Number of input reads" in text:
                    n_input = int(text.strip().split()[-1])
                if "Uniquely mapped reads number" in text:
                    n_unique = int(text.strip().split()[-1])
    input_counts.append(n_input)
    unique_counts.append(n_unique)
# Fraction of trimmed reads that never reached the mapper.
# NOTE(review): presumably these are reads removed as ribosomal RNA between
# trimming and mapping - confirm against the pipeline.
mapper_input = np.array(input_counts)
dat["RiboRNA.Ratio"] = 1 - mapper_input / dat["Trimmed.Reads"]
dat["Clean.Reads"] = input_counts
dat["UniqMapped.Reads"] = unique_counts
dat["UniqMapped.Ratio"] = dat["UniqMapped.Reads"].div(dat["Clean.Reads"])

In [7]:
# filtering
# For every (Run, Cell, Layout) row, take the leading integer of the
# "in total" line of the flagstat report as the filtered read count.
vs = []
for run, cell, layout in dat[["Run", "Cell", "Layout"]].values:
    reads = 0  # stays 0 when the report is missing or has no "in total" line
    path = "results/mapping/filtered/%s/%s.flagstat" % (run, cell)
    if os.path.exists(path):
        # Context manager so the handle is closed after each file instead of
        # being left for the garbage collector (one leaked handle per cell).
        with open(path) as f:
            for line in f:
                if "in total" in line:
                    reads = int(line.split()[0])
    if layout == "PE":
        # Halve the count for paired-end layouts (presumably two records per
        # fragment - confirm). Floor division keeps the arithmetic in exact
        # integers rather than round-tripping through a float.
        reads = reads // 2
    vs.append(reads)
dat["Filtered.Reads"] = vs

In [8]:
# remove duplicates
# For every (Run, Cell, Layout) row, take the leading integer of the
# "in total" line of the post-rmdup flagstat report as the unique read count.
vs = []
for run, cell, layout in dat[["Run", "Cell", "Layout"]].values:
    reads = 0  # stays 0 when the report is missing or has no "in total" line
    path = "results/mapping/rmdup/%s/%s.flagstat" % (run, cell)
    if os.path.exists(path):
        # Context manager so the handle is closed after each file instead of
        # being left for the garbage collector (one leaked handle per cell).
        with open(path) as f:
            for line in f:
                if "in total" in line:
                    reads = int(line.split()[0])
    if layout == "PE":
        # Halve the count for paired-end layouts (presumably two records per
        # fragment - confirm). Floor division keeps the arithmetic in exact
        # integers rather than round-tripping through a float.
        reads = reads // 2
    vs.append(reads)
dat["Uniq.Reads"] = vs
# Share of the filtered reads that remain after duplicate removal.
dat["Uniq.Ratio"] = dat["Uniq.Reads"] / dat["Filtered.Reads"]

In [9]:
# mark strand
# For every (Run, Cell) row, read the first data row of the marked-strand
# table: columns 1 and 2 are stored as the forward / reverse counts, and the
# ratio is (col1 + col2) divided by the sum of columns 1 through 4.
forward_counts, reverse_counts, stranded_ratios = [], [], []
for run, cell in dat[["Run", "Cell"]].values:
    pos = neg = strand_perc = 0  # defaults used when the table is missing
    table_path = "results/mapping/marked_strand/%s/%s.tsv" % (run, cell)
    if os.path.exists(table_path):
        first_row = pd.read_csv(table_path, sep="\t").iloc[0].values
        pos = first_row[1]
        neg = first_row[2]
        strand_perc = np.divide(sum(first_row[1:3]), sum(first_row[1:5]))
    forward_counts.append(pos)
    reverse_counts.append(neg)
    stranded_ratios.append(strand_perc)
dat["Stranded.Reads"] = np.array(forward_counts) + np.array(reverse_counts)
dat["Forward"] = forward_counts
dat["Reverse"] = reverse_counts
dat["Stranded.Ratio"] = stranded_ratios

In [10]:
# mismatch ratio
# The 12 ordered pairs of distinct bases, in the order "AC", "AG", ..., "TG".
mtypes = ["%s%s" % (b1, b2) for b1 in "ACGT" for b2 in "ACGT" if b1 != b2]
ratio_rows = []
for run, cell in dat[["Run", "Cell"]].values:
    # No existence check here: a missing per-cell table raises.
    table = pd.read_csv(
        "results/mismatch/ratio/%s/%s.tsv" % (run, cell), sep="\t", index_col=0
    )
    # One value per mismatch type, looked up by the table's index label.
    ratio_rows.append([table.loc[mt]["Ratio.NoSNP"] for mt in mtypes])
ratio_columns = ["%s.Ratio" % mt for mt in mtypes]
ratio_frame = pd.DataFrame(ratio_rows, columns=ratio_columns)
# Column assignment aligns on the index, so row i of ratio_frame lands on
# index label i of dat.
for name in ratio_columns:
    dat[name] = ratio_frame[name]

In [11]:
# genes
# Load one gene-level table per (Run, Cell) row and stack them in row order.
gene_tables = [
    pd.read_csv("results/expression/genes/%s/%s.tsv" % (run, cell), sep="\t")
    for run, cell in dat[["Run", "Cell"]].values
]
gene_stats = pd.concat(gene_tables, axis=0, ignore_index=True)
# Copy every column across with a "Genes." prefix. Assignment aligns on the
# index - assumes each per-cell file holds exactly one row so that row i of
# the stack belongs to row i of dat - TODO confirm.
for column in gene_stats.columns:
    dat["Genes.%s" % column] = gene_stats[column]

In [14]:
# Pe and Pc
# Load one table per (Run, Cell) row and stack them in row order.
pc_tables = [
    pd.read_csv("results/signal2noise/pc/%s/%s.tsv" % (run, cell), sep="\t")
    for run, cell in dat[["Run", "Cell"]].values
]
pc_stats = pd.concat(pc_tables, axis=0, ignore_index=True)
# The three columns are renamed positionally, whatever the files call them.
pc_stats.columns = ["Pe", "Pc", "Pc_Pe"]
# Assignment aligns on the index - assumes each per-cell file holds exactly
# one row so that row i of the stack belongs to row i of dat - TODO confirm.
for column in pc_stats.columns:
    dat[column] = pc_stats[column]

In [16]:
# Write the per-cell summary as a tab-separated file without the index column.
# to_csv does not create missing parent directories, so make sure reports/
# exists first; exist_ok keeps this a no-op on re-runs.
os.makedirs("reports", exist_ok=True)
dat.to_csv("reports/NASCseq_Summary.tsv", index=False, sep="\t")