In [11]:
import gzip
import os
import subprocess
import numpy as np
import pandas as pd

# Calculate half-life

    Cell lines: K562, mESC

In [13]:
# Per-cell summary table. The cells below filter it on the columns UMIs, Genes,
# CellLine, TC.Ratio, Time, s4U and ActD, and use Run and Cell to locate the
# per-cell count files.
info = pd.read_csv("reports/FLAIRseq_Summary.tsv", sep="\t")

def get_half_life(total, nascent, labeling_time=3):
    """Estimate a half-life from total and nascent counts.

    With NTR = nascent / total, the value returned in the general case is
    ``-labeling_time / log2(1 - NTR)``, i.e. the half-life T that satisfies
    ``1 - NTR == 2 ** (-labeling_time / T)``.

    Parameters
    ----------
    total : number
        Total count.
    nascent : number
        Nascent count.
    labeling_time : number, optional
        Labeling duration; the result is expressed in the same unit.
        Defaults to 3, the value that used to be hard-coded here (it matches
        the ``Time == 3`` cell filter used elsewhere in this notebook).

    Returns
    -------
    float or int
        ``nan`` when ``total == 0`` (no data);
        ``inf`` when ``nascent == 0`` (no turnover observed);
        ``0`` when ``nascent == total`` (everything is nascent);
        otherwise ``-labeling_time / log2(1 - nascent / total)``.
        If ``nascent > total`` the logarithm of a negative number is taken
        and the result is ``nan``.
    """
    if total == 0:
        return np.nan
    if nascent == 0:
        return np.inf
    if nascent == total:
        return 0
    return -labeling_time / np.log2(1 - nascent / total)

In [14]:
# Gene-level half-life estimates from counts pooled over the selected cells.
# Each entry: [cell line, s4U concentration (written as "uM" in the output file name),
#              minimum per-cell TC ratio, gene annotation table].
params = [
    ["K562", 50, 0.008, "/home/chenzonggui/species/homo_sapiens/GRCh38.p13/gencode.v39.genes.tsv"],
    ["mESC", 400, 0.01, "/home/chenzonggui/species/mus_musculus/GRCm38.p6/gencode.vM25.genes.tsv"]
]

for cell_line, s4u, min_tc_ratio, gene_anno_path in params:
    # Select the cells of this cell line that pass every filter at once.
    selected = (
        (info["UMIs"] >= 5000)
        & (info["Genes"] >= 2000)
        & (info["CellLine"] == cell_line)
        & (info["TC.Ratio"] >= min_tc_ratio)
        & (info["Time"] == 3)
        & (info["s4U"] == s4u)
        & info["ActD"].isna()
    )
    d = info[selected]

    # The annotation only depends on the cell line, so read it once instead of
    # once per min_tc iteration.
    anno = pd.read_csv(gene_anno_path, sep="\t", index_col=2)

    for min_tc in [1, 2]:
        print("-" * 80)
        print("Cell line:", cell_line)
        print("Gene annotation:", gene_anno_path)
        print("Min TC count:", min_tc)
        print("Cells:", len(d))

        # One Series per cell, named after the cell, for each count type.
        total_per_cell = []
        nascent_per_cell = []
        for run, cell in d[["Run", "Cell"]].values:
            path = "results/expression/quant_genes/min_read_2_min_tc_%s/%s/%s.tsv" % (min_tc, run, cell)
            counts = pd.read_csv(path, sep="\t", index_col=0)
            total_per_cell.append(counts["Total"].rename(cell))
            nascent_per_cell.append(counts["Nascent"].rename(cell))

        # Rows missing from a cell's table count as 0; then sum over cells.
        total = pd.concat(total_per_cell, axis=1, sort=False).fillna(0).sum(axis=1)
        nascent = pd.concat(nascent_per_cell, axis=1, sort=False).fillna(0).sum(axis=1)

        df = pd.concat([total.rename("Total"), nascent.rename("Nascent")], axis=1, sort=False).fillna(0)
        df.index.name = "GeneID"
        # Series.sum() instead of the builtin sum(), which iterates element by element.
        df["TPM"] = df["Total"] * 1e6 / df["Total"].sum()
        df["NTR"] = df["Nascent"] / df["Total"]
        df["Halflife"] = [get_half_life(v1, v2) for v1, v2 in df[["Total", "Nascent"]].values]
        # "T" duplicates "Halflife"; both columns are written to the report.
        df["T"] = df["Halflife"]
        df["DecayRate"] = np.log(2) / df["Halflife"]
        # NOTE(review): the reason for the /100 scaling is not recorded here - confirm.
        df["SynthesisRate"] = df["TPM"] * df["DecayRate"] / 100
        # Right join: every annotated row is kept, with NaN where no counts exist.
        df = df.merge(anno, left_index=True, right_index=True, how="right")
        df.to_csv("reports/halflife_of_%s_%duM_3h.%dTC.tsv" % (cell_line, s4u, min_tc), sep="\t")

--------------------------------------------------------------------------------
Cell line: K562
Gene annotation: /home/chenzonggui/species/homo_sapiens/GRCh38.p13/gencode.v39.genes.tsv
Min TC count: 1
Cells: 97
--------------------------------------------------------------------------------
Cell line: K562
Gene annotation: /home/chenzonggui/species/homo_sapiens/GRCh38.p13/gencode.v39.genes.tsv
Min TC count: 2
Cells: 97
--------------------------------------------------------------------------------
Cell line: mESC
Gene annotation: /home/chenzonggui/species/mus_musculus/GRCm38.p6/gencode.vM25.genes.tsv
Min TC count: 1
Cells: 102
--------------------------------------------------------------------------------
Cell line: mESC
Gene annotation: /home/chenzonggui/species/mus_musculus/GRCm38.p6/gencode.vM25.genes.tsv
Min TC count: 2
Cells: 102


In [18]:
# Transcript-level half-life estimates from counts pooled over the selected cells.
# Each entry: [cell line, s4U concentration (written as "uM" in the output file name),
#              minimum per-cell TC ratio, transcript annotation table].
params = [
    ["K562", 50, 0.008, "/home/chenzonggui/species/homo_sapiens/GRCh38.p13/gencode.v39.annotation.tsv"],
    ["mESC", 400, 0.01, "/home/chenzonggui/species/mus_musculus/GRCm38.p6/gencode.vM25.annotation.tsv"]
]

for cell_line, s4u, min_tc_ratio, gene_anno_path in params:
    # Select the cells of this cell line that pass every filter at once.
    selected = (
        (info["UMIs"] >= 5000)
        & (info["Genes"] >= 2000)
        & (info["CellLine"] == cell_line)
        & (info["TC.Ratio"] >= min_tc_ratio)
        & (info["Time"] == 3)
        & (info["s4U"] == s4u)
        & info["ActD"].isna()
    )
    d = info[selected]

    # The annotation only depends on the cell line, so read it once instead of
    # once per min_tc iteration.
    anno = pd.read_csv(gene_anno_path, sep="\t", index_col=0)

    for min_tc in [1, 2]:
        print("-" * 80)
        print("Cell line:", cell_line)
        print("Gene annotation:", gene_anno_path)
        print("Min TC count:", min_tc)
        print("Cells:", len(d))

        # One Series per cell, named after the cell, for each count type.
        total_per_cell = []
        nascent_per_cell = []
        for run, cell in d[["Run", "Cell"]].values:
            path = "results/expression/quant_isoforms/min_read_2_min_tc_%s/%s/%s.tsv" % (min_tc, run, cell)
            counts = pd.read_csv(path, sep="\t", index_col=0)
            total_per_cell.append(counts["Total"].rename(cell))
            nascent_per_cell.append(counts["Nascent"].rename(cell))

        # Rows missing from a cell's table count as 0; then sum over cells.
        total = pd.concat(total_per_cell, axis=1, sort=False).fillna(0).sum(axis=1)
        nascent = pd.concat(nascent_per_cell, axis=1, sort=False).fillna(0).sum(axis=1)

        df = pd.concat([total.rename("Total"), nascent.rename("Nascent")], axis=1, sort=False).fillna(0)
        df.index.name = "TranscriptID"
        # Series.sum() instead of the builtin sum(), which iterates element by element.
        df["TPM"] = df["Total"] * 1e6 / df["Total"].sum()
        df["NTR"] = df["Nascent"] / df["Total"]
        df["T"] = [get_half_life(v1, v2) for v1, v2 in df[["Total", "Nascent"]].values]
        # Right join: every annotated row is kept, with NaN where no counts exist.
        df = df.merge(anno, left_index=True, right_index=True, how="right")
        df.to_csv("reports/transcript_based.halflife_of_%s_%duM_3h.%dTC.tsv" % (cell_line, s4u, min_tc), sep="\t")

--------------------------------------------------------------------------------
Cell line: K562
Gene annotation: /home/chenzonggui/species/homo_sapiens/GRCh38.p13/gencode.v39.annotation.tsv
Min TC count: 1
Cells: 97
--------------------------------------------------------------------------------
Cell line: K562
Gene annotation: /home/chenzonggui/species/homo_sapiens/GRCh38.p13/gencode.v39.annotation.tsv
Min TC count: 2
Cells: 97
--------------------------------------------------------------------------------
Cell line: mESC
Gene annotation: /home/chenzonggui/species/mus_musculus/GRCm38.p6/gencode.vM25.annotation.tsv
Min TC count: 1
Cells: 102
--------------------------------------------------------------------------------
Cell line: mESC
Gene annotation: /home/chenzonggui/species/mus_musculus/GRCm38.p6/gencode.vM25.annotation.tsv
Min TC count: 2
Cells: 102


# All cells, control cells, and treatment cells

In [17]:
# Gene-based merged counts for three cell groups per cell line.
cell_lines = ["K562", "mESC"]

# Per cell line: (s4U value of the labeled cells, minimum TC ratio required of
# treatment cells, gene annotation table).
group_settings = {
    "K562": (50, 0.008, "/home/chenzonggui/species/homo_sapiens/GRCh38.p13/gencode.v39.genes.tsv"),
    "mESC": (400, 0.01, "/home/chenzonggui/species/mus_musculus/GRCm38.p6/gencode.vM25.genes.tsv"),
}

for cell_line in cell_lines:
    if cell_line not in group_settings:
        raise ValueError("Unknown cell line: %s" % cell_line)
    labeled_s4u, treatment_min_tc_ratio, anno_path = group_settings[cell_line]

    # The annotation and the base filter only depend on the cell line, so they
    # are computed once here rather than once per group.
    anno = pd.read_csv(anno_path, sep="\t", index_col=2)
    candidates = info[
        (info["CellLine"] == cell_line)
        & (info["Time"] == 3)
        & (info["ActD"].isna())
        & (info["UMIs"] >= 5000)
    ]

    for group in ["all_cells", "control_cells", "treatment_cells"]:
        if group == "all_cells":
            d = candidates[(candidates["s4U"] == 0) | (candidates["s4U"] == labeled_s4u)]
        elif group == "control_cells":
            d = candidates[candidates["s4U"] == 0]
        elif group == "treatment_cells":
            d = candidates[(candidates["TC.Ratio"] >= treatment_min_tc_ratio) & (candidates["s4U"] == labeled_s4u)]
        else:
            raise ValueError("Unknown group: %s" % group)

        print(cell_line, group, len(d), sep="\t")

        for tc in [1, 2]:
            outfile1 = "reports/gene_based/%s.%s.min_read_2_min_tc_%d.filelist.txt" % (cell_line, group, tc)
            outfile2 = "reports/gene_based/%s.%s.min_read_2_min_tc_%d.tsv" % (cell_line, group, tc)
            outfile3 = "reports/gene_based/%s.%s.min_read_2_min_tc_%d.annotated.tsv" % (cell_line, group, tc)
            try:
                # List of per-cell count tables handed to the merge script.
                with open(outfile1, "w") as fw:
                    for run, cell in d[["Run", "Cell"]].values:
                        path = "results/expression/quant_genes/min_read_2_min_tc_%d/%s/%s.tsv" % (tc, run, cell)
                        fw.write(path + "\n")
                # Argument list instead of an interpolated shell string: no shell
                # parsing, so paths are passed through verbatim.
                # NOTE(review): merge_counts.py is presumed to write a table with
                # "Total" and "Nascent" columns to outfile2 - confirm against the script.
                subprocess.check_call(["./scripts/merge_counts.py", outfile1, outfile2])
                dat = pd.read_csv(outfile2, sep="\t", header=0, index_col=0)
                dat["NTR"] = dat["Nascent"] / dat["Total"]
                dat["TPM"] = dat["Total"] * 1e6 / dat["Total"].sum()
                dat["T"] = [get_half_life(total, nascent) for total, nascent in dat[["Total", "Nascent"]].values]
                # Left join: only rows present in the merged counts are kept.
                dat = dat.merge(anno, left_index=True, right_index=True, how="left")
                dat.to_csv(outfile3, sep="\t")
            finally:
                # Remove the temporary files even when a step above fails.
                for tmp_path in (outfile1, outfile2):
                    if os.path.exists(tmp_path):
                        os.remove(tmp_path)

K562	all_cells	369
K562	control_cells	190
K562	treatment_cells	105
mESC	all_cells	176
mESC	control_cells	64
mESC	treatment_cells	110


In [12]:
# Transcript-based merged counts for three cell groups per cell line.
cell_lines = ["K562", "mESC"]

# Per cell line: (s4U value of the labeled cells, minimum TC ratio required of
# treatment cells, transcript annotation table).
group_settings = {
    "K562": (50, 0.008, "/home/chenzonggui/species/homo_sapiens/GRCh38.p13/gencode.v39.annotation.tsv"),
    "mESC": (400, 0.01, "/home/chenzonggui/species/mus_musculus/GRCm38.p6/gencode.vM25.annotation.tsv"),
}

for cell_line in cell_lines:
    if cell_line not in group_settings:
        raise ValueError("Unknown cell line: %s" % cell_line)
    labeled_s4u, treatment_min_tc_ratio, anno_path = group_settings[cell_line]

    # The annotation and the base filter only depend on the cell line, so they
    # are computed once here rather than once per group.
    anno = pd.read_csv(anno_path, sep="\t", index_col=0)
    candidates = info[
        (info["CellLine"] == cell_line)
        & (info["Time"] == 3)
        & (info["ActD"].isna())
        & (info["UMIs"] >= 5000)
    ]

    for group in ["all_cells", "control_cells", "treatment_cells"]:
        if group == "all_cells":
            d = candidates[(candidates["s4U"] == 0) | (candidates["s4U"] == labeled_s4u)]
        elif group == "control_cells":
            d = candidates[candidates["s4U"] == 0]
        elif group == "treatment_cells":
            d = candidates[(candidates["s4U"] == labeled_s4u) & (candidates["TC.Ratio"] >= treatment_min_tc_ratio)]
        else:
            raise ValueError("Unknown group: %s" % group)

        print(cell_line, group, len(d), sep="\t")

        for tc in [1, 2]:
            outfile1 = "reports/transcript_based/%s.%s.min_read_2_min_tc_%d.filelist.txt" % (cell_line, group, tc)
            outfile2 = "reports/transcript_based/%s.%s.min_read_2_min_tc_%d.tsv" % (cell_line, group, tc)
            outfile3 = "reports/transcript_based/%s.%s.min_read_2_min_tc_%d.annotated.tsv" % (cell_line, group, tc)
            try:
                # List of per-cell count tables handed to the merge script.
                with open(outfile1, "w") as fw:
                    for run, cell in d[["Run", "Cell"]].values:
                        path = "results/expression/quant_isoforms/min_read_2_min_tc_%d/%s/%s.tsv" % (tc, run, cell)
                        fw.write(path + "\n")
                # Argument list instead of an interpolated shell string: no shell
                # parsing, so paths are passed through verbatim.
                # NOTE(review): merge_counts.py is presumed to write a table with
                # "Total" and "Nascent" columns to outfile2 - confirm against the script.
                subprocess.check_call(["./scripts/merge_counts.py", outfile1, outfile2])
                dat = pd.read_csv(outfile2, sep="\t", header=0, index_col=0)
                dat["NTR"] = dat["Nascent"] / dat["Total"]
                dat["TPM"] = dat["Total"] * 1e6 / dat["Total"].sum()
                dat["T"] = [get_half_life(total, nascent) for total, nascent in dat[["Total", "Nascent"]].values]
                # Left join: only rows present in the merged counts are kept.
                dat = dat.merge(anno, left_index=True, right_index=True, how="left")
                dat.to_csv(outfile3, sep="\t")
            finally:
                # Remove the temporary files even when a step above fails.
                for tmp_path in (outfile1, outfile2):
                    if os.path.exists(tmp_path):
                        os.remove(tmp_path)

K562	all_cells	369
K562	control_cells	190
K562	treatment_cells	105
mESC	all_cells	176
mESC	control_cells	64
mESC	treatment_cells	110
