In [1]:
import os
import subprocess
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the per-cell metadata table and keep only the cells used downstream.
d = pd.read_csv("../8_embryo/results/seurat_transcripts/metadata_and_celltype.consistent_celltype.tsv", sep="\t")

# Row filter: s4U == 400, Time == 3, no ActD value, and TCRatio of at least 0.002.
condition_mask = (
    (d["s4U"] == 400)
    & (d["Time"] == 3)
    & d["ActD"].isna()
    & (d["TCRatio"] >= 0.002)
)
d = d[condition_mask]

# Keep the four cell types of interest.
# NOTE(review): "poralTE" is kept verbatim; presumably it matches the label
# spelling used in the metadata file -- confirm before "correcting" it.
selected_celltypes = ["muralTE", "poralTE", "EPI", "PE"]
d = d[d["CellType"].isin(selected_celltypes)]

In [3]:
# Gene annotation table; used to build GeneID -> GeneType and GeneID -> GeneName lookups.
anno_g = pd.read_csv("/home/chenzonggui/species/mus_musculus/GRCm38.p6/gencode.vM25.genes.tsv", sep="\t")

# If a GeneID occurs more than once, the last row wins (same as filling the dicts row by row).
gid2gtype = dict(zip(anno_g["GeneID"], anno_g["GeneType"]))
gid2gname = dict(zip(anno_g["GeneID"], anno_g["GeneName"]))

In [5]:
# Transcript annotation table, indexed by its first column (used below as the transcript ID).
anno = pd.read_csv("../8_embryo/results/mouse_transcript_annotation.tsv", sep="\t", header=0, index_col=0)

# Transcript ID -> transcript name. Rows with a missing name are skipped, so
# that later `tid2tname.get(tid, tid)` lookups fall back to the ID itself.
# `pd.notna` treats NaN and None alike and never raises on non-float values.
tid2tname = {
    tid: tname
    for tid, tname in zip(anno.index, anno["TranscriptName"])
    if pd.notna(tname)
}

In [10]:
# Build gene-level count matrices (total and nascent) for the cells selected in `d`.
#   outfile1     temporary list of per-cell quantification files (removed afterwards)
#   outfile2/3/4 total matrix, nascent matrix and cell metadata, rows keyed by gene ID
#   outfile5/6/7 the same three tables, rows keyed by gene name
outfile1 = "results/blastocyst_counts.genes.filelist.txt"
outfile2 = "results/blastocyst_counts.genes.gene_id.total.tsv"
outfile3 = "results/blastocyst_counts.genes.gene_id.nascent.tsv"
outfile4 = "results/blastocyst_counts.genes.gene_id.meta.tsv"
outfile5 = "results/blastocyst_counts.genes.gene_name.total.tsv"
outfile6 = "results/blastocyst_counts.genes.gene_name.nascent.tsv"
outfile7 = "results/blastocyst_counts.genes.gene_name.meta.tsv"

# The output directory must exist before anything is written into it.
os.makedirs(os.path.dirname(outfile2), exist_ok=True)

# Stage 1: gene-ID matrices. Rebuild when ANY of the three outputs is missing,
# not only the first one, so a partially deleted result set is regenerated.
id_outputs = (outfile2, outfile3, outfile4)
rebuild_ids = not all(os.path.exists(p) for p in id_outputs)
if rebuild_ids:
    try:
        with open(outfile1, "w") as fw:
            for run, cell in d[["Run", "Cell"]].values:
                path = "../../1_NanoNASCseq/results/expression/quant_genes/min_read_2_min_tc_2/%s/%s.tsv" % (run, cell)
                fw.write(path + "\n")
        # All interpolated paths are fixed literals defined above, so the shell string is safe here.
        cmd = "../../1_NanoNASCseq/scripts/make_count_matrix.py %s %s %s" % (outfile1, outfile2, outfile3)
        subprocess.check_call(cmd, shell=True)
        d.to_csv(outfile4, sep="\t", index=False)
    finally:
        # Remove the temporary file list even when the external script fails.
        if os.path.exists(outfile1):
            os.remove(outfile1)

# Stage 2: relabel rows by gene name. Also rerun whenever stage 1 was rebuilt,
# otherwise the gene-name tables would be stale relative to the gene-ID tables.
name_outputs = (outfile5, outfile6, outfile7)
if rebuild_ids or not all(os.path.exists(p) for p in name_outputs):
    for src, dst in ((outfile2, outfile5), (outfile3, outfile6)):
        m = pd.read_csv(src, sep="\t", header=0, index_col=0)
        # Gene IDs without an entry in gid2gname keep their ID as the row label.
        m.index = [gid2gname.get(x, x) for x in m.index]
        m.index.name = "GeneName"
        # When several gene IDs share one name, only the first row is kept
        # (later rows are dropped, not summed).
        m = m[~m.index.duplicated()]
        m.to_csv(dst, sep="\t")

    d.to_csv(outfile7, sep="\t", index=False)

In [10]:
# Build transcript-level count matrices (total and nascent) for the cells selected in `d`.
#   outfile1     temporary list of per-cell quantification files (removed afterwards)
#   outfile2/3/4 total matrix, nascent matrix and cell metadata, rows keyed by transcript ID
#   outfile5/6/7 the same three tables, rows keyed by transcript name
outfile1 = "results/blastocyst_counts.transcripts.filelist.txt"
outfile2 = "results/blastocyst_counts.transcripts.transcript_id.total.tsv"
outfile3 = "results/blastocyst_counts.transcripts.transcript_id.nascent.tsv"
outfile4 = "results/blastocyst_counts.transcripts.transcript_id.meta.tsv"
outfile5 = "results/blastocyst_counts.transcripts.transcript_name.total.tsv"
outfile6 = "results/blastocyst_counts.transcripts.transcript_name.nascent.tsv"
outfile7 = "results/blastocyst_counts.transcripts.transcript_name.meta.tsv"

# The output directory must exist before anything is written into it.
os.makedirs(os.path.dirname(outfile2), exist_ok=True)

# Stage 1: transcript-ID matrices. Rebuild when ANY of the three outputs is
# missing, not only the first one, so a partial result set is regenerated.
id_outputs = (outfile2, outfile3, outfile4)
rebuild_ids = not all(os.path.exists(p) for p in id_outputs)
if rebuild_ids:
    try:
        with open(outfile1, "w") as fw:
            for run, cell in d[["Run", "Cell"]].values:
                path = "../../1_NanoNASCseq/results/expression/novel/quant_isoforms/min_read_2_min_tc_2/%s/%s.tsv" % (run, cell)
                fw.write(path + "\n")
        # All interpolated paths are fixed literals defined above, so the shell string is safe here.
        cmd = "../../1_NanoNASCseq/scripts/make_count_matrix.py %s %s %s" % (outfile1, outfile2, outfile3)
        subprocess.check_call(cmd, shell=True)
        d.to_csv(outfile4, sep="\t", index=False)
    finally:
        # Remove the temporary file list even when the external script fails.
        if os.path.exists(outfile1):
            os.remove(outfile1)

# Stage 2: relabel rows by transcript name. Also rerun whenever stage 1 was
# rebuilt, otherwise the name-keyed tables would be stale.
name_outputs = (outfile5, outfile6, outfile7)
if rebuild_ids or not all(os.path.exists(p) for p in name_outputs):
    for src, dst in ((outfile2, outfile5), (outfile3, outfile6)):
        m = pd.read_csv(src, sep="\t", header=0, index_col=0)
        # Transcript IDs without an entry in tid2tname keep their ID as the row label.
        # Unlike the gene-level tables, duplicated names are not removed here.
        m.index = [tid2tname.get(x, x) for x in m.index]
        m.index.name = "TranscriptName"
        m.to_csv(dst, sep="\t")

    d.to_csv(outfile7, sep="\t", index=False)

# Cell cycle genes

In [None]:
# Gene symbols for the two cell-cycle gene sets used below: `genes_s` feeds the
# "cellcycle_s" output and `genes_g2m` the "cellcycle_g2m" output.
# NOTE(review): presumably mouse symbols of a published S / G2M marker list --
# TODO record the source of these lists.
genes_s = [
    "Mcm5", "Pcna", "Tyms", "Fen1", "Mcm2",
    "Mcm4", "Rrm1", "Ung", "Gins2", "Mcm6",
    "Cdca7", "Dtl", "Prim1", "Uhrf1", "Cenpu",
    "Hells", "Rfc2", "Rpa2", "Nasp", "Rad51ap1",
    "Gmnn", "Wdr76", "Slbp", "Ccne2", "Ubr7",
    "Pold3", "Msh2", "Atad2", "Rad51", "Rrm2",
    "Cdc45", "Cdc6", "Exo1", "Tipin", "Dscc1",
    "Blm", "Casp8ap2", "Usp1", "Clspn", "Pola1",
    "Chaf1b", "Brip1", "E2f8",
]

genes_g2m = [
    "Hmgb2", "Cdk1", "Nusap1", "Ube2c", "Birc5",
    "Tpx2", "Top2a", "Ndc80", "Cks2", "Nuf2",
    "Cks1b", "Mki67", "Tmpo", "Cenpf", "Tacc3",
    "4930401O10Rik", "Smc4", "Ccnb2", "Ckap2l", "Ckap2",
    "Aurkb", "Bub1", "Kif11", "Anp32e", "Tubb4b",
    "Gtse1", "Kif20b", "Hjurp", "Cdca3", "Jpt1",
    "Cdc20", "Ttk", "Cdc25c", "Kif2c", "Rangap1",
    "Ncapd2", "Dlgap5", "Cdca2", "Cdca8", "Ect2",
    "Kif23", "Hmmr", "Aurka", "Psrc1", "Anln",
    "Lbr", "Ckap5", "Cenpe", "Ctcf", "Nek2",
    "G2e3", "Gas2l3", "Cbx5", "Cenpa",
]

In [62]:
# Row labels of the transcript-name total matrix; used as a membership set
# when picking cell-cycle transcripts.
mtx = pd.read_csv(
    "results/blastocyst_counts.transcripts.transcript_name.total.tsv",
    sep="\t",
    index_col=0,
)
expressed_isoforms = {isoform for isoform in mtx.index}

In [63]:
# Collect transcript names (present in the count matrix) whose parent gene is
# in the S list (tnames1) or the G2M list (tnames2).
s_gene_set = set(genes_s)
g2m_gene_set = set(genes_g2m)

tnames1 = []
tnames2 = []
for tname in anno["TranscriptName"]:
    # Missing names (NaN) are never in the set of matrix row labels, so they
    # are skipped here before any string operation is attempted.
    if tname not in expressed_isoforms:
        continue
    # Derive the gene name from the transcript name:
    #   names containing "novel": text before "novel", minus one separator character
    #   other names: drop the last four characters
    #   (assumes a suffix such as "-201" -- TODO confirm naming scheme)
    gname = tname.split("novel")[0][:-1] if "novel" in tname else tname[:-4]
    if gname in s_gene_set:
        tnames1.append(tname)
    if gname in g2m_gene_set:
        tnames2.append(tname)

In [66]:
# Write the S-list transcript names, one per line.
with open("results/cellcycle_s.transcript_name.tsv", "w+") as fw:
    fw.writelines(name + "\n" for name in tnames1)

In [67]:
# Write the G2M-list transcript names, one per line.
with open("results/cellcycle_g2m.transcript_name.tsv", "w+") as fw:
    fw.writelines(name + "\n" for name in tnames2)

In [71]:
# Subset the transcript-name matrices to the cell-cycle transcripts:
# S-list rows first, then G2M-list rows. `.loc` raises KeyError if a name is absent.
cellcycle_tnames = tnames1 + tnames2

m1 = pd.read_csv(
    "results/blastocyst_counts.transcripts.transcript_name.total.tsv",
    sep="\t",
    index_col=0,
).loc[cellcycle_tnames]

m2 = pd.read_csv(
    "results/blastocyst_counts.transcripts.transcript_name.nascent.tsv",
    sep="\t",
    index_col=0,
).loc[cellcycle_tnames]

In [72]:
# Save the cell-cycle subsets: total counts (m1) first, then nascent counts (m2).
for matrix, out_path in (
    (m1, "results/blastocyst_counts.cellcycle.transcripts.transcript_name.total.tsv"),
    (m2, "results/blastocyst_counts.cellcycle.transcripts.transcript_name.nascent.tsv"),
):
    matrix.to_csv(out_path, sep="\t")