# Common imports

In [5]:
import os
import json
import gzip
from pyBioInfo.IO.File import BedFile, GtfFile, GtfGeneBuilder, GtfTranscriptBuilder
from pyBioInfo.Utils import ShiftLoader
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from matplotlib import rcParams
# Use Arial for all text in figures produced by this notebook.
rcParams["font.family"] = "arial"
# Sample names; they are substituted into the per-sample input paths below and
# used as the index of the summary tables.
samples = ["Adult_Female", "Adult_Male", "Juvenile"]

# 2021-03-24 Summary

In [358]:
def get_raw_read(sample):
    """Return the "ZMWs input" count from a sample's CCS report.

    Reads ``results/isoseq/ccs/<sample>.ccs_report.txt``, takes the first line
    containing "ZMWs input" and converts its last whitespace-separated token
    to ``int``. Returns ``None`` when no line matches.
    """
    report = "results/isoseq/ccs/%s.ccs_report.txt" % sample
    with open(report) as handle:
        hit = next((row for row in handle if "ZMWs input" in row), None)
    if hit is None:
        return None
    return int(hit.strip("\n").split()[-1])
            
def get_ccs_read(sample):
    """Return the "ZMWs pass filters" count from a sample's CCS report.

    Reads ``results/isoseq/ccs/<sample>.ccs_report.txt``; on the first line
    containing "ZMWs pass filters" the second-to-last whitespace-separated
    token is converted to ``int`` (the last token on that line is skipped).
    Returns ``None`` when no line matches.
    """
    marker = "ZMWs pass filters"
    with open("results/isoseq/ccs/%s.ccs_report.txt" % sample) as report:
        for row in report:
            if marker not in row:
                continue
            fields = row.strip("\n").split()
            return int(fields[-2])

def get_lima_read(sample):
    """Return the "ZMWs above all thresholds" count from a lima summary.

    Reads ``results/isoseq/demux/<sample>.lima.summary``; the second-to-last
    whitespace-separated token of the first matching line is converted to
    ``int``. Returns ``None`` when no line matches.
    """
    summary = "results/isoseq/demux/%s.lima.summary" % sample
    with open(summary) as handle:
        matches = (row for row in handle if "ZMWs above all thresholds" in row)
        first = next(matches, None)
    return None if first is None else int(first.strip("\n").split()[-2])
    
def get_flnc_read(sample):
    """Return ``num_reads_flnc`` from a sample's filter summary.

    Only the first line of ``results/isoseq/flnc/<sample>.filter_summary.json``
    is parsed as JSON, so the file is expected to hold the whole object on one
    line. Returns ``None`` for an empty file.
    """
    summary = "results/isoseq/flnc/%s.filter_summary.json" % sample
    with open(summary) as handle:
        first_line = next(handle, None)
    if first_line is None:
        return None
    return json.loads(first_line)["num_reads_flnc"]
        
def get_flnc_polya_read(sample):
    """Return ``num_reads_flnc_polya`` from a sample's filter summary.

    Only the first line of ``results/isoseq/flnc/<sample>.filter_summary.json``
    is parsed as JSON. Returns ``None`` for an empty file.
    """
    summary = "results/isoseq/flnc/%s.filter_summary.json" % sample
    with open(summary) as handle:
        head = handle.readline()
    if head == "":
        # readline() yields "" only at end of file, i.e. the file had no lines.
        return None
    return json.loads(head)["num_reads_flnc_polya"]
        
def get_cluster(sample):
    """Count the distinct cluster ids in a sample's cluster report.

    Reads ``results/isoseq/clustered/<sample>.cluster_report.csv``, skips the
    first (header) line and returns the number of unique values found in the
    first comma-separated column.
    """
    report = "results/isoseq/clustered/%s.cluster_report.csv" % sample
    with open(report) as handle:
        next(handle, None)  # drop the header row
        cluster_ids = {row.strip("\n").split(",")[0] for row in handle}
    return len(cluster_ids)

def get_aligned(sample):
    """Return the "Total reads:" value from a sample's alignment stats file.

    Reads ``results/isoseq/aligned/<sample>.stats`` and converts the last
    whitespace-separated token of the first line containing "Total reads:"
    to ``int``. Returns ``None`` when no line matches.
    """
    stats_path = "results/isoseq/aligned/%s.stats" % sample
    total = None
    with open(stats_path) as handle:
        for row in handle:
            if "Total reads:" in row:
                total = int(row.strip("\n").split()[-1])
                break
    return total
    
# Per-sample counts at each processing stage: one column per stage, filled by
# calling the matching reader on every sample in order.
stage_readers = [
    ("RawRead", get_raw_read),
    ("CCS", get_ccs_read),
    ("Lima", get_lima_read),
    ("FLNC", get_flnc_read),
    ("FLNC_polyA", get_flnc_polya_read),
    ("Cluster", get_cluster),
    ("Aligned", get_aligned),
]
dat = pd.DataFrame(index=pd.Index(samples, name="Sample"))
for column, reader in stage_readers:
    dat[column] = [reader(sample_name) for sample_name in samples]
dat

Unnamed: 0_level_0,RawRead,CCS,Lima,FLNC,FLNC_polyA,Cluster,Aligned
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Adult_Female,5759266,3971860,2767549,2725947,2706469,213849,219085
Adult_Male,5215426,3509670,2364704,2160279,2142476,170298,173656
Juvenile,6432352,4133860,2821785,2691254,2666770,188165,190721


In [359]:
# Write the per-sample summary table to a tab-separated file in the working directory.
dat.to_csv("summary.tsv", sep="\t")

# 2021-03-24 Cluster profile

In [354]:
# For every sample, plot how many clusters contain 0..19 rows of the cluster
# report (one row per read), and save the bar chart as a PNG.
for sample in samples:
    with open("results/isoseq/clustered/%s.cluster_report.csv" % sample) as f:
        next(f, None)  # skip the header row
        # cluster id (first CSV column) -> number of rows carrying that id
        cluster_sizes = Counter(line.strip("\n").split(",")[0] for line in f)
    # cluster size -> number of clusters of that size
    counter = Counter(cluster_sizes.values())

    xs = np.arange(0, 20)
    ys = np.array([counter[x] for x in xs]) / 1000  # y axis is in thousands

    plt.figure(figsize=(3, 3))
    plt.bar(xs, ys, width=0.8, color="dodgerblue")
    plt.xlim(0, 20)
    plt.xlabel("FLNC per cluster")
    plt.ylim(0, 120)
    plt.ylabel("Number (x10$^3$)")
    for side in ("top", "right"):
        plt.gca().spines[side].set_visible(False)
    plt.tight_layout()
    plt.savefig("figures/cluster.profile.%s.png" % sample, dpi=300)
    plt.close()

# 2021-03-24 Plot length distribution

In [4]:
def _read_polymerase_lengths(path):
    """Parse a two-column TSV (name, length) with one header row into {name: int(length)}."""
    lengths = dict()
    with open(path) as f:
        next(f, None)  # header row
        for line in f:
            name, length = line.strip("\n").split("\t")
            lengths[name] = int(length)
    return lengths


def _read_subread_lengths(path):
    """Parse a header-less three-column TSV into {key: [int(length), ...]}.

    The key is the second "/"-separated field of column 1 (presumably the ZMW
    id - confirm against the file format); column 2 is the length; column 3 is
    ignored.
    """
    by_zmw = defaultdict(list)
    with open(path) as f:
        for line in f:
            read_name, length, _ = line.strip("\n").split("\t")
            by_zmw[read_name.split("/")[1]].append(int(length))
    return by_zmw


def _grid_peak(xs, ys, shape):
    """Return the largest cell count after binning (x, y) pairs on a 0.01-unit grid of `shape`."""
    n_rows, n_cols = shape
    grid = np.zeros(shape)
    for x, y in zip(xs, ys):
        i = int(x * 100)
        j = int(y * 100)
        if i < n_rows and j < n_cols:
            grid[i][j] += 1
    return grid.max()


def _hide_top_right_spines():
    """Hide the top and right spines of the current axes."""
    for side in ("top", "right"):
        plt.gca().spines[side].set_visible(False)


def _plot_density(xs, ys, grid_shape, xlabel, ylabel, y_is_count, outfile):
    """Save a log-coloured 2D histogram; x is log10(length), y is log10(length) or log2(count)."""
    # The colour scale tops out at the peak of the manual 0.01-unit grid.
    peak = _grid_peak(xs, ys, grid_shape)
    plt.figure(figsize=(3, 3))
    plt.hist2d(xs, ys, bins=grid_shape, cmap="Reds", norm=colors.LogNorm(1, peak))
    plt.xlim(0, 6)
    plt.xlabel(xlabel)
    decades = [0, 1, 2, 3, 4, 5, 6]
    plt.xticks(decades, ["10$^{%d}$" % d for d in decades])
    if y_is_count:
        plt.ylim(-1, 10)
        powers = np.arange(0, 11, 2)
        plt.yticks(powers, [2 ** p for p in powers])
    else:
        plt.ylim(0, 6)
        plt.yticks(decades, ["10$^{%d}$" % d for d in decades])
    plt.ylabel(ylabel)
    plt.tight_layout()
    plt.savefig(outfile, dpi=300)
    plt.close()


def _plot_length_histogram(lengths, bin_width, text_x, text_ys, xtick_labels,
                           xlabel, y_ticks, outfile):
    """Save a 100-bin length histogram (counts in thousands) annotated with bin width, mean and median."""
    counts = np.zeros(100)
    for length in lengths:
        i = int(length / bin_width)
        if i < 100:  # lengths beyond the last bin are dropped from the plot
            counts[i] += 1
    counts = counts / 1000
    mean_length = np.mean(lengths)
    median_length = np.median(lengths)

    plt.figure(figsize=(3, 3))
    plt.bar(np.arange(100) + 0.5, counts, width=1, color="dodgerblue")
    plt.text(text_x, text_ys[0], "width = %dbp" % bin_width)
    plt.text(text_x, text_ys[1], "mean = %.2fkb" % (mean_length / 1000))
    plt.text(text_x, text_ys[2], "median = %.2fkb" % (median_length / 1000))
    plt.xlim(0, 100)
    ticks = [0, 20, 40, 60, 80, 100]
    if xtick_labels is None:
        plt.xticks(ticks)
    else:
        plt.xticks(ticks, xtick_labels)
    plt.xlabel(xlabel)
    plt.ylim(0, y_ticks[-1])
    plt.yticks(y_ticks)
    plt.ylabel("Number (x10$^3$)")
    _hide_top_right_spines()
    plt.tight_layout()
    plt.savefig(outfile, dpi=300)
    plt.close()


def _plot_subread_number(subread_counts, outfile):
    """Save a bar chart of how many entries have 0..19 subreads (counts in millions)."""
    counts = np.zeros(20)
    for n in subread_counts:
        if n < 20:
            counts[n] += 1
    counts = counts / 1000000
    plt.figure(figsize=(3, 3))
    plt.bar(np.arange(20), counts, width=0.8, color="dodgerblue")
    plt.xlim(0, 20)
    plt.xlabel("Subread number")
    plt.ylim(0, 1.6)
    plt.yticks([0, 0.4, 0.8, 1.2, 1.6])
    plt.ylabel("Number (x10$^6$)")
    _hide_top_right_spines()
    plt.tight_layout()
    plt.savefig(outfile, dpi=300)
    plt.close()


def run(path1, path2, prefix):
    """Plot polymerase-read / subread length statistics and save six PNG figures.

    Parameters
    ----------
    path1 : str
        Tab-separated file with one header row and two columns (name, length).
    path2 : str
        Tab-separated file without header and three columns; the second
        "/"-separated field of column 1 must be a name present in `path1`,
        column 2 is an integer length, column 3 is ignored.
    prefix : str
        Output path prefix; figures are written to ``prefix + ".1.png"``
        through ``prefix + ".6.png"``.

    Raises
    ------
    KeyError
        If a name from `path2` is missing from `path1`.
    ValueError
        If a line does not have the expected number of columns or a length is
        not an integer.
    """
    polymerase_lengths = _read_polymerase_lengths(path1)
    subreads_by_zmw = _read_subread_lengths(path2)

    # One row per name: [polymerase length, median subread length, subread count]
    matrix = []
    for zmw, values in subreads_by_zmw.items():
        matrix.append([polymerase_lengths[zmw], np.median(values), len(values)])

    polymerase = [row[0] for row in matrix]
    subread_median = [row[1] for row in matrix]
    subread_count = [row[2] for row in matrix]

    vs1 = np.log10(polymerase)      # polymerase
    vs2 = np.log10(subread_median)  # subreads
    vs3 = np.log2(subread_count)    # subread count

    _plot_density(vs1, vs2, (600, 600), "Polymerase length", "Subread length",
                  False, prefix + ".1.png")
    _plot_density(vs1, vs3, (600, 1000), "Polymerase length", "Subread count",
                  True, prefix + ".2.png")
    # NOTE(review): "(median)" is on the count-axis label although the median is
    # taken over subread lengths (the x axis here) - confirm the intended label.
    _plot_density(vs2, vs3, (600, 1000), "Subread length", "Subread count (median)",
                  True, prefix + ".3.png")

    # 1000 bp bins; tick positions are already in kb, so default tick labels are used.
    _plot_length_histogram(polymerase, 1000, 20, (180, 160, 140), None,
                           "Polymerase length (kb)", [0, 50, 100, 150, 200],
                           prefix + ".4.png")
    # 100 bp bins; tick positions are bin indices, relabelled in kb.
    _plot_length_histogram(subread_median, 100, 10, (380, 345, 310),
                           [0, 2, 4, 6, 8, 10],
                           "Subread length (kb)", [0, 100, 200, 300, 400],
                           prefix + ".5.png")

    _plot_subread_number(subread_count, prefix + ".6.png")

In [5]:
# Length-distribution figures for the adult female library.
library = "GTS-E-14-1A"
dataset_dir = "data/datasets/F20FTSCCKF1558_CIYgikE/" + library
path1 = "%s/%s.polymerreads_dis.xls" % (dataset_dir, library)
path2 = "%s/%s_subreads_dis.xls" % (dataset_dir, library)
prefix = "figures/length.dis.adult.female"
run(path1, path2, prefix)

In [7]:
# Length-distribution figures for the adult male library.
library = "WHPBCDNAPEP00000149"
dataset_dir = "data/datasets/F20FTSCCKF0616_SERmjgE_SERsukE/" + library
path1 = "%s/%s.polymerreads_dis.xls" % (dataset_dir, library)
path2 = "%s/%s_subreads_dis.xls" % (dataset_dir, library)
prefix = "figures/length.dis.adult.male"
run(path1, path2, prefix)

In [8]:
# Length-distribution figures for the juvenile library.
library = "WHPBCDNAPEP00000150"
dataset_dir = "data/datasets/F20FTSCCKF0616_SERdweE_SERvxlE/" + library
path1 = "%s/%s.polymerreads_dis.xls" % (dataset_dir, library)
path2 = "%s/%s_subreads_dis.xls" % (dataset_dir, library)
prefix = "figures/length.dis.juvenile"
run(path1, path2, prefix)

In [11]:
# 2021-03-24 Transcripts per gene (reference annotation vs. cupcake chained GTF)

In [16]:
from pyBioInfo.IO.File import GtfFile, GtfTranscriptBuilder, GtfGeneBuilder

In [36]:
# Mean number of transcripts per gene in the reference annotation.
path = "data/genome/annotation.gtf"
with GtfFile(path) as f:
    records = [record for record in f]
genes = list(GtfGeneBuilder(records))
transcript_counts = [len(gene.transcripts) for gene in genes]
counter = Counter(transcript_counts)  # transcripts-per-gene -> number of genes
print(np.mean(transcript_counts))

1.458123399686132


In [34]:
# Distribution of transcripts per gene in the chained cupcake GTF:
# print "<k> <number of genes with k transcripts>" for k = 0..49.
path = "results/cupcake/chain/all_samples.chained.gtf.gz"
with GtfFile(path) as f:
    records = [record for record in f]
genes = list(GtfGeneBuilder(records))
counter = Counter(len(gene.transcripts) for gene in genes)
for n_transcripts in range(50):
    print(n_transcripts, counter[n_transcripts])

0 0
1 12789
2 5582
3 3440
4 2372
5 1747
6 1427
7 1158
8 953
9 762
10 652
11 557
12 540
13 429
14 371
15 319
16 297
17 264
18 210
19 209
20 187
21 160
22 149
23 145
24 121
25 108
26 82
27 99
28 85
29 79
30 83
31 64
32 59
33 57
34 41
35 54
36 51
37 34
38 37
39 45
40 43
41 29
42 25
43 24
44 32
45 34
46 26
47 14
48 28
49 15


In [35]:
# Mean transcripts per gene; `genes` is whatever the most recently executed
# loading cell bound, so the result depends on cell execution order.
print(np.mean([len(gene.transcripts) for gene in genes]))

5.948429815644147


# 2021-03-24 CCS and lima report

In [315]:
# Print the value part of every "key : value" line in each sample's CCS report
# (lines without a colon or with an empty value are skipped).
for sample in ["Adult_Female", "Adult_Male", "Juvenile"]:
    path = "results/isoseq/ccs/%s.ccs_report.txt" % sample
    print(path)
    with open(path) as f:
        for line in f:
            # partition() splits on the first colon only, so a value that itself
            # contains ":" is kept whole instead of raising ValueError on unpacking.
            _, sep, value = line.strip("\n").partition(":")
            if not sep:
                continue
            value = value.strip()
            if value == "":
                continue
            print(value)

results/isoseq/ccs/Adult_Female.ccs_report.txt
5759266
3971860 (68.96%)
1787406 (31.04%)
0 (0.00%)
10137 (0.18%)
107556 (6.02%)
0 (0.00%)
1426910 (79.83%)
16073 (0.90%)
1382 (0.08%)
184282 (10.31%)
1 (0.00%)
51179 (2.86%)
926 (0.05%)
23 (0.00%)
0 (0.00%)
0 (0.00%)
0 (0.00%)
0 (0.00%)
0 (0.00%)
results/isoseq/ccs/Adult_Male.ccs_report.txt
5215426
3509670 (67.29%)
1705756 (32.71%)
0 (0.00%)
11358 (0.22%)
184132 (10.79%)
0 (0.00%)
1310618 (76.84%)
14550 (0.85%)
1115 (0.07%)
156561 (9.18%)
0 (0.00%)
38760 (2.27%)
512 (0.03%)
20 (0.00%)
0 (0.00%)
0 (0.00%)
0 (0.00%)
0 (0.00%)
0 (0.00%)
results/isoseq/ccs/Juvenile.ccs_report.txt
6432352
4133860 (64.27%)
2298492 (35.73%)
0 (0.00%)
11022 (0.17%)
191504 (8.33%)
0 (0.00%)
1772906 (77.13%)
22851 (0.99%)
1758 (0.08%)
237562 (10.34%)
2 (0.00%)
71812 (3.12%)
937 (0.04%)
97 (0.00%)
0 (0.00%)
0 (0.00%)
0 (0.00%)
0 (0.00%)
0 (0.00%)


In [318]:
# Print the value part of every "key : value" line in each sample's lima summary
# (lines without a colon or with an empty value are skipped).
for sample in ["Adult_Female", "Adult_Male", "Juvenile"]:
    path = "results/isoseq/demux/%s.lima.summary" % sample
    print(path)
    with open(path) as f:
        for line in f:
            # partition() splits on the first colon only, so a value that itself
            # contains ":" is kept whole instead of raising ValueError on unpacking.
            _, sep, value = line.strip("\n").partition(":")
            if not sep:
                continue
            value = value.strip()
            if value == "":
                continue
            print(value)

results/isoseq/demux/Adult_Female.lima.summary
3971860
2767549 (70%)
1204311 (30%)
68 (0%)
0 (0%)
219872 (18%)
61 (0%)
0 (0%)
259986 (22%)
61 (0%)
382455 (32%)
662511 (55%)
61 (0%)
2767549 (100%)
0%
3971799 (100%)
3971799 (100%)
2767549 (100%)
0 (0%)
results/isoseq/demux/Adult_Male.lima.summary
3509670
2364704 (67%)
1144966 (33%)
155 (0%)
0 (0%)
202707 (18%)
120 (0%)
0 (0%)
226676 (20%)
120 (0%)
395688 (35%)
609201 (53%)
120 (0%)
2364704 (100%)
0%
3509550 (100%)
3509550 (100%)
2364704 (100%)
0 (0%)
results/isoseq/demux/Juvenile.lima.summary
4133860
2821785 (68%)
1312075 (32%)
143 (0%)
0 (0%)
300944 (23%)
128 (0%)
0 (0%)
324391 (25%)
128 (0%)
460864 (35%)
641491 (49%)
128 (0%)
2821785 (100%)
0%
4133732 (100%)
4133732 (100%)
2821785 (100%)
0 (0%)


# Overlap between reference (NCBI) transcripts and collapsed Iso-Seq transcripts (Adult_Male)

In [2]:
# BED inputs compared in the cells below: path1 is the genome transcript BED,
# path2 the collapsed Iso-Seq BED for the Adult_Male sample.
path1 = "data/genome/transcript.bed"
path2 = "results/isoseq/collapsed/Adult_Male.bed.gz"

In [3]:
# Load every record of the first BED file (path1) into memory.
with BedFile(path1) as bed:
    records1 = [record for record in bed]

In [4]:
# Load every record of the second BED file (path2) into memory.
with BedFile(path2) as bed:
    records2 = [record for record in bed]

In [6]:
# Number of records loaded from path1.
len(records1)

35307

In [7]:
# Number of records loaded from path2.
len(records2)

116445

In [10]:
# For each record in records1, check whether ShiftLoader.fetch returns at least
# one records2 entry on the same strand; m counts hits, n counts misses.
loader = ShiftLoader(records2)
m, n = 0, 0
for record1 in records1:
    same_strand_hit = any(
        record2.strand == record1.strand for record2 in loader.fetch(obj=record1)
    )
    if same_strand_hit:
        m += 1
    else:
        n += 1
print(m, n)

23372 11935


In [11]:
# Reverse direction: for each record in records2, look for a same-strand
# records1 entry among those returned by ShiftLoader.fetch.
loader = ShiftLoader(records1)
m, n = 0, 0
for record2 in records2:
    for record1 in loader.fetch(obj=record2):
        if record1.strand == record2.strand:
            m += 1
            break
    else:
        # the inner loop finished without a same-strand match
        n += 1
print(m, n)

112799 3646


In [12]:
import matplotlib.pyplot as plt
from matplotlib_venn import *
from matplotlib import rcParams
rcParams["font.family"] = "arial"

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [17]:
# Draw a two-set Venn diagram labelled "NCBI" / "ISO-seq" and save it as venn2.pdf.
# NOTE(review): the subset sizes are the literal [1, 1, 1], not the overlap
# counts printed by the cells above - looks like a placeholder; confirm.
plt.figure(figsize=(3, 3))
venn2([1, 1, 1], set_labels=["NCBI", "ISO-seq"])
plt.tight_layout()
# plt.show()
plt.savefig("venn2.pdf", dpi=300)
plt.close()

In [19]:
# Keep the line at 0-based index 5 of the gzipped FASTQ in s1
# (s1 is left unset if the file has fewer than six lines).
with gzip.open("results/prepare/fastq/Juvenile.fastq.gz", "rt") as f:
    for line_index, raw_line in enumerate(f):
        if line_index < 5:
            continue
        s1 = raw_line.strip("\n")
        break

In [21]:
# Keep the line at 0-based index 5 of the plain-text FASTQ in s2
# (s2 is left unset if the file has fewer than six lines).
with open("Juvenile/read-long/Juvenile.fq") as f:
    for line_index, raw_line in enumerate(f):
        if line_index != 5:
            continue
        s2 = raw_line.strip("\n")
        break

# 2021-03-25 Cupcake report

In [15]:
# data[sample][stage] -> list of genes built from that sample's collapsed GTF,
# where stage is "raw" or "filtered".
gtf_templates = [
    ("raw", "results/cupcake/collapsed/%s/out.collapsed.sorted.gtf.gz"),
    ("filtered", "results/cupcake/collapsed/%s/out.collapsed.filtered.sorted.gtf.gz"),
]
data = dict()
for sample in samples:
    data[sample] = dict()
    for stage, template in gtf_templates:
        path = template % sample
        with GtfFile(path) as f:
            records = [record for record in f]
        genes = list(GtfGeneBuilder(records))
        data[sample][stage] = genes

In [18]:
# One row per sample: gene count, transcript count, exon (block) count and the
# mean/median transcripts per gene - first for the raw set, then for the filtered set.
rows = []
for sample in samples:
    values = []
    for stage in ("raw", "filtered"):
        genes = data[sample][stage]
        counts = [len(g.transcripts) for g in genes]
        exon_count = sum(len(t.blocks) for g in genes for t in g.transcripts)
        values.extend([
            len(genes),
            sum(counts),
            exon_count,
            np.mean(counts),
            np.median(counts),
        ])
    rows.append(values)

In [22]:
# Column order mirrors how each row was filled: the five raw metrics first,
# then the same five metrics for the filtered set.
metrics = ["Gene", "Transcript", "Exon", "TranNum_Mean", "TranNum_Median"]
columns = ["%s_%s" % (metric, stage) for stage in ("Raw", "Filtered") for metric in metrics]
dat = pd.DataFrame(rows, index=pd.Index(samples, name="Sample"), columns=columns)
dat.to_csv("cupcake.report.tsv", sep="\t")

In [26]:
# Percentage of raw transcripts that remain in the filtered set, per sample.
dat["Transcript_Filtered"] / dat["Transcript_Raw"] * 100

Sample
Adult_Female    74.833222
Adult_Male      75.626835
Juvenile        69.308592
dtype: float64

In [47]:
# Grouped bar chart: percentage of genes with k transcripts in the filtered set,
# one bar group per k and one bar per sample. The legend shows each sample's
# mean transcripts per gene.
plt.figure(figsize=(6, 3))
for i, sample in enumerate(samples):
    counts = [len(g.transcripts) for g in data[sample]["filtered"]]
    counter = Counter(counts)
    # Bins 0..9 hold exact transcript counts; bin 10 collects every gene
    # with 10 or more transcripts.
    ys = [counter[x] for x in range(10)]
    ys.append(len(counts) - sum(ys))
    xs = np.arange(len(ys))
    ys = np.array(ys) / len(counts) * 100
    # Shift each sample's bars by 0.2 so the groups sit side by side.
    plt.bar(xs + (i - 1) * 0.2, ys, width=0.2, label=sample + " (%.2f)" % np.mean(counts))
plt.xlim(0.5, 10.5)
ticks = np.arange(1, 11)
# The last bar is an overflow bin, so label it ">=10" rather than "10".
plt.xticks(ticks, [str(t) for t in ticks[:-1]] + ["$\\geq$10"])
plt.xlabel("transcripts per gene")
plt.ylim(0, 50)
plt.ylabel("Percentage (%)")
plt.legend()
plt.tight_layout()
# plt.show()
plt.savefig("figures/cupcake.summary.png", dpi=300)
plt.close()