In [28]:
from collections import defaultdict
import numpy as np
import pandas as pd
from pyBioInfo.IO.File import GtfFile, GtfTranscriptBuilder

# Sample sheet of the RNA-seq experiment: ordered by tissue, keeping only the
# rows that carry a replicate number.
info = (
    pd.read_excel("../../1_analysis_of_rna_seq/RNAseq.xls")
    .sort_values(by="Tissue")
    .loc[lambda sheet: sheet["Replicate"].notna()]
)

# Summary of RNA-seq

In [23]:
# For every tissue, walk the sex x replicate grid and pull two fields out of
# the sample's STAR "Log.final.out": the last token of the
# "Number of input reads" line and of the "Uniquely mapped reads %" line.
# A grid cell that does not match exactly one row of `info` keeps the 0 / 0
# placeholders.
tissues = sorted(set(info["Tissue"]))
rows = []
for tissue in tissues:
    in_tissue = info["Tissue"] == tissue
    row = []
    for sex in ("Mixture", "Male", "Female"):
        for rep in (1, 2):
            matched = info[in_tissue & (info["Sex"] == sex) & (info["Replicate"] == rep)]
            reads, mapped = 0, 0
            if len(matched) == 1:
                sample = matched["Sample"].values[0]
                path = "../../1_analysis_of_rna_seq/results/denovo_mapping/star/mapped.2nd/%s/Log.final.out" % sample
                with open(path) as handle:
                    for line in handle:
                        # Values stay as the raw text found in the log.
                        if "Number of input reads" in line:
                            reads = line.strip("\n").split()[-1]
                        if "Uniquely mapped reads %" in line:
                            mapped = line.strip("\n").split()[-1]
            row += [reads, mapped]
    rows.append(row)

In [24]:
# Column labels follow the order in which each row was filled: the first four
# columns hold the "Mixture" values, then "Male", then "Female"; within each
# group replicate 1 precedes replicate 2 and reads precede the mapped rate.
summary_columns = [
    "Ju_Rep1_Reads", "Ju_Rep1_Mapped", "Ju_Rep2_Reads", "Ju_Rep2_Mapped",
    "Ma_Rep1_Reads", "Ma_Rep1_Mapped", "Ma_Rep2_Reads", "Ma_Rep2_Mapped",
    "Fe_Rep1_Reads", "Fe_Rep1_Mapped", "Fe_Rep2_Reads", "Fe_Rep2_Mapped",
]

# One row per tissue, then export the table to Excel.
dat = pd.DataFrame(rows, index=tissues).rename_axis("Tissue")
dat.columns = summary_columns
dat.to_excel("results/summary_of_rna_seq.xlsx")

# Summary of Iso-seq (TODO)

# Summary of annotations

In [25]:
# GTF annotation files to be summarised; each path is passed to
# load_gtf_summary together with a build label.
# NOTE(review): the provenance notes below are inferred from the directory and
# file names only — confirm against the upstream pipeline.
# Presumably the NCBI (RefSeq) annotation of the Sdu_1.0 assembly.
f_gtf_ncbi = "../../common/ncbi_Sdu_1.0/GCF_002260705.1_Sdu_1.0_genomic.clean.sorted.gtf.gz"
# Presumably the Ensembl annotation of the same assembly.
f_gtf_ensembl = "../../common/ensembl_Sdu_1.0/Seriola_dumerili.Sdu_1.0.103.converted.clean.sorted.gtf.gz"
# Presumably the StringTie assembly merged over all RNA-seq samples (labelled "NGS").
f_gtf_ngs = "../../1_analysis_of_rna_seq/results/assembly/stringtie/merged_all_samples.sorted.gtf.gz"
# Presumably the TAMA assembly of the Iso-seq samples (labelled "TGS").
f_gtf_tgs = "../../2_analysis_of_iso_seq/results/assembly/tama/filtered_internal_primer/all_samples.mp4.sorted.gtf.gz"
# Presumably the integrated isoform set (labelled "Assembly").
f_gtf_asm = "../../3_integrate_isoforms/results/assembly/asm.sorted.gtf.gz"

In [None]:
def _ratio(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else np.nan


def _n50(lengths):
    """Return the N50 of ``lengths`` (NaN for an empty collection).

    N50 is the length of the shortest item among the longest items that
    together account for at least half of the summed length, so the running
    total has to be accumulated from the longest item downwards.
    """
    half_total = sum(lengths) / 2
    running = 0
    for length in sorted(lengths, reverse=True):
        running += length
        if running >= half_total:
            return length
    return np.nan


def load_gtf_summary(infile, name):
    """Summarise the transcript models stored in one GTF file.

    Parameters
    ----------
    infile : str
        Path of the GTF file; handed to ``GtfFile``.
    name : str
        Label stored as the ``name`` of the returned Series.

    Returns
    -------
    pandas.Series
        float64 Series named ``name`` with the entries, in this order:
        ``isoforms``, ``genes``, ``isoform_per_gene``,
        ``gene_with_multi_isoform``, ``gene_with_multi_isoform_perc``,
        ``exon_per_isoform``, ``isoform_with_multi_exon``,
        ``isoform_with_multi_exon_perc``, ``mean_length``, ``median_length``
        and ``n50``. Ratios, averages and N50 are NaN when the file yields no
        transcripts.

    Raises
    ------
    ValueError
        If a transcript carries no GTF records, so that its ``gene_id``
        cannot be determined.
    """
    with GtfFile(infile) as f:
        records = list(f)
    transcripts = list(GtfTranscriptBuilder(records))

    # Count isoforms per gene. The gene id is read from the first record of
    # the first record group of each transcript.
    counter = defaultdict(int)
    for t in transcripts:
        first_group = next(iter(t.records.values()), None)
        if not first_group:
            # Fail loudly rather than silently re-using the gene id of the
            # previously visited transcript.
            raise ValueError("transcript without GTF records in %s" % infile)
        counter[first_group[0].attributes["gene_id"]] += 1

    exon_counts = [len(t.blocks) for t in transcripts]
    lengths = [len(t) for t in transcripts]

    n_isoforms = len(transcripts)
    n_genes = len(counter)
    n_multi_isoform_genes = sum(v > 1 for v in counter.values())
    n_multi_exon_isoforms = sum(v > 1 for v in exon_counts)

    stats = {
        "isoforms": n_isoforms,
        "genes": n_genes,
        "isoform_per_gene": _ratio(n_isoforms, n_genes),
        "gene_with_multi_isoform": n_multi_isoform_genes,
        "gene_with_multi_isoform_perc": _ratio(n_multi_isoform_genes, n_genes),
        "exon_per_isoform": np.mean(exon_counts) if exon_counts else np.nan,
        "isoform_with_multi_exon": n_multi_exon_isoforms,
        "isoform_with_multi_exon_perc": _ratio(n_multi_exon_isoforms, n_isoforms),
        "mean_length": np.mean(lengths) if lengths else np.nan,
        "median_length": np.median(lengths) if lengths else np.nan,
        "n50": _n50(lengths),
    }
    # The builtin ``float`` replaces the ``np.float`` alias, which was removed
    # in NumPy 1.24; both denote float64.
    return pd.Series(stats, name=name, dtype=float)

# Summarise every annotation build, in the order they are tabulated later.
s1, s2, s3, s4, s5 = [
    load_gtf_summary(gtf_path, build_label)
    for gtf_path, build_label in (
        (f_gtf_ncbi, "NCBI"),
        (f_gtf_ensembl, "ENSEMBL"),
        (f_gtf_ngs, "NGS"),
        (f_gtf_tgs, "TGS"),
        (f_gtf_asm, "Assembly"),
    )
]

In [31]:
# Stack the per-build Series into one table (one row per build, labelled by
# each Series' name) and display it.
build_summaries = [s1, s2, s3, s4, s5]
dat = pd.DataFrame(build_summaries).rename_axis("Build")
dat

Unnamed: 0_level_0,isoforms,genes,isoform_per_gene,gene_with_multi_isoform,gene_with_multi_isoform_perc,exon_per_isoform,isoform_with_multi_exon,isoform_with_multi_exon_perc,mean_length,median_length,n50
Build,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
NCBI,34971.0,23878.0,1.46457,6121.0,0.256345,11.574705,33051.0,0.945097,2748.624946,2333.0,3537.0
ENSEMBL,33717.0,23808.0,1.416205,6504.0,0.273185,10.531572,32323.0,0.958656,2553.637305,2004.0,3700.0
NGS,91901.0,30549.0,3.008315,14983.0,0.490458,13.11199,88629.0,0.964396,4542.42055,3945.0,6415.0
TGS,130734.0,17527.0,7.459006,11597.0,0.661665,16.861949,126052.0,0.964187,5300.739685,5458.0,5588.0
Assembly,147218.0,27087.0,5.435006,14353.0,0.529885,15.567397,139832.0,0.94983,4971.839395,5381.0,5563.0


In [33]:
# Write the per-build summary table (`dat`, index named "Build") to an Excel workbook.
dat.to_excel("results/summary_of_builds.xlsx")