In [3]:
import os
import subprocess
import glob
import pandas as pd

In [4]:
# Create the output directory that the FASTQ symlinks are written into
# (`-p` also creates the parent and is a no-op when the directory exists).
! mkdir -p upload/fastqs

In [11]:
# Make symlinks for HG001
#
# Every row of the HG001 QC-pass table gets a relative symlink in
# upload/fastqs/ named NanoStrandseq_GM12878_sc<NNNN>.fastq.gz, numbered from 1
# in table order.

d = pd.read_csv("reports/NanoStrandSeq_HG001_QC_Pass.csv")
for cell_index, (run, cell) in enumerate(d[["Run", "Cell"]].values, start=1):
    fastq_path = "results/demux/trimmed/%s/%s/trimmed.fastq.gz" % (run, cell)
    if not os.path.exists(fastq_path):
        # Explicit error instead of a bare assert so the missing file is named.
        raise FileNotFoundError("trimmed FASTQ not found: %s" % fastq_path)
    link_path = "upload/fastqs/NanoStrandseq_GM12878_sc%04d.fastq.gz" % cell_index
    # os.symlink is used instead of `! ln -f -s` so that run/cell names never
    # pass through a shell and a failed link raises instead of being ignored.
    if os.path.lexists(link_path):
        os.remove(link_path)  # replace an existing link, like `ln -f`
    # The link sits two directories below the working directory, hence "../../".
    os.symlink("../../" + fastq_path, link_path)

In [12]:
# Make symlinks for B6D2F1
#
# Every row of the C57DBA QC-pass table gets a relative symlink in
# upload/fastqs/ named NanoStrandseq_C57_6J_DBA_2N_<CellLine>_sc<NNNN>.fastq.gz.
# "MEF" cells are numbered with their own counter; every other cell line
# shares a second counter. Both counters start at 1.

d = pd.read_csv("reports/NanoStrandSeq_C57DBA_QC_Pass.csv")
mef_count = 0
other_count = 0
for run, cell, cellline in d[["Run", "Cell", "CellLine"]].values:
    cellline = cellline.strip()  # drop stray whitespace around the table value
    if cellline == "MEF":
        mef_count += 1
        cell_index = mef_count
    else:
        other_count += 1
        cell_index = other_count
    fastq_path = "results/demux/trimmed/%s/%s/trimmed.fastq.gz" % (run, cell)
    if not os.path.exists(fastq_path):
        # Explicit error instead of a bare assert so the missing file is named.
        raise FileNotFoundError("trimmed FASTQ not found: %s" % fastq_path)
    link_path = "upload/fastqs/NanoStrandseq_C57_6J_DBA_2N_%s_sc%04d.fastq.gz" % (cellline, cell_index)
    # os.symlink is used instead of `! ln -f -s` so that table values never
    # pass through a shell and a failed link raises instead of being ignored.
    if os.path.lexists(link_path):
        os.remove(link_path)  # replace an existing link, like `ln -f`
    # The link sits two directories below the working directory, hence "../../".
    os.symlink("../../" + fastq_path, link_path)

In [15]:
# Make metadata spreadsheet
#
# Builds one row per FASTQ link in upload/fastqs/ and writes the table to
# upload/sra_metadata.csv. The BioSample is chosen from a keyword in the
# link path; a file matching no keyword is an error rather than being
# silently assigned to a sample.

FASTQ_SUFFIX = ".fastq.gz"

# (keyword looked up in the path, biosample, sample_name, title fragment),
# checked in this order.
SAMPLE_INFO = [
    ("GM12878", "SAMN34371648", "Human_GM12878", "GM12878"),
    ("epithelium", "SAMN34371647", "Mouse_Epithelium", "mouse epithelium"),
    ("MEF", "SAMN34371646", "Mouse_MEF", "mouse embryonic fibroblasts"),
]

# Columns that hold the same value for every library, in output order.
CONSTANT_FIELDS = {
    "LibraryStrategy": "WGS",
    "LibrarySource": "GENOMIC SINGLE CELL",
    "LibrarySelection": "PCR",
    "LibraryLayout": "SINGLE",
    "Platform": "OXFORD_NANOPORE",
    "InstrumentModel": "PromethION",
    "DesignDescription": "single cell template strand sequencing based on Nanopore platform",
    "FileType": "fastq",
}

rows = []
for path in sorted(glob.glob("upload/fastqs/*.fastq.gz")):
    for keyword, sample, sample_name, title in SAMPLE_INFO:
        if keyword in path:
            break
    else:
        raise ValueError("cannot determine the sample for %s" % path)
    title = "NanoStrand-seq for %s" % title
    filename = os.path.basename(path)
    # The library ID is the file name without its ".fastq.gz" suffix.
    lib_id = filename[:-len(FASTQ_SUFFIX)]
    rows.append([sample, sample_name, lib_id, title, filename])

if not rows:
    # Fail with a clear message instead of a column-length mismatch later on.
    raise FileNotFoundError("no files match upload/fastqs/*.fastq.gz")

dat = pd.DataFrame(
    rows,
    columns=["biosample", "sample_name", "library_ID", "title", "filename"],
)
for column, value in CONSTANT_FIELDS.items():
    dat[column] = value
dat.to_csv("upload/sra_metadata.csv", index=False)