# dbnascent_initial_build

## This is a script for initially building dbnascent from metadata
### See dbnascent_update for updating existing database

### Load libraries

In [33]:
import os
import re
import shutil
import zipfile as zp
from statistics import median

import numpy as np
import sqlalchemy as sql
import yaml

# from . import utils #(in script)
# from . import orm #(in script)

### Import utilities (in notebook)

In [34]:
# %load utils.py
# utils.py --- Utilities for simplifying database code
#
# Filename: utils.py
# Description: Miscellaneous utilities for simplifying database code
# Author: Zachary Maas <zama8258@colorado.edu> and Lynn Sanford
# Maintainer: Lynn Sanford <lynn.sanford@colorado.edu>
# Created: Mon Jul  1 16:04:05 2019 (-0600)
#

# Commentary:
#
# This module contains a few helpful utility functions and classes for
# reducing the total amount of code needed for the database, since
# there are many areas where the same patterns keep popping up.
#

# Code:

import configparser
import csv
import os

import sqlalchemy as sql
from sqlalchemy.orm import sessionmaker


# Database Connection Handler
class NascentDBConnection:
    """Context manager that opens a SQLAlchemy session on ``db_url``.

    Usage::

        with NascentDBConnection(db_url) as session:
            session.merge(obj)

    Entering the context returns the session. Leaving it commits the
    session when the body finished normally and rolls it back when the
    body raised, then disposes of the engine in either case. Exceptions
    raised inside the ``with`` body are never suppressed.

    Parameters
    ----------
    db_url : str
        Full SQLAlchemy database URL (e.g. ``sqlite:///path/to/file``).
    """

    engine = None
    # Kept for backward compatibility; the factory actually lives in `Session`.
    _Session = None
    Session = None
    session = None

    def __init__(self, db_url):
        self.engine = sql.create_engine(db_url, echo=False)
        self.Session = sessionmaker(bind=self.engine)
        self.session = self.Session()

    def __enter__(self):
        return self.session

    def __exit__(self, exc_type, exc_val, exc_tb):
        try:
            if exc_type is None:
                self.session.commit()
            else:
                # Do not persist a half-finished unit of work.
                self.session.rollback()
        finally:
            # Always release pooled connections, even if commit/rollback fails.
            self.engine.dispose()


# Configuration File Reader
def load_config(filename: str):
    """Read an INI-style configuration file.

    Parameters
    ----------
    filename : str
        Path to the configuration file.

    Returns
    -------
    configparser.ConfigParser
        Parser populated with the contents of ``filename``.

    Raises
    ------
    FileNotFoundError
        If nothing exists at ``filename``.
    """
    if os.path.exists(filename):
        parser = configparser.ConfigParser()
        with open(filename) as handle:
            contents = handle.read()
        parser.read_string(contents)
        return parser

    raise FileNotFoundError(
        "Configuration file does not exist at the provided path"
    )


# Create any missing tables in the database.
# (create_all with checkfirst=True only creates tables that do not exist yet;
# it does not alter tables that are already present.)


def update_tables(db_url: str) -> None:
    """Create the ORM tables in a SQLite database file.

    Parameters
    ----------
    db_url : str
        Despite the name, this is the path of the SQLite file; the
        ``sqlite:///`` scheme is prepended here.

    Tables are created from ``Base.metadata`` with ``checkfirst=True``,
    so tables that already exist are left untouched.
    """
    sqlite_url = "sqlite:///" + db_url
    sqlite_engine = sql.create_engine(sqlite_url, echo=False)
    Base.metadata.create_all(bind=sqlite_engine, checkfirst=True)


# Function for parsing table into list of dicts
def table_parse(table_filepath: str) -> list:
    """Read a tab-delimited table into a list of dicts.

    The first line of the file is used as the header; every following
    line becomes one dict mapping column name to the cell's string value.

    Parameters
    ----------
    table_filepath : str
        Path to the tab-separated file.

    Returns
    -------
    list
        One dict per data row, in file order.

    Raises
    ------
    FileNotFoundError
        If ``table_filepath`` is not an existing regular file.
    """
    # isfile() is False for missing paths as well as for directories
    if not os.path.isfile(table_filepath):
        raise FileNotFoundError(f"{table_filepath} does not exist.")

    with open(table_filepath, newline="") as handle:
        return [dict(row) for row in csv.DictReader(handle, delimiter="\t")]


# Function for grabbing specific keys
def key_grab(table_list, key_list) -> list:
    """Project each table entry onto a fixed list of keys.

    Parameters
    ----------
    table_list : list of dict
        Table entries, e.g. as returned by ``table_parse``.
    key_list : list
        Keys to extract, in the order the values should appear.

    Returns
    -------
    list
        One inner list per entry holding the values for ``key_list``.

    Raises
    ------
    KeyError
        If an entry lacks one of the requested keys.
    """
    return [[record[key] for key in key_list] for record in table_list]


def get_unique_table(location_key, column_keys) -> list:
    """Return the distinct combinations of selected columns from a table.

    The table path is looked up in the module-level ``config`` under
    ``config["file_locations"][location_key]``.

    Parameters
    ----------
    location_key : str
        Key in the ``file_locations`` config section naming the table file.
    column_keys : list
        Column names to keep.

    Returns
    -------
    list
        One dict per distinct row, mapping each entry of ``column_keys``
        to its value. Rows come back in the sorted order produced by
        ``np.unique`` (not file order) and values are NumPy strings.
    """
    filepath = config["file_locations"][location_key]
    rows = table_parse(filepath)

    selected = np.array(key_grab(rows, column_keys))
    # axis=0 de-duplicates whole rows rather than individual cells
    unique_rows = np.unique(selected, axis=0)

    return [dict(zip(column_keys, row)) for row in unique_rows]


def value_compare(db_row, metatable_row, key_dict) -> bool:
    """Check whether a database row matches a metadata-table row.

    Parameters
    ----------
    db_row : mapping
        Database row keyed by database column name.
    metatable_row : mapping
        Metadata-table row keyed by metadata column name.
    key_dict : mapping
        Maps each database column name to the corresponding metadata
        column name; only these pairs are compared.

    Returns
    -------
    bool
        ``True`` when every mapped pair of values is equal (also when
        ``key_dict`` is empty), ``False`` at the first mismatch.

    Raises
    ------
    KeyError
        If a mapped column is missing from either row.
    """
    return all(
        db_row[db_key] == metatable_row[meta_key]
        for db_key, meta_key in key_dict.items()
    )


def object_as_dict(obj):
    """Return the column attributes of a mapped ORM object as a dict.

    Keys are the attribute keys reported by the object's mapper; values
    are read from ``obj`` with ``getattr``.
    """
    column_attrs = sql.inspect(obj).mapper.column_attrs
    snapshot = {}
    for attr in column_attrs:
        snapshot[attr.key] = getattr(obj, attr.key)
    return snapshot


#
# utils.py ends here

### Import ORM (in notebook)

In [126]:
# %load orm.py
# orm.py --- ORM for DBNascent
#
# Filename: orm.py
# Description: ORM for DBNascent
# Authors: Zach Maas and Lynn Sanford
# Maintainer: Lynn Sanford <lynn.sanford@colorado.edu>
# Created: Mon Jun 10 13:11:55 2019 (-0600)
# URL:
#

# Commentary:
#
# This file contains code for an ORM to interface with the Dowell
# Lab's Nascent Database.
#

# Code:

import sqlalchemy as sql
from sqlalchemy.ext.declarative import declarative_base

# Base class for our ORM
Base = declarative_base()


# MAIN TABLES
class organismInfo(Base):
    """ORM mapping for the ``organismInfo`` table, keyed by organism name."""

    __tablename__ = "organismInfo"
#    metadata = MetaData()
    # Primary key; exptMetadata.organism and geneticInfo.organism reference it.
    organism = sql.Column(
        sql.String(length=127), primary_key=True, index=True, unique=True
    )
    genome_build = sql.Column(sql.String(length=50))
    genome_bases = sql.Column(sql.Integer)

class searchEq(Base):
    """ORM mapping for the ``searchEq`` table: search term -> database term."""

    __tablename__ = "searchEq"
#    metadata = MetaData()
    # Primary key: each search term maps to exactly one db_term.
    search_term = sql.Column(
        sql.String(length=250), primary_key=True, index=True, unique=True
    )
    db_term = sql.Column(sql.String(length=127))

class exptMetadata(Base):
    """ORM mapping for the ``exptMetadata`` table.

    Rows are identified by the integer primary key ``expt_id`` and link
    to ``organismInfo`` through the ``organism`` foreign key.
    """

    __tablename__ = "exptMetadata"
#    metadata = MetaData()
    # Integer primary key; referenced by linkIDs.expt_id.
    expt_id = sql.Column(sql.Integer,
                         primary_key=True,
                         index=True,
                         unique=True)
    srp = sql.Column(sql.String(length=50))
    protocol = sql.Column(sql.String(length=50))
    organism = sql.Column(
        sql.String(length=127), sql.ForeignKey("organismInfo.organism")
    )
    library = sql.Column(sql.String(length=50))
    spikein = sql.Column(sql.String(length=127))
    paper_id = sql.Column(sql.String(length=127))
    published = sql.Column(sql.Boolean)
    year = sql.Column(sql.Integer)
    first_author = sql.Column(sql.String(length=127))
    last_author = sql.Column(sql.String(length=127))
    doi = sql.Column(sql.String(length=300))
    curator1 = sql.Column(sql.String(length=50))
    curator2 = sql.Column(sql.String(length=50))
    # Boolean flags below are filled from 0/1 values in the metadata table.
    other_sr_data = sql.Column(sql.Boolean)
    atac_seq = sql.Column(sql.Boolean)
    rna_seq = sql.Column(sql.Boolean)
    chip_seq = sql.Column(sql.Boolean)
    three_dim_seq = sql.Column(sql.Boolean)
    other_seq = sql.Column(sql.Boolean)
    paper_qc_score = sql.Column(sql.Float)
    paper_data_score = sql.Column(sql.Float)


class sampleID(Base):
    """ORM mapping for the ``sampleID`` table: one row per SRR accession."""

    __tablename__ = "sampleID"
#    metadata = MetaData()
    # Primary key is the SRR accession, not sample_id.
    srr = sql.Column(sql.String(length=50),
                     primary_key=True,
                     index=True,
                     unique=True)
    sample_name = sql.Column(sql.String(length=50))
    # NOTE(review): sample_id is the target of foreign keys in several other
    # tables but is neither a key nor declared unique here, and the build
    # script gives SRRs with the same sample_name the same sample_id --
    # confirm this is intended.
    sample_id = sql.Column(sql.Integer)


class geneticInfo(Base):
    """ORM mapping for the ``geneticInfo`` table.

    Rows are identified by the integer primary key ``genetic_id`` and
    link to ``organismInfo`` through the ``organism`` foreign key.
    """

    __tablename__ = "geneticInfo"
#    metadata = MetaData()
    # Integer primary key; referenced by linkIDs.genetic_id.
    genetic_id = sql.Column(sql.Integer,
                            primary_key=True,
                            index=True,
                            unique=True)
    organism = sql.Column(
        sql.String(length=127), sql.ForeignKey("organismInfo.organism")
    )
    sample_type = sql.Column(sql.String(length=127))
    cell_type = sql.Column(sql.String(length=127))
    clone_individual = sql.Column(sql.String(length=127))
    strain = sql.Column(sql.String(length=127))
    genotype = sql.Column(sql.String(length=127))
    construct = sql.Column(sql.String(length=127))


class conditionInfo(Base):
    """ORM mapping for the ``conditionInfo`` table.

    Rows are identified by the integer primary key ``condition_id``,
    which the ``exptCondition`` association table references.
    """

    __tablename__ = "conditionInfo"
#    metadata = MetaData()
    condition_id = sql.Column(sql.Integer,
                              primary_key=True,
                              index=True,
                              unique=True)
    condition_type = sql.Column(sql.String(length=127))
    treatment = sql.Column(sql.String(length=127))
    conc_intens = sql.Column(sql.String(length=50))
    # NOTE(review): the build script passes start_time/end_time as strings
    # read from the conditions table (possibly empty) into these Integer
    # columns -- confirm the backend tolerates that.
    start_time = sql.Column(sql.Integer)
    end_time = sql.Column(sql.Integer)
    duration = sql.Column(sql.Integer)
    # time_unit applies to start_time/end_time; duration_unit to duration.
    time_unit = sql.Column(sql.String(length=50))
    duration_unit = sql.Column(sql.String(length=50))


# Association table pairing sampleID.sample_id with
# conditionInfo.condition_id. It declares no primary key or uniqueness
# constraint, so inserting the same pair twice creates duplicate rows.
exptCondition = sql.Table(
    "exptCondition",
    Base.metadata,
    sql.Column("sample_id",
               sql.Integer,
               sql.ForeignKey("sampleID.sample_id")),
    sql.Column("condition_id",
               sql.Integer,
               sql.ForeignKey("conditionInfo.condition_id")),
)


class linkIDs(Base):
    """ORM mapping for the ``linkIDs`` table.

    Keyed by ``sample_id``; each row ties that sample to one
    ``geneticInfo`` row and one ``exptMetadata`` row.
    """

    __tablename__ = "linkIDs"
#    metadata = MetaData()
    # Primary key and foreign key to sampleID.sample_id at the same time.
    sample_id = sql.Column(
        sql.Integer,
        sql.ForeignKey("sampleID.sample_id"),
        primary_key=True,
        index=True,
        unique=True,
    )
    genetic_id = sql.Column(sql.Integer,
                            sql.ForeignKey("geneticInfo.genetic_id"))
    expt_id = sql.Column(sql.Integer,
                         sql.ForeignKey("exptMetadata.expt_id"))


class sampleAccum(Base):
    """ORM mapping for the ``sampleAccum`` table.

    Keyed by ``sample_id`` (also a foreign key to ``sampleID.sample_id``);
    holds one row of per-sample values.
    """

    __tablename__ = "sampleAccum"
#    metadata = MetaData()
    sample_id = sql.Column(
        sql.Integer,
        sql.ForeignKey("sampleID.sample_id"),
        primary_key=True,
        index=True,
        unique=True,
    )
    replicate = sql.Column(sql.Integer)
    single_paired = sql.Column(sql.String(length=50))
    rcomp = sql.Column(sql.Boolean)
    expt_unusable = sql.Column(sql.Boolean)
    timecourse = sql.Column(sql.Boolean)
    baseline_control_expt = sql.Column(sql.String(length=50))
    notes = sql.Column(sql.String(length=300))
    # raw_read_depth, trim_read_depth and raw_read_length are scraped from
    # FastQC output by the build script.
    raw_read_depth = sql.Column(sql.Integer)
    trim_read_depth = sql.Column(sql.Integer)
    raw_read_length = sql.Column(sql.Integer)
    duplication_picard = sql.Column(sql.Float)
    single_map = sql.Column(sql.Integer)
    multi_map = sql.Column(sql.Integer)
    map_prop = sql.Column(sql.Float)
    rseqc_tags = sql.Column(sql.Integer)
    rseqc_cds = sql.Column(sql.Integer)
    rseqc_five_utr = sql.Column(sql.Integer)
    rseqc_three_utr = sql.Column(sql.Integer)
    rseqc_intron = sql.Column(sql.Integer)
    cds_rpk = sql.Column(sql.Float)
    intron_rpk = sql.Column(sql.Float)
    exint_ratio = sql.Column(sql.Float)
    distinct_tenmillion_prop = sql.Column(sql.Float)
    genome_prop_cov = sql.Column(sql.Float)
    avg_fold_cov = sql.Column(sql.Float)
    samp_qc_score = sql.Column(sql.Integer)
    samp_data_score = sql.Column(sql.Integer)


class nascentflowMetadata(Base):
    """ORM mapping for the ``nascentflowMetadata`` table.

    Rows are identified by ``nascentflow_version_id`` and reference a
    sample through ``sample_id``; the remaining columns are version
    strings and dates.
    """

    __tablename__ = "nascentflowMetadata"
#    metadata = MetaData()
    nascentflow_version_id = sql.Column(
        sql.Integer, primary_key=True, index=True, unique=True
    )
    sample_id = sql.Column(sql.Integer, sql.ForeignKey("sampleID.sample_id"))
    nascentflow_version = sql.Column(sql.String(length=127))
    pipeline_revision_hash = sql.Column(sql.String(length=127))
    pipeline_hash = sql.Column(sql.String(length=127))
    nascentflow_date = sql.Column(sql.Date)
    nascentflow_redo_date = sql.Column(sql.Date)
    nextflow_version = sql.Column(sql.String(length=127))
    fastqc_version = sql.Column(sql.String(length=127))
    bbmap_version = sql.Column(sql.String(length=127))
    hisat2_version = sql.Column(sql.String(length=127))
    samtools_version = sql.Column(sql.String(length=127))
    sratools_version = sql.Column(sql.String(length=127))
    preseq_version = sql.Column(sql.String(length=127))
    preseq_date = sql.Column(sql.Date)
    rseqc_version = sql.Column(sql.String(length=127))
    rseqc_date = sql.Column(sql.Date)
    java_version = sql.Column(sql.String(length=127))
    picard_gc_version = sql.Column(sql.String(length=127))
    picard_dups_version = sql.Column(sql.String(length=127))
    picard_date = sql.Column(sql.Date)
    bedtools_version = sql.Column(sql.String(length=127))
    igvtools_version = sql.Column(sql.String(length=127))
    seqkit_version = sql.Column(sql.String(length=127))
    mpich_version = sql.Column(sql.String(length=127))
    gcc_version = sql.Column(sql.String(length=127))
    python_version = sql.Column(sql.String(length=127))
    numpy_version = sql.Column(sql.String(length=127))


class bidirflowMetadata(Base):
    """ORM mapping for the ``bidirflowMetadata`` table.

    Rows are identified by ``bidirflow_version_id`` and reference a
    sample through ``sample_id``; the remaining columns are version
    strings and dates.
    """

    __tablename__ = "bidirflowMetadata"
#    metadata = MetaData()
    bidirflow_version_id = sql.Column(
        sql.Integer, primary_key=True, index=True, unique=True
    )
    sample_id = sql.Column(sql.Integer, sql.ForeignKey("sampleID.sample_id"))
    bidirflow_version = sql.Column(sql.String(length=127))
    pipeline_revision_hash = sql.Column(sql.String(length=127))
    pipeline_hash = sql.Column(sql.String(length=127))
    bidirflow_date = sql.Column(sql.Date)
    nextflow_version = sql.Column(sql.String(length=127))
    samtools_version = sql.Column(sql.String(length=127))
    bedtools_version = sql.Column(sql.String(length=127))
    mpich_version = sql.Column(sql.String(length=127))
    openmpi_version = sql.Column(sql.String(length=127))
    gcc_version = sql.Column(sql.String(length=127))
    r_version = sql.Column(sql.String(length=127))
    rsubread_version = sql.Column(sql.String(length=127))
    boost_version = sql.Column(sql.String(length=127))
    fstitch_version = sql.Column(sql.String(length=127))
    tfit_version = sql.Column(sql.String(length=127))
    dreg_version = sql.Column(sql.String(length=127))
    dreg_date = sql.Column(sql.Date)
    tfit_date = sql.Column(sql.Date)
    fcgene_date = sql.Column(sql.Date)


# The following were created by Zach and we may or may not use...

# class tf(Base):
#    __tablename__ = "tf"
#    tf_id = sql.Column(sql.String(length=127), primary_key=True)
#    tf_alias = sql.Column(sql.String(length=127))


# class pipeline_status(Base):
#    __tablename__ = "pipeline_status"
#    srr_id = sql.Column(
#        sql.String(length=127),
#        sql.ForeignKey("srr_metadata.srr_id"),
#        primary_key=True,
#    )
#    fastqc_complete = sql.Column(sql.Boolean)
#    bbduk_complete = sql.Column(sql.Boolean)
#    hisat2_complete = sql.Column(sql.Boolean)
#    samtools_complete = sql.Column(sql.Boolean)
#    fastq_dump_complete = sql.Column(sql.Boolean)
#    pileup_complete = sql.Column(sql.String(length=127))
#    preseq_complete = sql.Column(sql.Boolean)
#    rseqc_complete = sql.Column(sql.String(length=127))
#    bedtools_complete = sql.Column(sql.Boolean)
#    igv_tools_complete = sql.Column(sql.Boolean)
#    fstitch_complete = sql.Column(sql.Boolean)
#    tfit_complete = sql.Column(sql.Boolean)


# class md_score(Base):
#    __tablename__ = "md_score"
#    srr_id = sql.Column(
#        sql.String(length=127),
#        sql.ForeignKey("srr_metadata.srr_id"),
#        primary_key=True,
#    )
#    tf_id = sql.Column(sql.String, sql.ForeignKey("tf.tf_id"))
#    erna_type = sql.Column(sql.String(length=127))
#    md_score_expected = sql.Column(sql.Integer)
#    md_score_std = sql.Column(sql.Integer)


# orm.py ends here

### Load config file

In [127]:
# config = utils.load_config("/home/lsanford/Documents/data/repositories/dbnascent_build/config.txt")
# Absolute path of the INI-style build configuration file.
config_path = "/home/lsanford/Documents/data/repositories/DBNascent-build/config.txt"
config = load_config(config_path)

### Create tables in database if they don't exist

In [128]:
# Path of the SQLite database file; update_tables() prepends "sqlite:///".
file_locations = config["file_locations"]
db_url = file_locations["database"]

# In script form: utils.update_tables(db_url)
update_tables(db_url)

### Parse pre-created genome table and load into DB

In [129]:
# Config section mapping organismInfo columns to organism-table column names.
organism_cfg = config["organism keys"]
organism_keys = list(dict(organism_cfg).values())

# In script form: utils.get_unique_table("organism_table", organism_keys)
organism_table = get_unique_table("organism_table", organism_keys)

# In script form: utils.NascentDBConnection(db_url=db_url)
with NascentDBConnection(db_url="sqlite:///" + db_url) as session:
    for organism_row in organism_table:
        entry = organismInfo(
            organism=organism_row[organism_cfg["organism"]],
            genome_build=organism_row[organism_cfg["genome_build"]],
            genome_bases=organism_row[organism_cfg["genome_bases"]],
        )
        # merge() inserts, or updates the row with the same primary key
        session.merge(entry)

### Parse pre-created search equivalencies table and load into DB

In [130]:
# Config section mapping searchEq columns to search-table column names.
searcheq_cfg = config["searcheq keys"]
searcheq_keys = list(dict(searcheq_cfg).values())

# In script form: utils.get_unique_table("searcheq_table", searcheq_keys)
searcheq_table = get_unique_table("searcheq_table", searcheq_keys)

# In script form: utils.NascentDBConnection(db_url=db_url)
with NascentDBConnection(db_url="sqlite:///" + db_url) as session:
    for searcheq_row in searcheq_table:
        entry = searchEq(
            search_term=searcheq_row[searcheq_cfg["search_term"]],
            db_term=searcheq_row[searcheq_cfg["db_term"]],
        )
        session.merge(entry)

### Parse metadata table and load exptMetadata values into DB, creating expt_id

In [131]:
# Config section mapping exptMetadata columns to metadata-table column names.
expt_cfg = config["expt keys"]
expt_keys = list(dict(expt_cfg).values())

# In script form: utils.get_unique_table("metadata", expt_keys)
expt_table = get_unique_table("metadata", expt_keys)

# Columns copied through exactly as read from the table ...
expt_passthrough = (
    "srp",
    "protocol",
    "organism",
    "library",
    "spikein",
    "paper_id",
    "year",
    "first_author",
    "last_author",
    "doi",
    "curator1",
    "curator2",
)
# ... and flag columns that are cast with int() first.
expt_flags = (
    "published",
    "other_sr_data",
    "atac_seq",
    "rna_seq",
    "chip_seq",
    "three_dim_seq",
    "other_seq",
)

# In script form: utils.NascentDBConnection(db_url=db_url)
with NascentDBConnection(db_url="sqlite:///" + db_url) as session:
    for expt_row in expt_table:
        fields = {column: expt_row[expt_cfg[column]] for column in expt_passthrough}
        fields.update(
            {column: int(expt_row[expt_cfg[column]]) for column in expt_flags}
        )
        entry = exptMetadata(**fields)
        session.merge(entry)

### Parse metadata table and load sampleID values into DB, creating sample_id

In [132]:
# Config section mapping sampleID columns to metadata-table column names.
sample_cfg = config["sample keys"]
name_col = sample_cfg["sample_name"]
srr_col = sample_cfg["srr"]
sample_keys = list(dict(sample_cfg).values())

# In script form: utils.get_unique_table("metadata", sample_keys)
sample_table = get_unique_table("metadata", sample_keys)

# In script form: utils.table_parse(...) / utils.key_grab(...)
metatable = table_parse(config["file_locations"]["metadata"])
srz_list = np.array(key_grab(metatable, [name_col]))
srz_list = np.unique(srz_list[srz_list != ""])
# Distinct non-empty sample names are numbered 1..N in sorted order.
srz_table = {name: number for number, name in enumerate(srz_list, start=1)}

# Rows without a sample name get the next free id and use their SRR as name.
z = len(srz_table) + 1
for sample_row in sample_table:
    if sample_row[name_col] != "":
        sample_row["sample_id"] = srz_table[sample_row[name_col]]
        continue
    sample_row["sample_id"] = z
    sample_row[name_col] = sample_row[srr_col]
    z += 1

# In script form: utils.NascentDBConnection(db_url=db_url)
with NascentDBConnection(db_url="sqlite:///" + db_url) as session:
    for sample_row in sample_table:
        entry = sampleID(
            sample_id=sample_row["sample_id"],
            srr=sample_row[srr_col],
            sample_name=sample_row[name_col],
        )
        session.merge(entry)

### Parse metadata table and load geneticInfo values into DB

In [133]:
# Config section mapping geneticInfo columns to metadata-table column names.
genetic_cfg = config["genetic keys"]
genetic_keys = list(dict(genetic_cfg).values())

# In script form: utils.get_unique_table("metadata", genetic_keys)
genetic_table = get_unique_table("metadata", genetic_keys)

genetic_columns = (
    "organism",
    "sample_type",
    "cell_type",
    "clone_individual",
    "strain",
    "genotype",
    "construct",
)

# In script form: utils.NascentDBConnection(db_url=db_url)
with NascentDBConnection(db_url="sqlite:///" + db_url) as session:
    for genetic_row in genetic_table:
        genetic = geneticInfo(
            **{column: genetic_row[genetic_cfg[column]] for column in genetic_columns}
        )
        session.merge(genetic)

### Parse metadata table and make conditions table

In [134]:
# The three configured condition columns are read positionally below as
# (condition types, treatments, times); "srr" is appended as the last column.
meta_condition_keys = list(dict(config["metatable condition keys"]).values())
meta_condition_keys.append("srr")

metatable = table_parse(config["file_locations"]["metadata"])
meta_condition_table = np.array(key_grab(metatable, meta_condition_keys))


def _split_treatment(treatment, empty_conc):
    """Split ``"name(conc)"`` into ``(name, conc)``.

    Without parentheses the concentration is "NA or not known"; an empty
    treatment yields ``empty_conc`` as its concentration.
    """
    if "(" in treatment:
        parts = treatment.split("(")
        return parts[0], parts[1].split(")")[0]
    if treatment:
        return treatment, "NA or not known"
    return treatment, empty_conc


def _split_times(times):
    """Split ``"start,end,unit"`` into a 3-tuple; empty input gives blanks."""
    if len(times) > 0:
        fields = times.split(",")
        return fields[0], fields[1], fields[2]
    return "", "", ""


condition_table = [
    [
        "srr",
        "condition_type",
        "treatment",
        "conc_intens",
        "start_time",
        "end_time",
        "time_unit",
    ]
]
for meta_row in meta_condition_table:
    # srr was appended last, so take it from the end rather than a fixed index
    srr = meta_row[-1]

    if ";" in meta_row[0]:
        # Several conditions for one SRR: the columns hold parallel ";" lists
        treatment_types = meta_row[0].split(";")
        treatments = meta_row[1].split(";")
        unparsed_times = meta_row[2].split(";")
        if min(len(treatments), len(unparsed_times)) < len(treatment_types):
            raise ValueError(
                f"{srr}: fewer treatments or times than condition types"
            )
        # In a multi-condition row an empty treatment keeps the placeholder
        empty_conc = "NA or not known"
    else:
        treatment_types = [meta_row[0]]
        treatments = [meta_row[1]]
        unparsed_times = [meta_row[2]]
        # A single empty treatment gets an empty concentration
        empty_conc = ""

    for j in range(len(treatment_types)):
        new_treatment, conc_int = _split_treatment(treatments[j], empty_conc)
        start, end, time_unit = _split_times(unparsed_times[j])

        entry = [
            srr,
            treatment_types[j],
            new_treatment,
            conc_int,
            start,
            end,
            time_unit,
        ]
        condition_table.append(entry)

# Write out condition table
outfile = config["file_locations"]["conditions"]
with open(outfile, "w", newline="") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerows(condition_table)

### Load conditionInfo values into DB

In [135]:
# Config section mapping conditionInfo columns to conditions-table columns.
condition_cfg = config["condition keys"]
condition_keys = list(dict(condition_cfg).values())

# In script form: utils.get_unique_table("conditions", condition_keys)
condition_table = get_unique_table("conditions", condition_keys)


def _simplify_duration(duration, time_unit):
    """Express ``duration`` in the largest unit that divides it evenly.

    Returns ``(duration, unit)``. Units other than "s", "min" and "hr"
    are reported as "day" with the duration unchanged.
    """
    # For each input unit: candidate larger units, largest first, with the
    # number of input units each one contains.
    ladders = {
        "s": (("day", 86400), ("hr", 3600), ("min", 60)),
        "min": (("day", 1440), ("hr", 60)),
        "hr": (("day", 24),),
    }
    if time_unit not in ladders:
        return duration, "day"
    for unit, factor in ladders[time_unit]:
        if duration % factor == 0:
            # Exact division, so // keeps the value an int for the Integer column
            return duration // factor, unit
    return duration, time_unit


# In script form: utils.NascentDBConnection(db_url=db_url)
with NascentDBConnection(db_url="sqlite:///" + db_url) as session:
    for condition_row in condition_table:
        start_time = condition_row[condition_cfg["start_time"]]
        end_time = condition_row[condition_cfg["end_time"]]
        # Use this row's own unit, not a value left over from an earlier cell
        time_unit = condition_row[condition_cfg["time_unit"]]

        if end_time:
            duration, duration_unit = _simplify_duration(
                int(end_time) - int(start_time), time_unit
            )
        else:
            # No times recorded for this condition: nothing to compute
            duration, duration_unit = None, None

        # start_time/end_time/time_unit are stored exactly as read from the table
        condition = conditionInfo(
            condition_type=condition_row[condition_cfg["condition_type"]],
            treatment=condition_row[condition_cfg["treatment"]],
            conc_intens=condition_row[condition_cfg["conc_intens"]],
            start_time=start_time,
            end_time=end_time,
            time_unit=time_unit,
            duration=duration,
            duration_unit=duration_unit,
        )
        session.merge(condition)

### Create exptCondition equivalencies in DB

In [136]:
# Pull condition info including ID from the database and key each condition
# by a tuple of its stringified fields. A tuple keeps field boundaries, so
# ("ab", "c") and ("a", "bc") cannot collide the way concatenation would.
condition_fields = (
    "condition_type",
    "treatment",
    "conc_intens",
    "start_time",
    "end_time",
    "time_unit",
)

condition_dict = {}

# In script form: utils.NascentDBConnection(db_url=db_url)
with NascentDBConnection(db_url="sqlite:///" + db_url) as session:
    for row in session.query(conditionInfo).all():
        condition_key = tuple(str(getattr(row, field)) for field in condition_fields)
        condition_dict[condition_key] = row.condition_id

# Pull sample ID from database for each SRR

# In script form: utils.NascentDBConnection(db_url=db_url)
with NascentDBConnection(db_url="sqlite:///" + db_url) as session:
    sample_dict = {row.srr: row.sample_id for row in session.query(sampleID).all()}

# Grab condition table including SRRs and pair each SRR with its condition key.
# Fields are looked up by name in the same order used for the database rows,
# so the match does not depend on the order of keys in the config file.
condition_cfg = config["condition keys"]
condition_keys = list(dict(condition_cfg).values())
condition_keys.append("srr")
# In script form: utils.table_parse(config["file_locations"]["conditions"])
condition_table = table_parse(config["file_locations"]["conditions"])

srr_cond = {
    (
        table_row["srr"],
        tuple(table_row[condition_cfg[field]] for field in condition_fields),
    )
    for table_row in condition_table
}

# Make the de-duplicated, sorted list of (sample ID, condition ID) pairs
sample_condition = sorted(
    {
        (sample_dict[srr_value], condition_dict[cond_key])
        for srr_value, cond_key in srr_cond
    }
)

# Add to database. The connection context commits once on exit, so all
# pairs are written in a single transaction.

# In script form: utils.NascentDBConnection(db_url=db_url)
with NascentDBConnection(db_url="sqlite:///" + db_url) as session:
    for sample_key, condition_key in sample_condition:
        statement = exptCondition.insert().values(
            sample_id=int(sample_key),
            condition_id=int(condition_key),
        )
        session.execute(statement)

### Build sampleAccum table and linker table

#### Add relevant DB keys to table in correct rows

In [137]:
# metatable = utils.table_parse(config["file_locations"]["metadata"])
metatable = table_parse(config["file_locations"]["metadata"])

# Do some data massaging
name_column = config["sample keys"]["sample_name"]
srr_column = config["sample keys"]["srr"]
for record in metatable:
    record["year"] = int(record["year"])
    record["replicate"] = record["replicate"][:4]
    # Samples without a name fall back to their SRR
    if not record[name_column]:
        record[name_column] = record[srr_column]
    # Any cell that is exactly "0" or "1" becomes a boolean
    for column in record:
        cell = record[column]
        if cell == "0":
            record[column] = False
        elif cell == "1":
            record[column] = True


def _tag_matching_rows(model, key_dict, copied_attrs):
    """Copy ``copied_attrs`` from each ``model`` row onto matching metatable rows.

    A metatable row matches when value_compare() agrees on every column
    pair listed in ``key_dict``.
    """
    # In script form: utils.NascentDBConnection(db_url=db_url)
    with NascentDBConnection(db_url="sqlite:///" + db_url) as session:
        for row in session.query(model).all():
            db_row = object_as_dict(row)
            for record in metatable:
                if value_compare(db_row, record, key_dict):
                    for attr in copied_attrs:
                        record[attr] = getattr(row, attr)


# Add sample id
sample_keys = dict(config["sample keys"])
_tag_matching_rows(sampleID, sample_keys, ("sample_id", "sample_name"))

# Add genetic id
genetic_keys = dict(config["genetic keys"])
_tag_matching_rows(geneticInfo, genetic_keys, ("genetic_id",))

# Add experimental id
expt_keys = dict(config["expt keys"])
_tag_matching_rows(exptMetadata, expt_keys, ("expt_id",))

# Collapse table to unique values based on sample_id (combines SRZs,
# essentially); the last row seen for each sample_id is the one kept.
rows_by_sample = {record["sample_id"]: record for record in metatable}
metatable = list(rows_by_sample.values())

#### Make linkIDs table

In [138]:
# with utils.NascentDBConnection(db_url=db_url) as session:
# One linkIDs row per collapsed metatable entry, tying together the three ids.
with NascentDBConnection(db_url="sqlite:///" + db_url) as session:
    for record in metatable:
        entry = linkIDs(
            **{
                column: record[column]
                for column in ("sample_id", "genetic_id", "expt_id")
            }
        )
        session.merge(entry)

#### Load database data location

In [139]:
# Base path read from the [file_locations] config section; the scraping cells
# below append "/qc/<tool>/..." to it to locate each sample's QC output.
data_path = config["file_locations"]["data"]

#### Scrape FastQC data

In [140]:
def _read_fastqc_report(zip_base, extract_dir):
    """Return (total_sequences, read_length) parsed from one FastQC archive.

    ``zip_base`` is the archive path without the ".zip" suffix. The archive is
    unpacked into ``extract_dir`` and is expected to produce a directory at
    ``zip_base`` containing ``fastqc_data.txt``; that directory is removed
    again even if parsing fails. A value is None when its line is absent.
    For a length range such as "35-76" the lower bound is returned.
    """
    total_sequences = None
    read_length = None

    with zp.ZipFile(zip_base + ".zip", "r") as archive:
        archive.extractall(extract_dir)

    try:
        with open(zip_base + "/fastqc_data.txt") as report:
            for line in report:
                if "Total Sequences" in line:
                    total_sequences = int(line.split()[2])
                if "Sequence length" in line:
                    read_length = int(line.split()[2].split("-")[0])
    finally:
        # Never leave the unpacked report behind in the data directory.
        shutil.rmtree(zip_base, ignore_errors=True)

    return total_sequences, read_length


for sample_row in metatable:
    # Paper-specific subdirectory lookup is disabled; paths carry no paper prefix.
    paper_id = ""
    sample = sample_row["sample_name"]
    dirpath = data_path + paper_id + "/qc/fastqc/zips/"

    # Default every field so later cells can rely on the keys being present.
    sample_row["raw_read_depth"] = None
    sample_row["raw_read_length"] = None
    sample_row["trim_read_depth"] = None

    # Paired-end samples are represented by their first-mate report.
    if sample_row[config["accum keys"]["single_paired"]] == "paired":
        mate_suffix = "_1"
    else:
        mate_suffix = ""

    raw_base = dirpath + sample + mate_suffix + "_fastqc"
    if not os.path.exists(raw_base + ".zip"):
        continue

    raw_depth, raw_length = _read_fastqc_report(raw_base, dirpath)
    sample_row["raw_read_depth"] = raw_depth
    sample_row["raw_read_length"] = raw_length

    # Reverse-complemented samples have ".flip" in the trimmed report name.
    if sample_row[config["accum keys"]["rcomp"]] == 1:
        trim_tag = ".flip.trim"
    else:
        trim_tag = ".trim"

    trim_base = dirpath + sample + mate_suffix + trim_tag + "_fastqc"
    # A missing trimmed report is treated like a missing raw report (value
    # stays None) instead of aborting the whole build.
    if os.path.exists(trim_base + ".zip"):
        sample_row["trim_read_depth"] = _read_fastqc_report(trim_base, dirpath)[0]

#### Scrape picardtools data

In [141]:
# Read the duplication value from each sample's picard duplicate-metrics file.
for sample_row in metatable:
    # Paper-specific subdirectory lookup is disabled; paths carry no paper prefix.
    paper_id = ""
    sample = sample_row["sample_name"]
    dirpath = data_path + paper_id + "/qc/picard/dups/"
    filepath = dirpath + sample + ".marked_dup_metrics.txt"

    # Default first so the key exists even when the file or its metrics row
    # is missing (the scoring cell reads it unconditionally).
    sample_row["duplication_picard"] = None

    if not os.path.isfile(filepath):
        continue

    with open(filepath) as fdata:
        for line in fdata:
            # The metrics row is identified by its "Unknown Library" label;
            # the value is taken from the 9th tab-separated column.
            if "Unknown Library" in line:
                sample_row["duplication_picard"] = float(line.split("\t")[8])

#### Scrape mapping data

In [142]:
def _hisat2_count(line):
    """Return the integer found between ': ' and ' (' on a summary line."""
    return int(line.split(": ")[1].split(" (")[0])


# Read uniquely mapped reads, multi-mapped reads and the overall alignment
# rate from each sample's HISAT2 summary file.
for sample_row in metatable:
    # Paper-specific subdirectory lookup is disabled; paths carry no paper prefix.
    paper_id = ""
    sample = sample_row["sample_name"]
    dirpath = data_path + paper_id + "/qc/hisat2_mapstats/"
    filepath = dirpath + sample + ".hisat2_mapstats.txt"

    # Default first so the keys exist even for missing/incomplete files.
    sample_row["single_map"] = None
    sample_row["multi_map"] = None
    sample_row["map_prop"] = None

    if not os.path.isfile(filepath):
        continue

    is_paired = sample_row[config["accum keys"]["single_paired"]] == "paired"

    # Concordant pair counts are kept in separate per-sample accumulators.
    # Sharing one variable made the unique total use whichever concordant
    # line was read last, and let values leak between samples.
    concordant_unique_pairs = 0
    concordant_multi_pairs = 0
    unpaired_unique = None
    unpaired_multi = None

    with open(filepath) as fdata:
        for line in fdata:
            if is_paired and "concordantly 1 time" in line:
                concordant_unique_pairs = _hisat2_count(line)
            if is_paired and "concordantly >1 times" in line:
                concordant_multi_pairs = _hisat2_count(line)
            if "Aligned 1 time" in line:
                unpaired_unique = _hisat2_count(line)
            if "Aligned >1 times" in line:
                unpaired_multi = _hisat2_count(line)
            if "Overall alignment rate" in line:
                sample_row["map_prop"] = (
                    float(line.split(": ")[1].split("%")[0]) / 100
                )

    # Each concordant pair contributes two reads; totals are only reported
    # when the corresponding "Aligned ..." line was present.
    if unpaired_unique is not None:
        sample_row["single_map"] = concordant_unique_pairs * 2 + unpaired_unique
    if unpaired_multi is not None:
        sample_row["multi_map"] = concordant_multi_pairs * 2 + unpaired_multi

#### Scrape rseqc data

In [143]:
# Read tag counts and tags/kb from each sample's rseqc read-distribution
# file and derive the exon/intron density ratio.
for sample_row in metatable:
    # Paper-specific subdirectory lookup is disabled; paths carry no paper prefix.
    paper_id = ""
    sample = sample_row["sample_name"]
    dirpath = data_path + paper_id + "/qc/rseqc/read_distribution/"
    filepath = dirpath + sample + ".read_distribution.txt"

    # Default first so every key exists even for missing/incomplete files.
    for field in (
        "rseqc_tags",
        "rseqc_cds",
        "cds_rpk",
        "rseqc_five_utr",
        "rseqc_three_utr",
        "rseqc_intron",
        "intron_rpk",
        "exint_ratio",
    ):
        sample_row[field] = None

    if not os.path.isfile(filepath):
        continue

    with open(filepath) as fdata:
        for line in fdata:
            columns = line.split()
            if "Total Assigned Tags" in line:
                sample_row["rseqc_tags"] = int(columns[-1])
            if "CDS_Exons" in line:
                sample_row["rseqc_cds"] = int(columns[2])
                sample_row["cds_rpk"] = float(columns[-1])
            if "5'UTR_Exons" in line:
                sample_row["rseqc_five_utr"] = int(columns[2])
            if "3'UTR_Exons" in line:
                sample_row["rseqc_three_utr"] = int(columns[2])
            if "Introns" in line:
                sample_row["rseqc_intron"] = int(columns[2])
                sample_row["intron_rpk"] = float(columns[-1])

    cds_rpk = sample_row["cds_rpk"]
    intron_rpk = sample_row["intron_rpk"]
    # The ratio is only defined when both densities were found and the
    # intron density is positive; otherwise it stays None.
    if cds_rpk is not None and intron_rpk is not None and intron_rpk > 0:
        sample_row["exint_ratio"] = cds_rpk / intron_rpk

#### Pull preseq data

In [144]:
# Read, from each sample's preseq extrapolation file, the value on the row
# that starts with 10000000.0 and store it as a proportion of ten million.
for sample_row in metatable:
    # Paper-specific subdirectory lookup is disabled; paths carry no paper prefix.
    paper_id = ""
    sample = sample_row["sample_name"]
    dirpath = data_path + paper_id + "/qc/preseq/"
    filepath = dirpath + sample + ".lc_extrap.txt"

    # Reset per sample: a file without the 10M row must yield None rather
    # than silently reusing the value parsed for a previous sample.
    sample_row["distinct_tenmillion_prop"] = None

    if not os.path.isfile(filepath):
        continue

    with open(filepath) as fdata:
        for line in fdata:
            if line.startswith("10000000.0"):
                distinct = float(line.split()[1])
                sample_row["distinct_tenmillion_prop"] = distinct / 10000000

#### Pull pileup data

In [145]:
# Aggregate each sample's per-sequence coverage stats into a genome-wide
# covered proportion and a length-weighted average fold coverage.
for sample_row in metatable:
    # Paper-specific subdirectory lookup is disabled; paths carry no paper prefix.
    paper_id = ""
    sample = sample_row["sample_name"]
    dirpath = data_path + paper_id + "/qc/pileup/"
    filepath = dirpath + sample + ".coverage.stats.txt"

    # Default first so the keys exist even for missing/empty files.
    sample_row["genome_prop_cov"] = None
    sample_row["avg_fold_cov"] = None

    if not os.path.isfile(filepath):
        continue

    total = cov = fold = 0
    with open(filepath) as fdata:
        next(fdata, None)  # first line is a header
        for line in fdata:
            if not line.strip():
                continue  # tolerate blank/trailing lines
            columns = line.split("\t")
            # Column use follows the original code: [1] is weighted by [2]
            # for the fold average, [2] is summed as the denominator and
            # [5] is summed as the covered amount.
            length = int(columns[2])
            total = total + length
            cov = cov + int(columns[5])
            fold = fold + float(columns[1]) * length

    # A header-only file has no length to divide by; leave the values None.
    if total > 0:
        sample_row["genome_prop_cov"] = cov / total
        sample_row["avg_fold_cov"] = fold / total

#### Calculate qc/data scores

In [146]:
# Score tiers, checked from worst (5) to best (2); a sample gets the score of
# the first tier for which ANY criterion is met, otherwise 1. A score of 0
# means one of the required inputs is missing.
#
# (score, max trimmed depth, min duplication, max mapped reads, min distinct prop)
_QC_TIERS = (
    (5, 5000000, 0.95, 4000000, 0.05),
    (4, 10000000, 0.80, 8000000, 0.2),
    (3, 15000000, 0.65, 12000000, 0.35),
    (2, 20000000, 0.5, 16000000, 0.5),
)
# (score, max genome proportion covered, min exon/intron ratio)
_DATA_TIERS = (
    (5, 0.04, 9),
    (4, 0.08, 7),
    (3, 0.12, 5),
    (2, 0.16, 3),
)


def _samp_qc_score(depth, duplication, map_prop, distinct_prop):
    """Return the 0-5 sample QC score (0 = missing data, 1 = best, 5 = worst)."""
    if any(v is None for v in (depth, duplication, map_prop, distinct_prop)):
        return 0
    mapped_reads = map_prop * depth
    for score, max_depth, min_dup, max_mapped, min_distinct in _QC_TIERS:
        if (
            depth <= max_depth
            or duplication >= min_dup
            or mapped_reads <= max_mapped
            or distinct_prop < min_distinct
        ):
            return score
    return 1


def _samp_data_score(genome_prop_cov, exint_ratio):
    """Return the 0-5 sample data score (0 = missing data, 1 = best, 5 = worst)."""
    if genome_prop_cov is None or exint_ratio is None:
        return 0
    for score, max_cov, min_ratio in _DATA_TIERS:
        if genome_prop_cov <= max_cov or exint_ratio >= min_ratio:
            return score
    return 1


for sample_row in metatable:
    sample_row["samp_qc_score"] = _samp_qc_score(
        sample_row["trim_read_depth"],
        sample_row["duplication_picard"],
        sample_row["map_prop"],
        sample_row["distinct_tenmillion_prop"],
    )
    sample_row["samp_data_score"] = _samp_data_score(
        sample_row["genome_prop_cov"],
        sample_row["exint_ratio"],
    )

#### Output database values for plotting

In [144]:
import csv

# Dump the accumulated per-sample values to CSV for plotting.
# NOTE(review): the output path is hard-coded to one user's desktop.
# The column set is taken from the first row, so nothing is written when the
# table is empty.
if metatable:
    # newline="" is what the csv module requires of files it writes to, so
    # that it controls line endings itself.
    with open("/home/lsanford/Desktop/db_output.csv", "w", newline="") as f:
        w = csv.DictWriter(f, fieldnames=list(metatable[0].keys()))
        w.writeheader()
        w.writerows(metatable)

#### Input data into database

In [147]:
accum_keys = list(dict(config["accum keys"]).values())

# sampleAccum columns whose metatable column name is looked up in the
# [accum keys] config section.
_ACCUM_CONFIG_FIELDS = (
    "replicate",
    "single_paired",
    "rcomp",
    "expt_unusable",
    "timecourse",
    "baseline_control_expt",
    "notes",
)
# sampleAccum columns stored in metatable under the same name by the
# scraping and scoring cells.
_ACCUM_SCRAPED_FIELDS = (
    "raw_read_depth",
    "trim_read_depth",
    "raw_read_length",
    "duplication_picard",
    "single_map",
    "multi_map",
    "map_prop",
    "rseqc_tags",
    "rseqc_cds",
    "rseqc_five_utr",
    "rseqc_three_utr",
    "rseqc_intron",
    "cds_rpk",
    "intron_rpk",
    "exint_ratio",
    "distinct_tenmillion_prop",
    "genome_prop_cov",
    "avg_fold_cov",
    "samp_qc_score",
    "samp_data_score",
)

# Merge one sampleAccum row per metatable entry.
# with utils.NascentDBConnection(db_url=db_url) as session:
with NascentDBConnection(db_url="sqlite:///" + db_url) as session:
    for sample_row in metatable:
        columns = {"sample_id": sample_row["sample_id"]}
        for field in _ACCUM_CONFIG_FIELDS:
            columns[field] = sample_row[config["accum keys"][field]]
        for field in _ACCUM_SCRAPED_FIELDS:
            columns[field] = sample_row[field]
        session.merge(sampleAccum(**columns))

#### Add paper qc score to the exptMetadata table based on sample qc scores

In [148]:
score_update = []

# First pass: for every distinct paper, collect the sample scores linked to
# it and take the median of each score type.
# with utils.NascentDBConnection(db_url=db_url) as session:
with NascentDBConnection(db_url="sqlite:///" + db_url) as session:
    for paperid in session.query(exptMetadata.paper_id).distinct():
        # One join fetches both scores for every sample linked to the paper.
        sample_scores = (
            session.query(sampleAccum.samp_qc_score, sampleAccum.samp_data_score)
            .join(linkIDs, linkIDs.sample_id == sampleAccum.sample_id)
            .join(exptMetadata, exptMetadata.expt_id == linkIDs.expt_id)
            .filter(exptMetadata.paper_id == paperid[0])
            .all()
        )
        qc_scores = [scores[0] for scores in sample_scores if scores[0] is not None]
        data_scores = [scores[1] for scores in sample_scores if scores[1] is not None]

        score_update.append(
            dict(
                identifier=paperid[0],
                # median() raises on an empty sequence; a paper without any
                # scored samples gets no score instead of aborting the build.
                paperqc=median(qc_scores) if qc_scores else None,
                paperdata=median(data_scores) if data_scores else None,
            )
        )

# Second pass: write the paper-level medians onto every exptMetadata row of
# that paper.
# with utils.NascentDBConnection(db_url=db_url) as session:
with NascentDBConnection(db_url="sqlite:///" + db_url) as session:
    for paper_scores in score_update:
        session.query(exptMetadata).filter(
            exptMetadata.paper_id == paper_scores["identifier"]
        ).update(
            {
                exptMetadata.paper_qc_score: paper_scores["paperqc"],
                exptMetadata.paper_data_score: paper_scores["paperdata"],
            }
        )

### Create nascentflow and bidirectionalflow version tables and scrape version data

In [149]:
nascentflow_keys = list(dict(config["nascentflow keys"]).values())
bidirflow_keys = list(dict(config["bidirflow keys"]).values())
dirpath = config["file_locations"]["version_data"]

nf_table = []
bidir_table = []


def _read_version_runs(path, keys):
    """Return one dict per YAML document in ``path``, at least one.

    Every returned dict contains all of ``keys`` (None when the document does
    not provide it). An empty file yields a single all-None run, and an empty
    document is treated as an empty mapping, so callers can always rely on
    the keys being present.
    """
    with open(path) as handle:
        # safe_load_all yields None for an empty document.
        runs = [dict(doc or {}) for doc in yaml.safe_load_all(handle)]
    if not runs:
        runs = [{}]
    for run in runs:
        for key in keys:
            run.setdefault(key, None)
    return runs


def _store_version_runs(table, index, runs):
    """Put the first run into ``table[index]``; append extra runs as new rows."""
    identity = {
        field: table[index][field]
        for field in ("sample_id", "paper_id", "sample_name")
    }
    table[index].update(runs[0])
    for extra_run in runs[1:]:
        extra_row = dict(identity)
        extra_row.update(extra_run)
        table.append(extra_row)


def _blank_version_row(table_row, keys):
    """Set every version key of ``table_row`` to None."""
    for key in keys:
        table_row[key] = None


# Seed both tables with one row per distinct (sample, paper) combination.
# with utils.NascentDBConnection(db_url=db_url) as session:
with NascentDBConnection(db_url="sqlite:///" + db_url) as session:
    for row in (
        session.query(linkIDs.sample_id, exptMetadata.paper_id, sampleID.sample_name)
        .join(exptMetadata, exptMetadata.expt_id == linkIDs.expt_id)
        .join(sampleID, sampleID.sample_id == linkIDs.sample_id)
        .distinct()
    ):
        nf_table.append(
            {"sample_id": row[0], "paper_id": row[1], "sample_name": row[2]}
        )
        bidir_table.append(
            {"sample_id": row[0], "paper_id": row[1], "sample_name": row[2]}
        )

# Only the seeded rows are visited; rows appended for additional runs of the
# same sample are already complete.
seeded_rows = len(nf_table)
for i in range(seeded_rows):
    sampname = nf_table[i]["sample_name"]
    # Version files are looked up directly under dirpath (no per-paper
    # subdirectory).
    nascent_path = dirpath + sampname + "_nascent.yaml"
    bidir_path = dirpath + sampname + "_bidir.yaml"

    # Without a nascentflow record the bidirflow record is not read either.
    if not os.path.isfile(nascent_path):
        _blank_version_row(nf_table[i], nascentflow_keys)
        _blank_version_row(bidir_table[i], bidirflow_keys)
        continue

    _store_version_runs(
        nf_table, i, _read_version_runs(nascent_path, nascentflow_keys)
    )

    if not os.path.isfile(bidir_path):
        _blank_version_row(bidir_table[i], bidirflow_keys)
        continue

    _store_version_runs(
        bidir_table, i, _read_version_runs(bidir_path, bidirflow_keys)
    )

# nascentflowMetadata columns; each one's source key in an nf_table row is
# looked up in the [nascentflow keys] config section.
_NASCENTFLOW_FIELDS = (
    "nascentflow_version",
    "pipeline_revision_hash",
    "pipeline_hash",
    "nascentflow_date",
    "nascentflow_redo_date",
    "nextflow_version",
    "fastqc_version",
    "bbmap_version",
    "hisat2_version",
    "samtools_version",
    "sratools_version",
    "preseq_version",
    "preseq_date",
    "rseqc_version",
    "rseqc_date",
    "java_version",
    "picard_gc_version",
    "picard_dups_version",
    "picard_date",
    "bedtools_version",
    "igvtools_version",
    "seqkit_version",
    "mpich_version",
    "gcc_version",
    "python_version",
    "numpy_version",
)

# Merge one nascentflowMetadata row per scraped nascentflow run.
# with utils.NascentDBConnection(db_url=db_url) as session:
with NascentDBConnection(db_url="sqlite:///" + db_url) as session:
    for nf_run in nf_table:
        columns = {"sample_id": nf_run["sample_id"]}
        for field in _NASCENTFLOW_FIELDS:
            columns[field] = nf_run[config["nascentflow keys"][field]]
        session.merge(nascentflowMetadata(**columns))

# bidirflowMetadata columns; each one's source key in a bidir_table row is
# looked up in the [bidirflow keys] config section.
_BIDIRFLOW_FIELDS = (
    "bidirflow_version",
    "pipeline_revision_hash",
    "pipeline_hash",
    "bidirflow_date",
    "nextflow_version",
    "samtools_version",
    "bedtools_version",
    "mpich_version",
    "openmpi_version",
    "gcc_version",
    "r_version",
    "rsubread_version",
    "boost_version",
    "fstitch_version",
    "tfit_version",
    "dreg_version",
    "dreg_date",
    "tfit_date",
    "fcgene_date",
)

# Merge one bidirflowMetadata row per scraped bidirectionalflow run.
# with utils.NascentDBConnection(db_url=db_url) as session:
with NascentDBConnection(db_url="sqlite:///" + db_url) as session:
    for bidir_run in bidir_table:
        columns = {"sample_id": bidir_run["sample_id"]}
        for field in _BIDIRFLOW_FIELDS:
            columns[field] = bidir_run[config["bidirflow keys"][field]]
        session.merge(bidirflowMetadata(**columns))