# Script for adding and updating DBNascent values

### Import

In [1]:
#from . import dbutils (in script)
import datetime

In [12]:
paper_id = "Aho2019displacement"
#paper_id = "Aeby2020decapping"
#paper_id = "Danko2018dynamic"
#paper_id = "Estaras2015smad"
#paper_id = "Duttke2015human"
#paper_id = "Zhu2018rna"

In [13]:
# %load dborm.py
# dborm.py --- ORM for DBNascent
#
# Filename: dborm.py
# Description: SQLAlchemy ORM table definitions for DBNascent
# Authors: Lynn Sanford <lynn.sanford@colorado.edu> and Zach Maas
# Created: Mon Jun 10 13:11:55 2019 (-0600)
# URL:
#

# Commentary:
#
# This file contains code for an ORM to interface with the Dowell
# Lab's Nascent Database.
#

# Code:

import sqlalchemy as sql
from sqlalchemy.ext.declarative import declarative_base

# Base class for our ORM
Base = declarative_base()


# MAIN TABLES
class organismInfo(Base):
    """ORM mapping for the ``organismInfo`` table.

    One row per organism, keyed by the ``organism`` string. The
    ``organism`` column is the target of foreign keys declared in
    ``exptMetadata`` and ``geneticInfo``.
    """
    __tablename__ = "organismInfo"
#    metadata = MetaData()
    # Primary key: organism identifier string (up to 127 characters)
    organism = sql.Column(
        sql.String(length=127), primary_key=True, index=True, unique=True
    )
    # Genome build label (string, up to 50 characters)
    genome_build = sql.Column(sql.String(length=50))
    # Stored as BigInteger; presumably the genome size in bases -- TODO confirm
    genome_bases = sql.Column(sql.BigInteger)


class searchEq(Base):
    """ORM mapping for the ``searchEq`` table.

    Pairs a ``search_term`` (primary key) with a ``db_term``.
    NOTE(review): presumably maps user search terms onto the canonical
    terms stored in the database -- confirm against the loader code.
    """
    __tablename__ = "searchEq"
#    metadata = MetaData()
    # Primary key: the term as searched (up to 250 characters)
    search_term = sql.Column(
        sql.String(length=250), primary_key=True, index=True, unique=True
    )
    # Term paired with the search term (up to 127 characters)
    db_term = sql.Column(sql.String(length=127))


class exptMetadata(Base):
    """ORM mapping for the ``exptMetadata`` table.

    One row per ``expt_id`` (integer primary key). ``organism`` is a
    foreign key to ``organismInfo.organism``; ``paper_id`` is indexed
    (not unique) and is referenced by ``linkIDs.paper_id``.
    """
    __tablename__ = "exptMetadata"
#    metadata = MetaData()
    # Integer primary key for the experiment
    expt_id = sql.Column(sql.Integer,
                         primary_key=True,
                         index=True,
                         unique=True)
    # Experiment descriptors
    # NOTE(review): srp is presumably an SRA project accession -- confirm
    srp = sql.Column(sql.String(length=50))
    protocol = sql.Column(sql.String(length=50))
    organism = sql.Column(
        sql.String(length=127), sql.ForeignKey("organismInfo.organism")
    )
    library = sql.Column(sql.String(length=50))
    spikein = sql.Column(sql.String(length=127))
    # Paper identifier and publication details
    paper_id = sql.Column(sql.String(length=127),
                          index=True)
    published = sql.Column(sql.Boolean)
    year = sql.Column(sql.Integer)
    first_author = sql.Column(sql.String(length=127))
    last_author = sql.Column(sql.String(length=127))
    doi = sql.Column(sql.String(length=300))
    # Curator fields
    curator1 = sql.Column(sql.String(length=50))
    curator2 = sql.Column(sql.String(length=50))
    # Boolean flags; presumably whether the paper also has other data
    # types of each kind -- TODO confirm
    other_sr_data = sql.Column(sql.Boolean)
    atac_seq = sql.Column(sql.Boolean)
    rna_seq = sql.Column(sql.Boolean)
    chip_seq = sql.Column(sql.Boolean)
    three_dim_seq = sql.Column(sql.Boolean)
    other_seq = sql.Column(sql.Boolean)
    # Paper-level scores stored as floats
    paper_qc_score = sql.Column(sql.Float)
    paper_data_score = sql.Column(sql.Float)


class sampleID(Base):
    """ORM mapping for the ``sampleID`` table.

    One row per ``srr`` (string primary key). ``sample_name`` and
    ``sample_id`` are indexed but not declared unique here; both are
    used as foreign-key targets by other tables in this module.
    """
    __tablename__ = "sampleID"
#    metadata = MetaData()
    # Primary key
    # NOTE(review): srr is presumably an SRA run accession -- confirm
    srr = sql.Column(sql.String(length=50),
                     primary_key=True,
                     index=True,
                     unique=True)
    # Indexed, non-unique sample name (up to 50 characters)
    sample_name = sql.Column(sql.String(length=50),
                             index=True)
    # Indexed, non-unique integer sample identifier
    sample_id = sql.Column(sql.Integer,
                           index=True)


class geneticInfo(Base):
    """ORM mapping for the ``geneticInfo`` table.

    One row per ``genetic_id`` (integer primary key), referenced by
    ``linkIDs.genetic_id``. ``organism`` is a foreign key to
    ``organismInfo.organism``; the remaining columns are free-text
    descriptors of up to 127 characters each.
    """
    __tablename__ = "geneticInfo"
#    metadata = MetaData()
    # Integer primary key
    genetic_id = sql.Column(sql.Integer,
                            primary_key=True,
                            index=True,
                            unique=True)
    organism = sql.Column(
        sql.String(length=127), sql.ForeignKey("organismInfo.organism")
    )
    # Descriptive string fields
    sample_type = sql.Column(sql.String(length=127))
    cell_type = sql.Column(sql.String(length=127))
    clone_individual = sql.Column(sql.String(length=127))
    strain = sql.Column(sql.String(length=127))
    genotype = sql.Column(sql.String(length=127))
    construct = sql.Column(sql.String(length=127))


class conditionInfo(Base):
    """ORM mapping for the ``conditionInfo`` table.

    One row per ``condition_id`` (integer primary key), referenced by
    the ``exptCondition`` association table.
    """
    __tablename__ = "conditionInfo"
#    metadata = MetaData()
    # Integer primary key
    condition_id = sql.Column(sql.Integer,
                              primary_key=True,
                              index=True,
                              unique=True)
    # Condition descriptors (strings)
    condition_type = sql.Column(sql.String(length=127))
    treatment = sql.Column(sql.String(length=127))
    # NOTE(review): presumably concentration/intensity of the treatment,
    # kept as free text -- confirm
    conc_intens = sql.Column(sql.String(length=50))
    # Integer times with a separate string unit column
    start_time = sql.Column(sql.Integer)
    end_time = sql.Column(sql.Integer)
    time_unit = sql.Column(sql.String(length=50))
    # Integer duration with its own string unit column
    duration = sql.Column(sql.Integer)
    duration_unit = sql.Column(sql.String(length=50))


# Association table pairing sampleID.sample_id with
# conditionInfo.condition_id. Neither column is declared as a primary
# key, so no uniqueness of (sample_id, condition_id) pairs is enforced
# by this definition.
exptCondition = sql.Table(
    "exptCondition",
    Base.metadata,
    sql.Column("sample_id",
               sql.Integer,
               sql.ForeignKey("sampleID.sample_id")),
    sql.Column("condition_id",
               sql.Integer,
               sql.ForeignKey("conditionInfo.condition_id")),
)


class linkIDs(Base):
    """ORM mapping for the ``linkIDs`` table.

    One row per ``sample_id`` (primary key), linking it to a
    ``genetic_id``, an ``expt_id``, a ``sample_name`` and a ``paper_id``
    through foreign keys to ``sampleID``, ``geneticInfo`` and
    ``exptMetadata``.

    NOTE(review): the referenced columns ``sampleID.sample_id``,
    ``sampleID.sample_name`` and ``exptMetadata.paper_id`` are indexed
    but not declared unique; whether the database backend accepts
    foreign keys to non-unique columns should be confirmed.
    """
    __tablename__ = "linkIDs"
#    metadata = MetaData()
    # Primary key, also a foreign key to sampleID.sample_id
    sample_id = sql.Column(
        sql.Integer,
        sql.ForeignKey("sampleID.sample_id"),
        primary_key=True,
        index=True,
        unique=True,
    )
    genetic_id = sql.Column(sql.Integer,
                            sql.ForeignKey("geneticInfo.genetic_id"))
    expt_id = sql.Column(sql.Integer,
                         sql.ForeignKey("exptMetadata.expt_id"))
    # NOTE(review): declared as String(127) here, while the referenced
    # sampleID.sample_name column is String(50) -- confirm intended length
    sample_name = sql.Column(sql.String(length=127),
                         sql.ForeignKey("sampleID.sample_name"))
    paper_id = sql.Column(sql.String(length=127),
                         sql.ForeignKey("exptMetadata.paper_id"))


class sampleAccum(Base):
    """ORM mapping for the ``sampleAccum`` table.

    One row per ``sample_id`` (primary key and foreign key to
    ``sampleID.sample_id``). Several columns share their names with the
    keys produced by the ``scrape_fastqc``, ``scrape_picard``,
    ``scrape_mapstats`` and ``scrape_rseqc`` functions in this file
    (e.g. ``raw_read_depth``, ``duplication_picard``, ``single_map``,
    ``rseqc_tags``).
    """
    __tablename__ = "sampleAccum"
#    metadata = MetaData()
    # Primary key, also a foreign key to sampleID.sample_id
    sample_id = sql.Column(
        sql.Integer,
        sql.ForeignKey("sampleID.sample_id"),
        primary_key=True,
        index=True,
        unique=True,
    )
    # Sample-level descriptors
    replicate = sql.Column(sql.String(length=50))
    # scrape_fastqc/scrape_mapstats compare this value to "paired"
    single_paired = sql.Column(sql.String(length=50))
    # scrape_fastqc uses this flag to pick the ".flip" trimmed report
    rcomp = sql.Column(sql.Boolean)
    expt_unusable = sql.Column(sql.Boolean)
    timecourse = sql.Column(sql.Boolean)
    baseline_control_expt = sql.Column(sql.String(length=50))
    notes = sql.Column(sql.String(length=300))
    # Read depth/length values (keys written by scrape_fastqc)
    raw_read_depth = sql.Column(sql.Integer)
    trim_read_depth = sql.Column(sql.Integer)
    raw_read_length = sql.Column(sql.Integer)
    # Duplication value (key written by scrape_picard)
    duplication_picard = sql.Column(sql.Float)
    # Mapping statistics (keys written by scrape_mapstats)
    single_map = sql.Column(sql.Integer)
    multi_map = sql.Column(sql.Integer)
    map_prop = sql.Column(sql.Float)
    # Read distribution statistics (keys written by scrape_rseqc)
    rseqc_tags = sql.Column(sql.Integer)
    rseqc_cds = sql.Column(sql.Integer)
    rseqc_five_utr = sql.Column(sql.Integer)
    rseqc_three_utr = sql.Column(sql.Integer)
    rseqc_intron = sql.Column(sql.Integer)
    cds_rpk = sql.Column(sql.Float)
    intron_rpk = sql.Column(sql.Float)
    exint_ratio = sql.Column(sql.Float)
    # NOTE(review): presumably library complexity and coverage metrics;
    # their source is not visible in this part of the file -- confirm
    distinct_tenmillion_prop = sql.Column(sql.Float)
    genome_prop_cov = sql.Column(sql.Float)
    avg_fold_cov = sql.Column(sql.Float)
    # Sample-level scores stored as integers
    samp_qc_score = sql.Column(sql.Integer)
    samp_data_score = sql.Column(sql.Integer)


class nascentflowMetadata(Base):
    """ORM mapping for the ``nascentflowMetadata`` table.

    One row per ``nascentflow_id`` (integer primary key), referenced by
    the ``exptNascentflow`` association table. All ``*_version`` and
    ``*_hash`` columns are strings of up to 127 characters; all
    ``*_date`` columns are SQL dates.
    NOTE(review): presumably each row records one software/version
    configuration of a Nascent-Flow pipeline run -- confirm.
    """
    __tablename__ = "nascentflowMetadata"
#    metadata = MetaData()
    # Integer primary key
    nascentflow_id = sql.Column(
        sql.Integer, primary_key=True, index=True, unique=True
    )
    # Pipeline version, hashes and run dates
    nascentflow_version = sql.Column(sql.String(length=127))
    pipeline_revision_hash = sql.Column(sql.String(length=127))
    pipeline_hash = sql.Column(sql.String(length=127))
    nascentflow_date = sql.Column(sql.Date)
    nascentflow_redo_date = sql.Column(sql.Date)
    # Tool versions (with dates for preseq, rseqc and picard)
    nextflow_version = sql.Column(sql.String(length=127))
    fastqc_version = sql.Column(sql.String(length=127))
    bbmap_version = sql.Column(sql.String(length=127))
    hisat2_version = sql.Column(sql.String(length=127))
    samtools_version = sql.Column(sql.String(length=127))
    sratools_version = sql.Column(sql.String(length=127))
    preseq_version = sql.Column(sql.String(length=127))
    preseq_date = sql.Column(sql.Date)
    rseqc_version = sql.Column(sql.String(length=127))
    rseqc_date = sql.Column(sql.Date)
    java_version = sql.Column(sql.String(length=127))
    picard_gc_version = sql.Column(sql.String(length=127))
    picard_dups_version = sql.Column(sql.String(length=127))
    picard_date = sql.Column(sql.Date)
    bedtools_version = sql.Column(sql.String(length=127))
    igvtools_version = sql.Column(sql.String(length=127))
    seqkit_version = sql.Column(sql.String(length=127))
    mpich_version = sql.Column(sql.String(length=127))
    gcc_version = sql.Column(sql.String(length=127))
    python_version = sql.Column(sql.String(length=127))
    numpy_version = sql.Column(sql.String(length=127))


# Association table pairing sampleID.sample_id with
# nascentflowMetadata.nascentflow_id. Neither column is declared as a
# primary key, so duplicate pairs are not prevented by this definition.
exptNascentflow = sql.Table(
    "exptNascentflow",
    Base.metadata,
    sql.Column("sample_id",
               sql.Integer,
               sql.ForeignKey("sampleID.sample_id")),
    sql.Column("nascentflow_id",
               sql.Integer,
               sql.ForeignKey("nascentflowMetadata.nascentflow_id")),
)


class bidirflowMetadata(Base):
    """ORM mapping for the ``bidirflowMetadata`` table.

    One row per ``bidirflow_id`` (integer primary key), referenced by
    the ``exptBidirflow`` association table. All ``*_version`` and
    ``*_hash`` columns are strings of up to 127 characters; all
    ``*_date`` columns are SQL dates.
    NOTE(review): presumably each row records one software/version
    configuration of a Bidirectional-Flow pipeline run -- confirm.
    """
    __tablename__ = "bidirflowMetadata"
#    metadata = MetaData()
    # Integer primary key
    bidirflow_id = sql.Column(
        sql.Integer, primary_key=True, index=True, unique=True
    )
    # Pipeline version, hashes and run date
    bidirflow_version = sql.Column(sql.String(length=127))
    pipeline_revision_hash = sql.Column(sql.String(length=127))
    pipeline_hash = sql.Column(sql.String(length=127))
    bidirflow_date = sql.Column(sql.Date)
    # Tool versions
    nextflow_version = sql.Column(sql.String(length=127))
    samtools_version = sql.Column(sql.String(length=127))
    bedtools_version = sql.Column(sql.String(length=127))
    mpich_version = sql.Column(sql.String(length=127))
    openmpi_version = sql.Column(sql.String(length=127))
    gcc_version = sql.Column(sql.String(length=127))
    r_version = sql.Column(sql.String(length=127))
    rsubread_version = sql.Column(sql.String(length=127))
    boost_version = sql.Column(sql.String(length=127))
    fstitch_version = sql.Column(sql.String(length=127))
    tfit_version = sql.Column(sql.String(length=127))
    dreg_version = sql.Column(sql.String(length=127))
    # Dates for individual analysis steps
    dreg_date = sql.Column(sql.Date)
    tfit_date = sql.Column(sql.Date)
    fcgene_date = sql.Column(sql.Date)


# Association table pairing sampleID.sample_id with
# bidirflowMetadata.bidirflow_id. Neither column is declared as a
# primary key, so duplicate pairs are not prevented by this definition.
exptBidirflow = sql.Table(
    "exptBidirflow",
    Base.metadata,
    sql.Column("sample_id",
               sql.Integer,
               sql.ForeignKey("sampleID.sample_id")),
    sql.Column("bidirflow_id",
               sql.Integer,
               sql.ForeignKey("bidirflowMetadata.bidirflow_id")),
)

# The following were created by Zach and we may or may not use...

# class tf(Base):
#    __tablename__ = "tf"
#    tf_id = sql.Column(sql.String(length=127), primary_key=True)
#    tf_alias = sql.Column(sql.String(length=127))


# class pipeline_status(Base):
#    __tablename__ = "pipeline_status"
#    srr_id = sql.Column(
#        sql.String(length=127),
#        sql.ForeignKey("srr_metadata.srr_id"),
#        primary_key=True,
#    )
#    fastqc_complete = sql.Column(sql.Boolean)
#    bbduk_complete = sql.Column(sql.Boolean)
#    hisat2_complete = sql.Column(sql.Boolean)
#    samtools_complete = sql.Column(sql.Boolean)
#    fastq_dump_complete = sql.Column(sql.Boolean)
#    pileup_complete = sql.Column(sql.String(length=127))
#    preseq_complete = sql.Column(sql.Boolean)
#    rseqc_complete = sql.Column(sql.String(length=127))
#    bedtools_complete = sql.Column(sql.Boolean)
#    igv_tools_complete = sql.Column(sql.Boolean)
#    fstitch_complete = sql.Column(sql.Boolean)
#    tfit_complete = sql.Column(sql.Boolean)


# class md_score(Base):
#    __tablename__ = "md_score"
#    srr_id = sql.Column(
#        sql.String(length=127),
#        sql.ForeignKey("srr_metadata.srr_id"),
#        primary_key=True,
#    )
#    tf_id = sql.Column(sql.String, sql.ForeignKey("tf.tf_id"))
#    erna_type = sql.Column(sql.String(length=127))
#    md_score_expected = sql.Column(sql.Integer)
#    md_score_std = sql.Column(sql.Integer)


# dborm.py ends here

In [32]:
# %load dbutils.py
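"""Functions for building and maintaining DBNascent.

Filename: dbutils.py
Authors: Lynn Sanford <lynn.sanford@colorado.edu> and Zach Maas

Commentary:
    This module contains utility functions and classes for
    reducing the total amount of code needed for building and
    updating the database

Classes:
    dbnascentConnection
    Metatable

Functions:
    load_config(filename) -> configparser object
    value_compare(db_row, metatable_row, key_dict) -> bool
    listdict_compare(comp_dict, db_dict, db_keys) -> list of dicts
    key_store_compare(comp_dict, db_dict, db_keys, store_keys) -> dict
    object_as_dict(obj) -> dict
    entry_update(dbtable, dbkeys, comp_table) -> list of dicts
    scrape_fastqc(paper_id, sample_name, data_path, db_sample) -> dict
    scrape_picard(paper_id, sample_name, data_path) -> dict
    scrape_mapstats(paper_id, sample_name, data_path, db_sample) -> dict
    scrape_rseqc(paper_id, sample_name, data_path) -> dict

Misc variables:
"""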

import configparser
import csv
import numpy as np
import os
import re
import pymysql
import sqlalchemy as sql
from sqlalchemy.ext.serializer import loads, dumps
from sqlalchemy.orm import sessionmaker
import shutil
from statistics import median
import yaml
import zipfile as zp
#from . import dborm


# Database Connection Handler
class dbnascentConnection:
    """A class to handle connection to the mysql database.

    Attributes:
        engine (dialect, pool objects) : engine created by sqlalchemy

        Session (sessionmaker) : session factory bound to the engine
            (also available as ``_Session``)

        session (session object) : ORM session object created by sqlalchemy

    Methods:
        add_tables : create ORM tables that do not exist yet

        reflect_table : query a table (optionally filtered) as dicts

        backup : serialize tables into ``<table>.dbdump`` files

        restore : insert rows from ``<table>.dbdump`` files
    """

    engine = None
    _Session = None
    Session = None
    session = None

    def __init__(self, db_url, cred_path):
        """Initialize database connection.

        If ``cred_path`` is given it takes precedence and the connection
        is made to the ``dbnascent`` database on localhost; otherwise
        ``db_url`` is appended to ``mysql+pymysql://``.

        Parameters:
            db_url (str) : path to database (mandatory if no cred_path)

            cred_path (str) : path to tab-delimited credentials
                one line file with username tab password

        Returns:
            none

        Raises:
            FileNotFoundError : if neither cred_path nor db_url is given
        """
        if cred_path:
            with open(cred_path) as f:
                # Strip the line terminator (including a Windows "\r")
                # before splitting so it never ends up in the password
                cred = next(f).rstrip("\r\n").split("\t")
            self.engine = sql.create_engine("mysql+pymysql://" + str(cred[0])
                                            + ":" + str(cred[1])
                                            + "@localhost/dbnascent",
                                            echo=False)
        elif db_url:
            self.engine = sql.create_engine("mysql+pymysql://" + db_url,
                                            echo=False)
        else:
            raise FileNotFoundError(
                "Database url must be provided"
            )
        self.Session = sessionmaker(bind=self.engine)
        self._Session = self.Session
        self.session = self.Session()

    def add_tables(self) -> None:
        """Add tables into database from ORM.

        Does not update existing tables.

        Parameters:
            none

        Returns:
            none
        """
        Base.metadata.create_all(self.engine)

    @staticmethod
    def _build_select(table, filter_crit=None):
        """Build a parameterized ``SELECT *`` statement for a table.

        Filter values are passed as bound parameters (``:p0``, ``:p1``,
        ...) instead of being pasted into the SQL text. Table and column
        names cannot be bound and are inserted verbatim, so they must
        come from trusted code, never from user input.

        Parameters:
            table (str) : table name

            filter_crit (dict) : column name -> required value; values
                are compared as strings, all criteria joined with AND

        Returns:
            (query_str, params) (str, dict) : SQL text and bound values
        """
        query_str = "SELECT * FROM " + table
        params = {}
        # An empty dict means "no filter" rather than a dangling WHERE
        if filter_crit:
            clauses = []
            for i, key in enumerate(filter_crit):
                name = "p" + str(i)
                clauses.append(str(key) + " = :" + name)
                params[name] = str(filter_crit[key])
            query_str = query_str + " WHERE " + " AND ".join(clauses)
        return query_str, params

    def _get_table(self, name):
        """Return the Table object for ``name``, reflecting it if needed."""
        if name not in Base.metadata.tables:
            Base.metadata.reflect(bind=self.engine, only=[name])
        return Base.metadata.tables[name]

    def reflect_table(self, table, filter_crit=None) -> list:
        """Query all records from a specific table.

        Can optionally add filtering criteria.

        Parameters:
            table (str) : string of table name from ORM

            filter_crit (dict) : filter criteria for table
                (column name -> value, combined with AND)

        Returns:
            query_data (list of dicts) : all data in table
                                         matching filter criteria
        """
        query_str, params = self._build_select(table, filter_crit)
        sqlquery = self.session.execute(sql.text(query_str), params).fetchall()

        # Newer SQLAlchemy rows expose their mapping view as ``_mapping``;
        # older row objects can be passed to dict() directly.
        return [dict(getattr(entry, "_mapping", entry)) for entry in sqlquery]

    def backup(self, out_path, tables=False):
        """Backup database (whole or specific tables).

        Each table is written to ``<out_path>/<table>.dbdump`` as the
        binary output of sqlalchemy's serializer.

        Parameters:
            out_path (str) : path to backup file directory

            tables (list) : list of specific tables, if whole
                            database backup is not desired

        Returns:
            none
        """
        if not tables:
            Base.metadata.reflect(bind=self.engine)
            tables = list(Base.metadata.tables.keys())
        for table in tables:
            # Query the Table object; a bare table-name string is not a
            # selectable entity.
            table_obj = self._get_table(table)
            outfile = os.path.join(out_path, table + ".dbdump")
            rows = self.session.query(table_obj).all()
            # dumps() returns bytes, so the file must be opened in
            # binary mode for the dump to be loadable again.
            with open(outfile, "wb") as out:
                out.write(dumps(rows))

    def restore(self, in_path, tables=None):
        """Restore database (whole or specific tables).

        Reads ``<in_path>/<table>.dbdump`` files written by ``backup``
        and inserts their rows into the matching tables through the
        session. The inserts are not committed here; call
        ``session.commit()`` afterwards. Rows are inserted, not merged,
        so restoring into a table that already holds the same primary
        keys raises a database integrity error.

        Parameters:
            in_path (str) : path to backup file directory

            tables (list) : list of specific tables, if whole
                            database restore is not desired

        Returns:
            none
        """
        suffix = ".dbdump"
        if not tables:
            found = [name[:-len(suffix)] for name in os.listdir(in_path)
                     if name.endswith(suffix)]
            # Insert parent tables before the tables that reference them
            order = {t.name: i
                     for i, t in enumerate(Base.metadata.sorted_tables)}
            tables = sorted(found, key=lambda name: (order.get(name,
                                                               len(order)),
                                                     name))
        for table in tables:
            table_obj = self._get_table(table)
            infile = os.path.join(in_path, table + suffix)
            with open(infile, "rb") as f:
                # SECURITY: loads() is pickle-based; only restore dump
                # files that were produced by a trusted backup() run.
                rows = loads(f.read(), Base.metadata,
                             self.Session, self.engine)
            records = [row._asdict() for row in rows]
            if records:
                self.session.execute(table_obj.insert(), records)

#    def __enter__(self):
#        return self.session
#
#    def __exit__(self, exc_type, exc_val, exc_tb):
#        self.session.commit()
#        self.engine.dispose()


# Metatable class definition
class Metatable:
    """A class to store metadata as a list of row dicts.

    Attributes:
        data (list of dicts) : one dict per row, keyed by field name

    Methods:
        load_file : append rows from a tab-delimited file to data

        value_grab : values of the given keys, as a list per row

        key_grab : row dicts restricted to the given keys

        key_replace : rename keys in every row (in place)

        unique : unique combinations of values for the given keys
    """

    def __init__(self, meta_path, dictlist=None):
        """Initialize metatable object.

        Parameters:
            meta_path (str) : path to metadata file
                file must be tab-delimited with field names as header

            dictlist (list of dicts) : if not path, list of dicts
                this can convert a list of dicts into the self.data of
                a metatable object (the list is used as is, not copied)
        """
        self.data = []

        if meta_path:
            self.load_file(meta_path)
        elif dictlist:
            self.data = dictlist

    def _check_keys(self, keys):
        """Raise KeyError unless every key is present in the first row.

        Only the first row is inspected; callers must make sure
        self.data is not empty.
        """
        first_row = self.data[0]
        for key in keys:
            if key not in first_row:
                raise KeyError(
                    "Key(s) not present in metatable object."
                )

    def load_file(self, meta_path):
        """Load rows from a metadata file into self.data.

        Rows are appended to any data already present. A file with no
        data rows (empty or header only) leaves self.data unchanged.

        Parameters:
            meta_path (str) : path to metadata file
                file must be tab-delimited with field names as header

        Returns:
            none (rows are stored in self.data)

        Raises:
            FileNotFoundError : if meta_path is not an existing file

            IndexError : if the first row has a single field, i.e. the
                file does not look tab-delimited
        """
        # Check that the metadata file exists
        if not os.path.isfile(meta_path):
            raise FileNotFoundError(
                "Metadata file does not exist at the provided path")

        with open(meta_path, newline="") as metatab:
            full_table = list(csv.DictReader(metatab, delimiter="\t"))

        # Nothing to validate or load when there are no data rows
        if not full_table:
            return

        if len(full_table[0]) == 1:
            raise IndexError(
                "Input must be tab-delimited. Double check input."
            )

        self.data.extend(dict(entry) for entry in full_table)

    def value_grab(self, key_list) -> list:
        """Extract values for specific keys from metatable data.

        Parameters:
            key_list (list) : desired keys from dicts in self.data

        Returns:
            value_list (list of lists) : each entry containing the values
                                         of the given keys

        Raises:
            KeyError : if a key is missing from the first row
        """
        if not self.data:
            return []

        self._check_keys(key_list)

        return [[entry[key] for key in key_list] for entry in self.data]

    def key_grab(self, key_list) -> list:
        """Extract dicts with specific keys from metatable data.

        Parameters:
            key_list (list) : desired keys from dicts in self.data

        Returns:
            dict_list (list of dicts) : each entry containing the dicts
                                        with only the given keys

        Raises:
            KeyError : if a key is missing from the first row
        """
        if not self.data:
            return []

        self._check_keys(key_list)

        return [{key: entry[key] for key in key_list} for entry in self.data]

    def key_replace(self, file_keys, db_keys):
        """Replace file keys with database keys (in place).

        Parameters:
            file_keys (list) : list of keys in file

            db_keys (list) : list of keys in database
                Must be equivalent in length to file_keys

        Returns:
            none (the dicts in self.data are modified in place)

        Raises:
            KeyError : if a file key is missing from the first row

            IndexError : if db_keys is shorter than file_keys
        """
        # Nothing to rename in an empty table
        if not self.data:
            return

        self._check_keys(file_keys)

        # Fail before touching any row so a bad call cannot leave the
        # table half-renamed.
        if len(db_keys) < len(file_keys):
            raise IndexError(
                "db_keys must provide a replacement for every file key."
            )

        for entry in self.data:
            for file_key, db_key in zip(file_keys, db_keys):
                entry[db_key] = entry.pop(file_key)

    def unique(self, extract_keys) -> list:
        """Extract unique combinations of values for specific keys.

        The values are passed through a numpy array, so rows come back
        in numpy's sorted order and the values are numpy scalars.

        Parameters:
            extract_keys (list) : list containing db key labels for binding

        Returns:
            unique_metatable (list of dicts) : each entry contains the values
                                               of the extract keys; only
                                               returns unique sets of values

        Raises:
            KeyError : if a key is missing from the first row
        """
        if not self.data:
            return []

        self._check_keys(extract_keys)

        full_table_list = np.array(self.value_grab(extract_keys))
        unique_list = np.unique(full_table_list, axis=0)

        return [dict(zip(extract_keys, entry)) for entry in unique_list]


# Configuration File Reader
def load_config(filename: str):
    """Parse a configparser-compatible database config file.

    Parameters:
        filename (str) : path to config file

    Returns:
        config (configparser object) : parsed config file

    Raises:
        FileNotFoundError : if nothing exists at the given path
    """
    if os.path.exists(filename):
        parser = configparser.ConfigParser()
        with open(filename) as handle:
            contents = handle.read()
        # Parse from the text that was read rather than from the path
        parser.read_string(contents)
        return parser

    raise FileNotFoundError(
        "Configuration file does not exist at the provided path"
    )


def value_compare(db_row, metatable_row, key_dict) -> bool:
    """Compare values between two dicts.

    Parameters:
        db_row (dict) : dict extracted from one entry in
                        one table of the database

        metatable_row (dict) : dict extracted from a metadata table

        key_dict (dict) : specific keys for comparison; maps each key
                          of db_row to the matching key of metatable_row

    Returns:
        (bool) : True if every compared database value equals its
                 metadata value (also when key_dict is empty), False
                 otherwise. True/False compare equal to the 1/0 values
                 this function used to return.

    Raises:
        KeyError : if a key is missing from either dict
    """
    return all(
        db_row[db_key] == metatable_row[meta_key]
        for db_key, meta_key in key_dict.items()
    )


def listdict_compare(comp_dict, db_dict, db_keys) -> list:
    """Return the rows of comp_dict that are not present in db_dict.

    The values under db_keys are converted to strings in place, in both
    input lists, before the rows are compared. Rows are compared as
    whole dicts.

    Parameters:
        comp_dict (list of dicts) : list of dicts from metatable object

        db_dict (list of dicts) : list of dicts extracted from db query

        db_keys (list) : specific keys for comparison

    Returns:
        data_to_add (list of dicts) : any dicts in comp_dict not in db_dict
    """
    # Stringify the compared fields: first every comp row, then every db row
    for rows in (comp_dict, db_dict):
        for row in rows:
            for field in db_keys:
                row[field] = str(row[field])

    return [row for row in comp_dict if row not in db_dict]


def key_store_compare(comp_dict, db_dict, db_keys, store_keys) -> dict:
    """Copy stored values from matching database rows into a dict.

    A database row matches when, for every key in db_keys, the string
    forms of both values are equal, or the comp_dict value is the empty
    string and the database value is None. For each matching row the
    values under store_keys are copied into comp_dict (in place); if
    several rows match, the last one wins.

    Parameters:
        comp_dict (dict) : single dict from metatable object

        db_dict (list of dicts) : list of dicts extracted from db query

        db_keys (list) : specific keys for comparison

        store_keys (list) : key(s) for adding to dict

    Returns:
        comp_dict (dict) : the same dict object, with the stored values
                           added when a match was found
    """
    for dbentry in db_dict:
        matches = True
        for key in db_keys:
            comp_value = str(comp_dict[key])
            if comp_value == str(dbentry[key]):
                continue
            # An empty metadata field is treated as equal to a NULL
            # database value
            if comp_value == "" and dbentry[key] is None:
                continue
            matches = False
            break
        if matches:
            for stkey in store_keys:
                comp_dict[stkey] = dbentry[stkey]

    return comp_dict


def object_as_dict(obj):
    """Turn one queried ORM entry into a plain dict.

    Parameters:
        obj (mapped object) : single row (entry) of a database query output

    Returns:
        db_dict (dict) : key-value pairs from database entry, one per
                         column attribute of the object's mapper
    """
    column_attrs = sql.inspect(obj).mapper.column_attrs
    db_dict = {}
    for attr in column_attrs:
        db_dict[attr.key] = getattr(obj, attr.key)
    return db_dict


def entry_update(dbtable, dbkeys, comp_table) -> list:
    """Find entries not already in database.

    Queries the whole table through the module-level ``dbconnect``
    connection, keeps only the comparison keys and returns the rows of
    comp_table that have no equal row in the database.

    Parameters:
        dbtable (str) : Which db table to search for entries

        dbkeys (list) : list of keys to use for comparison

        comp_table (list of dicts) : Entries to match (or not)
                                     to db entries

    Returns:
        to_add (list of dicts) : New entries not in db to add
    """
    current_rows = dbconnect.reflect_table(dbtable)
    current_keyed = Metatable(meta_path=None,
                              dictlist=current_rows).key_grab(dbkeys)
    return listdict_compare(comp_table, current_keyed, dbkeys)


def scrape_fastqc(paper_id, sample_name, data_path, db_sample) -> dict:
    """Scrape read length and depth from fastQC reports.

    Reads ``fastqc_data.txt`` straight out of the zipped reports in
    ``<data_path>qc/fastqc/zips/`` (nothing is extracted to disk). For
    paired-end samples the ``_1`` report is used; for reverse
    complemented samples the trimmed report carries a ``.flip`` infix.

    Parameters:
        paper_id (str) : paper identifier (currently unused)

        sample_name (str) : sample name

        data_path (str) : path to database storage directory,
            including the trailing path separator

        db_sample (dict) : sample_accum entry dict from db query;
            the "single_paired" and "rcomp" values are read

    Returns:
        fastqc_dict (dict) : scraped fastqc metadata in dict format
            "raw_read_depth", "raw_read_length" : from the raw report
            "trim_read_depth" : from the trimmed report
            All three are None if the raw report is missing; only
            "trim_read_depth" is None if the trimmed report is missing.
    """
    fastqc_dict = {}

    # Determine report names, depending on SE/PE
    fqc_path = data_path + "qc/fastqc/zips/"
    mate = "_1" if db_sample["single_paired"] == "paired" else ""
    raw_report = fqc_path + sample_name + mate + "_fastqc"

    # If fastQC files don't exist, return null values
    if not os.path.exists(raw_report + ".zip"):
        fastqc_dict["raw_read_depth"] = None
        fastqc_dict["raw_read_length"] = None
        fastqc_dict["trim_read_depth"] = None
        return fastqc_dict

    # Extract raw depth and read length
    for line in _fastqc_data_lines(raw_report):
        if "Total Sequences" in line:
            fastqc_dict["raw_read_depth"] = int(line.split()[2])
        if "Sequence length" in line:
            # A length range such as "35-76" is reduced to its lower bound
            fastqc_dict["raw_read_length"] = int(
                line.split()[2].split("-")[0])

    # Trimmed report name depends on whether reads were reverse
    # complemented. The flag may arrive as 1/"1" (raw SQL query) or as
    # a boolean True (ORM query).
    flip = ".flip" if str(db_sample["rcomp"]) in ("1", "True") else ""
    trim_report = fqc_path + sample_name + mate + flip + ".trim_fastqc"

    # If trimmed fastQC report doesn't exist, return null value for
    # trimmed read depth
    if not os.path.exists(trim_report + ".zip"):
        fastqc_dict["trim_read_depth"] = None
        return fastqc_dict

    # Extract trimmed read depth
    for line in _fastqc_data_lines(trim_report):
        if "Total Sequences" in line:
            fastqc_dict["trim_read_depth"] = int(line.split()[2])

    return fastqc_dict


def _fastqc_data_lines(report_base):
    """Return the lines of fastqc_data.txt from ``<report_base>.zip``.

    The archive is expected to hold a top-level directory named like
    the zip file (without ".zip") that contains fastqc_data.txt.
    """
    member = os.path.basename(report_base) + "/fastqc_data.txt"
    with zp.ZipFile(report_base + ".zip", "r") as zp_ref:
        text = zp_ref.read(member).decode("utf-8", errors="replace")
    return text.splitlines()


def scrape_picard(paper_id, sample_name, data_path):
    """Scrape the duplication value from a picard duplication report.

    Reads ``<data_path>qc/picard/dups/<sample_name>.marked_dup_metrics.txt``
    and takes the ninth tab-separated field of the "Unknown Library"
    line, rounded to 5 decimal places.
    NOTE(review): presumably that field is picard's PERCENT_DUPLICATION
    column -- confirm against the picard version in use.

    Parameters:
        paper_id (str) : paper identifier (currently unused)

        sample_name (str) : sample name derived from db query

        data_path (str) : path to database storage directory,
            including the trailing path separator

    Returns:
        picard_dict (dict) : scraped picard metadata in dict format
            "duplication_picard" is None if the report does not exist;
            the key is absent if the report has no "Unknown Library" line
    """
    picard_dict = {}

    dirpath = data_path + "qc/picard/dups/"
    filepath = dirpath + sample_name + ".marked_dup_metrics.txt"

    # If picardtools data doesn't exist, return null value
    if not os.path.isfile(filepath):
        picard_dict["duplication_picard"] = None
        return picard_dict

    # Extract duplication data; the context manager closes the report
    with open(filepath) as fdata:
        for line in fdata:
            if "Unknown Library" in line:
                dup = float(line.split("\t")[8])
                picard_dict["duplication_picard"] = round(dup, 5)

    return picard_dict


def scrape_mapstats(paper_id, sample_name, data_path, db_sample):
    """Scrape mapped read counts from a hisat2 mapstats report.

    Reads ``<data_path>qc/hisat2_mapstats/<sample_name>.hisat2_mapstats.txt``.
    For paired-end samples each concordantly aligned pair counts as two
    reads and is added to the unpaired "Aligned 1 time" /
    "Aligned >1 times" counts; discordantly aligned pairs are not
    counted. For single-end samples the "Aligned ..." counts are used
    directly.

    Parameters:
        paper_id (str) : paper identifier (currently unused)

        sample_name (str) : sample name derived from db query

        data_path (str) : path to database storage directory,
            including the trailing path separator

        db_sample (dict) : sample_accum entry dict from db query;
            the "single_paired" value is read

    Returns:
        mapstats_dict (dict) : scraped hisat2 metadata in dict format
            "single_map" : reads mapped exactly once
            "multi_map" : reads mapped more than once
            "map_prop" : overall alignment rate as a proportion,
                         rounded to 5 decimal places
            All three are None if the report does not exist.
    """
    mapstats_dict = {}

    dirpath = data_path + "qc/hisat2_mapstats/"
    filepath = dirpath + sample_name + ".hisat2_mapstats.txt"

    # If hisat mapping data doesn't exist, return null values
    if not os.path.isfile(filepath):
        mapstats_dict["single_map"] = None
        mapstats_dict["multi_map"] = None
        mapstats_dict["map_prop"] = None
        return mapstats_dict

    paired = db_sample["single_paired"] == "paired"

    # Concordant pair contributions are kept in separate variables: the
    # report lists both concordant lines before the unpaired lines, so
    # a single shared variable would be overwritten before it is used.
    concordant_single = 0
    concordant_multi = 0

    with open(filepath) as fdata:
        for line in fdata:
            if paired and "concordantly 1 time" in line:
                concordant_single = _mapstats_count(line) * 2
            elif paired and "concordantly >1 times" in line:
                concordant_multi = _mapstats_count(line) * 2
            elif "Aligned 1 time" in line:
                mapstats_dict["single_map"] = (concordant_single
                                               + _mapstats_count(line))
            elif "Aligned >1 times" in line:
                mapstats_dict["multi_map"] = (concordant_multi
                                              + _mapstats_count(line))
            elif "Overall alignment rate" in line:
                alrate = float(line.split(": ")[1].split("%")[0]) / 100
                mapstats_dict["map_prop"] = round(alrate, 5)

    return mapstats_dict


def _mapstats_count(line):
    """Return the integer count from a "<label>: <count> (<pct>%)" line."""
    return int(line.split(": ")[1].split(" (")[0])


def scrape_rseqc(paper_id, sample_name, data_path):
    """Scrape read distribution counts from an RSeQC report.

    Parameters:
        paper_id (str) : paper identifier (not used in this function)

        sample_name (str) : sample name derived from db query

        data_path (str) : path to database storage directory; joined to
                          the report location by plain concatenation

    Returns:
        rseqc_dict (dict) : scraped RSeQC metadata in dict format; every
                            value that could not be scraped (missing
                            report or missing row) is None
    """
    # Start from all-null values so that a missing report, or a report
    # lacking one of the expected rows, never leaves a key undefined
    rseqc_dict = dict.fromkeys((
        "rseqc_tags",
        "rseqc_cds",
        "cds_rpk",
        "rseqc_five_utr",
        "rseqc_three_utr",
        "rseqc_intron",
        "intron_rpk",
        "exint_ratio",
    ))

    dirpath = data_path + "qc/rseqc/read_distribution/"
    filepath = dirpath + sample_name + ".read_distribution.txt"

    # If rseqc read distribution data doesn't exist, return null values
    if not os.path.isfile(filepath):
        return rseqc_dict

    # Extract RSeQC data; the context manager closes the report handle
    with open(filepath) as fdata:
        for line in fdata:
            fields = line.split()
            if "Total Assigned Tags" in line:
                rseqc_dict["rseqc_tags"] = int(fields[-1])
            if "CDS_Exons" in line:
                # Third column is the count, last column the per-kb rate
                rseqc_dict["rseqc_cds"] = int(fields[2])
                rseqc_dict["cds_rpk"] = round(float(fields[-1]), 5)
            if "5'UTR_Exons" in line:
                rseqc_dict["rseqc_five_utr"] = int(fields[2])
            if "3'UTR_Exons" in line:
                rseqc_dict["rseqc_three_utr"] = int(fields[2])
            if "Introns" in line:
                rseqc_dict["rseqc_intron"] = int(fields[2])
                rseqc_dict["intron_rpk"] = round(float(fields[-1]), 5)

    # The exon/intron ratio needs both rates and a non-zero denominator
    cds_rpk = rseqc_dict["cds_rpk"]
    intron_rpk = rseqc_dict["intron_rpk"]
    if cds_rpk is not None and intron_rpk is not None and intron_rpk > 0:
        rseqc_dict["exint_ratio"] = round(cds_rpk / intron_rpk, 5)

    return rseqc_dict


def scrape_preseq(paper_id, sample_name, data_path):
    """Scrape library complexity from a preseq extrapolation report.

    Parameters:
        paper_id (str) : paper identifier (not used in this function)

        sample_name (str) : sample name derived from db query

        data_path (str) : path to database storage directory; joined to
                          the report location by plain concatenation

    Returns:
        preseq_dict (dict) : scraped preseq metadata in dict format;
                             "distinct_tenmillion_prop" is None when the
                             report or its 10000000.0 row is missing
    """
    preseq_dict = {"distinct_tenmillion_prop": None}

    dirpath = data_path + "qc/preseq/"
    filepath = dirpath + sample_name + ".lc_extrap.txt"

    # If preseq complexity data doesn't exist, return null value
    if not os.path.isfile(filepath):
        return preseq_dict

    # Second column of the 10000000.0 row, as a fraction of ten million.
    # If the row appears more than once the last occurrence is used.
    distinct = None
    with open(filepath) as fdata:
        for line in fdata:
            if line.startswith("10000000.0"):
                distinct = float(line.split()[1]) / 10000000

    # A report without that row yields a null value instead of failing
    # on an unassigned variable
    if distinct is not None:
        preseq_dict["distinct_tenmillion_prop"] = round(distinct, 5)

    return preseq_dict


def scrape_pileup(paper_id, sample_name, data_path):
    """Scrape genome coverage figures from a pileup coverage report.

    Parameters:
        paper_id (str) : paper identifier (not used in this function)

        sample_name (str) : sample name derived from db query

        data_path (str) : path to database storage directory; joined to
                          the report location by plain concatenation

    Returns:
        pileup_dict (dict) : scraped pileup metadata in dict format;
                             both values are None when the report is
                             missing or contains no data rows
    """
    pileup_dict = {"genome_prop_cov": None, "avg_fold_cov": None}

    dirpath = data_path + "qc/pileup/"
    filepath = dirpath + sample_name + ".coverage.stats.txt"

    # If pileup coverage data doesn't exist, return null values
    if not os.path.isfile(filepath):
        return pileup_dict

    # Accumulate, over all data rows of the tab-separated report:
    #   total : sum of column 3
    #   cov   : sum of column 6
    #   fold  : sum of column 2 weighted by column 3
    total = cov = fold = 0
    with open(filepath) as fdata:
        # The first line is a header row
        next(fdata, None)
        for line in fdata:
            # Tolerate blank lines (e.g. a trailing newline)
            if not line.strip():
                continue
            fields = line.split("\t")
            row_total = int(fields[2])
            total = total + row_total
            cov = cov + int(fields[5])
            fold = fold + float(fields[1]) * row_total

    # A header-only report has nothing to divide by; keep null values
    if total > 0:
        pileup_dict["genome_prop_cov"] = round((cov / total), 5)
        pileup_dict["avg_fold_cov"] = round((fold / total), 5)

    return pileup_dict


def sample_qc_calc(db_sample):
    """Score one sample for QC and for data quality.

    Both scores run from 1 to 5, where 1 is assigned when no threshold
    is tripped and 5 when the strictest threshold is tripped; 0 means
    that a required input value was missing.

    Parameters:
        db_sample (dict) : sample_accum entry dict from db query

    Returns:
        samp_score (dict) : "samp_qc_score" and "samp_data_score"
    """
    depth = db_sample["trim_read_depth"]
    duplication = db_sample["duplication_picard"]
    map_prop = db_sample["map_prop"]
    distinct_prop = db_sample["distinct_tenmillion_prop"]
    coverage = db_sample["genome_prop_cov"]
    exint_ratio = db_sample["exint_ratio"]

    # (score, max depth, min duplication, max mapped reads, min distinct)
    # checked top to bottom; the first row with any tripped limit wins
    qc_tiers = (
        (5, 5000000, 0.95, 4000000, 0.05),
        (4, 10000000, 0.80, 8000000, 0.2),
        (3, 15000000, 0.65, 12000000, 0.35),
        (2, 20000000, 0.5, 16000000, 0.5),
    )
    # (score, max genome coverage, min exon/intron ratio)
    data_tiers = (
        (5, 0.04, 9),
        (4, 0.08, 7),
        (3, 0.12, 5),
        (2, 0.16, 3),
    )

    # Sample QC score
    if any(value is None
           for value in (depth, duplication, map_prop, distinct_prop)):
        qc_score = 0
    else:
        qc_score = 1
        for score, max_depth, min_dup, max_mapped, min_distinct in qc_tiers:
            if (depth <= max_depth
                    or duplication >= min_dup
                    or (map_prop * depth) <= max_mapped
                    or distinct_prop < min_distinct):
                qc_score = score
                break

    # Sample data score
    if coverage is None or exint_ratio is None:
        data_score = 0
    else:
        data_score = 1
        for score, max_coverage, min_ratio in data_tiers:
            if coverage <= max_coverage or exint_ratio >= min_ratio:
                data_score = score
                break

    return {"samp_qc_score": qc_score, "samp_data_score": data_score}


def paper_qc_calc(db_samples):
    """Reduce per-sample scores to paper-level median scores.

    Parameters:
        db_samples (list of dicts) : sample_accum entries from db query;
                                     each must carry "samp_qc_score" and
                                     "samp_data_score" values that int()
                                     accepts

    Returns:
        paper_scores (dict) : "paper_qc_score" and "paper_data_score",
                              the medians of the per-sample scores
    """
    # Convert both scores of a sample together, sample by sample
    score_pairs = [
        (int(sample["samp_qc_score"]), int(sample["samp_data_score"]))
        for sample in db_samples
    ]

    return {
        "paper_qc_score": median([qc for qc, _ in score_pairs]),
        "paper_data_score": median([data for _, data in score_pairs]),
    }


def add_version_info(paper_id, data_path, vertype, dbver_keys):
    """Collect nascentflow/bidirflow version info for a paper's samples.

    Reads the paper's rows from the linkIDs table through the
    module-level ``dbconnect`` and, for each sample, looks for a file
    named <sample_name>_<vertype>.yaml directly under data_path.

    Parameters:
        paper_id (str) : paper identifier

        data_path (str) : path to dbnascent version data; joined to the
                          file name by plain concatenation

        vertype (str) : {"nascent", "bidir"} : Which nextflow type

        dbver_keys (list) : list of keys for version tables

    Returns:
        ver_table (list of dicts) : one dict per YAML document found
                                    for a sample, or a single dict with
                                    every version key set to None when
                                    the sample has no version file
    """
    ver_table = []

    for link in dbconnect.reflect_table("linkIDs", {"paper_id": paper_id}):
        # These two ids are stripped from the link row before it is used
        # as the base of a version row
        del link["genetic_id"]
        del link["expt_id"]

        ver_path = "".join(
            (data_path, link["sample_name"], "_", vertype, ".yaml")
        )

        if os.path.isfile(ver_path):
            with open(ver_path) as handle:
                # Each YAML document in the file becomes its own row
                for run in yaml.safe_load_all(handle):
                    row = dict(link)
                    row.update(run)
                    # Keys the document did not provide are stored null
                    for key in dbver_keys:
                        row.setdefault(key, None)
                    ver_table.append(row)
        else:
            # No version file: keep the sample with all-null version keys
            link.update(dict.fromkeys(dbver_keys))
            ver_table.append(link)

    return ver_table


def dbnascent_backup(db, basedir=None, tables=None):
    """Create new database backup in a fresh timestamped directory.

    Parameters:
        db (dbnascentConnection object) : current database connection

        basedir (str) : path to base backup directory
                        default /home/lsanford/Documents/data/dbnascent_backups

        tables (list) : list of specific tables if whole db backup
                        is not desired; default is a whole db backup

    Returns:
        none

    Raises:
        FileExistsError : if the timestamped directory already exists
                          (a second backup within the same second)
    """
    if not basedir:
        basedir = "/home/lsanford/Documents/data/dbnascent_backups"

    # One directory per backup, named by local time to the second
    nowdir = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    backup_dir = basedir + "/" + nowdir
    os.makedirs(backup_dir)

    # Only pass a table list when one was actually given
    if tables:
        db.backup(backup_dir, tables)
    else:
        db.backup(backup_dir)


def paper_add_update(db, config, identifier, basedir):
    """Add or update paper and associated sample metadata.

    Only the experiment-metadata insert is implemented so far; sample
    ids are not handled yet.

    Parameters:
        db (dbnascentConnection object) : current database connection

        config (configParser object) : parsed config file

        identifier (str) : paper identifier, used to locate all (meta)data

        basedir (str) : path to base database data directory
                        default /Shares/dbnascent

    Returns:
        none
    """
    # Metadata-table keys for the experiment columns
    expt_keys = list(dict(config["expt keys"]).values())

    # Experiment metadata lives at <basedir>/<identifier>/metadata/
    basedir = basedir or "/Shares/dbnascent"
    exptmeta_file = "/".join(
        (basedir, identifier, "metadata/expt_metadata.txt")
    )

    # Read in expt metadata and make sure entries are unique
    exptmeta = utils.Metatable(exptmeta_file)
    expt_unique = exptmeta.unique(expt_keys)

    # Add expt metadata to database
    # NOTE(review): other code in this file passes the result of
    # Metatable.unique() straight to insert() as the row list and
    # renames keys with key_replace() first; confirm that calling
    # .data() on the result is valid here.
    insert_stmt = exptMetadata.__table__.insert()
    db.engine.execute(insert_stmt, expt_unique.data())

    # TODO: add sample ids


#engine.execute(tablename.__table__.insert(),listofdicts)
#
# dbutils.py ends here

### Load config file

In [33]:
# Load the build configuration. Later cells read the "file_locations"
# section and the various "... keys" sections from this object.
# Script form of the same call:
# config = dbutils.load_config("/home/lsanford/Documents/data/repositories/dbnascent_build/config.txt")
config = load_config(
    "/home/lsanford/Documents/data/repositories/DBNascent-build/config.txt"
)

### Define database location and (optionally) back up database

In [34]:
# Database location and credentials both come from the "file_locations"
# section of the config
db_url = config["file_locations"]["database"]
creds = config["file_locations"]["credentials"]

# Open the connection that the following cells use (the commented line
# is the script form of the same call)
#dbconnect = utils.dbnascentConnection(db_url, creds)
dbconnect = dbnascentConnection(db_url, creds)
# Optional backup before modifying the database; disabled here
#utils.dbnascent_backup(dbconnect)

### Add/update organism table

In [35]:
# Fresh connection for this cell
dbconnect = dbnascentConnection(db_url, creds)

# The "organism keys" section supplies both key lists: its values are
# the keys to be replaced, its keys are the replacements
_org_section = dict(config["organism keys"])
org_keys = list(_org_section.values())
dborg_keys = list(_org_section)
orgtable_path = config["file_locations"]["organism_table"]

# Load the organism table, switch to the replacement keys and
# deduplicate on them
orgs = Metatable(orgtable_path)
orgs.key_replace(org_keys, dborg_keys)
orgs_unique = orgs.unique(dborg_keys)

# Insert whatever entry_update returns for the organismInfo table
orgs_to_add = entry_update("organismInfo", dborg_keys, orgs_unique)
if len(orgs_to_add):
    dbconnect.engine.execute(organismInfo.__table__.insert(), orgs_to_add)

### Parse paper and sample metadata tables

In [36]:
# Fresh connection for this cell
dbconnect = dbnascentConnection(db_url, creds)

# Key lists: section values are the keys to be replaced, section keys
# are the replacements
_expt_section = dict(config["expt keys"])
expt_keys = list(_expt_section.values())
dbexpt_keys = list(_expt_section)
_meta_dir = (str(config["file_locations"]["db_data"])
             + str(paper_id) + "/metadata/")
exptmeta_path = _meta_dir + "expt_metadata.txt"
_samp_section = dict(config["sample keys"])
samp_keys = list(_samp_section.values())
dbsamp_keys = list(_samp_section)
sampmeta_path = _meta_dir + "sample_metadata.txt"
_genetic_section = dict(config["genetic keys"])
genetic_keys = list(_genetic_section.values())
dbgenetic_keys = list(_genetic_section)

# Experimental metadata
expt = Metatable(exptmeta_path)

# Sample metadata; every sample row also receives the fields of the
# first experiment row, and an empty "srz" falls back to "srr"
samp = Metatable(sampmeta_path)
for entry in samp.data:
    entry.update(expt.data[0])
    entry["srz"] = entry["srz"] or entry["srr"]

for _old_keys, _new_keys in ((samp_keys, dbsamp_keys),
                             (expt_keys, dbexpt_keys),
                             (genetic_keys, dbgenetic_keys)):
    samp.key_replace(_old_keys, _new_keys)

expt_unique = samp.unique(dbexpt_keys)
samp_unique = samp.unique(dbsamp_keys)
gene_unique = samp.unique(dbgenetic_keys)

# Experiment rows: '1'/'0' strings become booleans before the insert
expt_to_add = entry_update("exptMetadata", dbexpt_keys, expt_unique)
for entry in expt_to_add:
    for key, value in entry.items():
        if value == '1':
            entry[key] = True
        elif value == '0':
            entry[key] = False

if len(expt_to_add):
    dbconnect.engine.execute(exptMetadata.__table__.insert(), expt_to_add)

# Highest sample_id currently stored (0 when the table is empty)
dbsamp_dump = dbconnect.reflect_table("sampleID")
dbsamp = Metatable(meta_path=None, dictlist=dbsamp_dump)
curr_id = max([0, *(entry["sample_id"] for entry in dbsamp.data)])

samp_to_add = entry_update("sampleID", dbsamp_keys, samp_unique)

if len(samp_to_add):
    # One new id per distinct sample_name, counting up from curr_id
    samps_meta = Metatable(meta_path=None, dictlist=samp_to_add)
    samp_id_hash = samps_meta.unique(["sample_name"])
    for entry in samp_id_hash:
        curr_id += 1
        entry["sample_id"] = curr_id
    # Every row to insert takes the id of the first entry with its name
    for entry in samp_to_add:
        hash_entry = [named for named in samp_id_hash
                      if named["sample_name"] == entry["sample_name"]][0]
        entry["sample_id"] = hash_entry["sample_id"]

    dbconnect.engine.execute(sampleID.__table__.insert(), samp_to_add)

# Genetic info rows
gene_to_add = entry_update("geneticInfo", dbgenetic_keys, gene_unique)

if len(gene_to_add):
    dbconnect.engine.execute(geneticInfo.__table__.insert(), gene_to_add)

### Make linkIDs table

In [37]:
# Fresh connection for this cell
dbconnect = dbnascentConnection(db_url, creds)

link_keys = ["sample_id", "genetic_id", "expt_id",
             "sample_name", "paper_id"]

# Current database contents of the three tables that own the ids
dbsamp_dump = dbconnect.reflect_table("sampleID")
dbexpt_dump = dbconnect.reflect_table("exptMetadata")
dbgene_dump = dbconnect.reflect_table("geneticInfo")

# (table dump, keys to compare on, id key to store), in call order
_id_sources = (
    (dbsamp_dump, dbsamp_keys, "sample_id"),
    (dbexpt_dump, dbexpt_keys, "expt_id"),
    (dbgene_dump, dbgenetic_keys, "genetic_id"),
)
for entry in samp.data:
    for _dump, _compare_keys, _id_name in _id_sources:
        key_store_compare(entry, _dump, _compare_keys, [_id_name])

# Deduplicate on the link keys and insert what entry_update returns
link_unique = samp.unique(link_keys)
link_to_add = entry_update("linkIDs", link_keys, link_unique)

if len(link_to_add):
    dbconnect.engine.execute(linkIDs.__table__.insert(), link_to_add)

### Add condition info and build condition table

In [38]:
# Fresh connection for this cell
dbconnect = dbnascentConnection(db_url, creds)

_meta_cond_section = dict(config["metatable condition keys"])
cond_keys = list(_meta_cond_section.values())
dbcond_keys = [*_meta_cond_section, "sample_name"]
_cond_section = dict(config["condition keys"])
cond_full_keys = list(_cond_section.values())
dbcond_full_keys = list(_cond_section)

samp.key_replace(cond_keys, dbcond_keys)
cond = samp.key_grab(dbcond_keys)

# For each input time unit: (divisor, larger unit) pairs, tried from the
# largest unit down. The first divisor that divides the duration evenly
# converts it; if none does, the input unit is kept. Any unit not
# listed here is reported as "day" with the duration unchanged.
_DURATION_STEPS = {
    "s": ((86400, "day"), (3600, "hr"), (60, "min")),
    "min": ((1440, "day"), (60, "hr")),
    "hr": ((24, "day"),),
}

cond_parsed = []
for entry in cond:
    # Samples without a treatment get a single placeholder row
    if not entry["treatment"]:
        cond_parsed.append({
            "sample_name": entry["sample_name"],
            "condition_type": "no treatment",
            "treatment": "",
            "conc_intens": "",
            "start_time": "",
            "end_time": "",
            "time_unit": "",
            "duration": "",
            "duration_unit": "",
        })
        continue

    # Multiple conditions are ";"-separated, in matching positions
    cond_types = entry["condition_type"].split(";")
    treatments = entry["treatment"].split(";")
    times = entry["times"].split(";")
    for i, cond_type in enumerate(cond_types):
        # treatment is "name" or "name(amount)"; times is "start,end,unit"
        tx = treatments[i].split("(")
        tm = times[i].split(",")
        new_entry = {
            "sample_name": entry["sample_name"],
            "condition_type": cond_type,
            "treatment": tx[0],
            "conc_intens": tx[1].split(")")[0] if len(tx) > 1 else "",
            "start_time": int(tm[0]),
            "end_time": int(tm[1]),
            "time_unit": tm[2],
        }

        # Calculate duration and units
        duration = new_entry["end_time"] - new_entry["start_time"]
        duration_unit = tm[2] if tm[2] in _DURATION_STEPS else "day"
        for _divisor, _larger_unit in _DURATION_STEPS.get(tm[2], ()):
            if duration % _divisor == 0:
                duration = duration / _divisor
                duration_unit = _larger_unit
                break

        new_entry["duration"] = int(duration)
        new_entry["duration_unit"] = duration_unit

        cond_parsed.append(new_entry)

# Deduplicate; empty numeric fields become nulls
cond = Metatable(meta_path=None, dictlist=cond_parsed)
cond_unique = cond.unique(dbcond_full_keys)
for entry in cond_unique:
    for _field in ("start_time", "end_time", "duration"):
        if entry[_field] == "":
            entry[_field] = None

cond_to_add = entry_update("conditionInfo", dbcond_full_keys, cond_unique)

if len(cond_to_add):
    # The literal string "None" is also turned into a null before insert
    for entry in cond_to_add:
        for _field in ("start_time", "end_time", "duration"):
            if entry[_field] == "None":
                entry[_field] = None
    dbconnect.engine.execute(conditionInfo.__table__.insert(), cond_to_add)

### Make condition match table

In [39]:
# Fresh connection for this cell
dbconnect = dbnascentConnection(db_url, creds)

dbcond_add_keys = [*dict(config["condition keys"]), "condition_id"]

# Conditions currently in the database, limited to the condition keys
# plus the id
dbcond_dump = dbconnect.reflect_table("conditionInfo")
dbcond = Metatable(meta_path=None, dictlist=dbcond_dump)
dbcond_data = dbcond.key_grab(dbcond_add_keys)

# sample_name / sample_id pairs currently in the database
dbsamp_dump = dbconnect.reflect_table("sampleID")
dbsamp = Metatable(meta_path=None, dictlist=dbsamp_dump)
name_id = dbsamp.unique(["sample_name", "sample_id"])

for entry in cond_parsed:
    # Attach the sample_id stored for this sample_name; with several
    # matches the last one is kept, with none the entry is left as is
    _matching_ids = [eq["sample_id"] for eq in name_id
                     if entry["sample_name"] == eq["sample_name"]]
    if _matching_ids:
        entry["sample_id"] = _matching_ids[-1]
    key_store_compare(entry, dbcond_data,
                      dbcond_full_keys, ["condition_id"])

# Deduplicate the sample/condition pairs and insert what entry_update
# returns
_pair_keys = ["sample_id", "condition_id"]
exptcond = Metatable(meta_path=None, dictlist=cond_parsed)
exptcond_unique = exptcond.unique(_pair_keys)

exptcond_to_add = entry_update("exptCondition",
                               ["sample_id", "condition_id"],
                               exptcond_unique)

if len(exptcond_to_add):
    dbconnect.engine.execute(exptCondition.insert(), exptcond_to_add)

### Make sampleAccum table

In [40]:
# Fresh connection for this cell
dbconnect = dbnascentConnection(db_url, creds)

# Load data location and keys
data_path = config["file_locations"]["data"]

accum_keys = list(dict(config["metatable accum keys"]).values())
dbaccum_keys = list(dict(config["metatable accum keys"]).keys())
dbaccum_full_keys = list(dict(config["accum keys"]).keys())

samp.key_replace(accum_keys, dbaccum_keys)

# Scrape every QC report for each sample, merge the results into the
# sample entry, then score the sample from the merged values
for entry in samp.data:
    fastqc_dict = scrape_fastqc(entry["paper_id"],
                                entry["sample_name"],
                                data_path,
                                entry)
    pic_dict = scrape_picard(entry["paper_id"],
                             entry["sample_name"],
                             data_path)
    mapstats_dict = scrape_mapstats(entry["paper_id"],
                                    entry["sample_name"],
                                    data_path,
                                    entry)
    rseqc_dict = scrape_rseqc(entry["paper_id"],
                              entry["sample_name"],
                              data_path)
    preseq_dict = scrape_preseq(entry["paper_id"],
                                entry["sample_name"],
                                data_path)
    pileup_dict = scrape_pileup(entry["paper_id"],
                                entry["sample_name"],
                                data_path)
    entry.update(fastqc_dict)
    entry.update(pic_dict)
    entry.update(mapstats_dict)
    entry.update(rseqc_dict)
    entry.update(preseq_dict)
    entry.update(pileup_dict)
    score_dict = sample_qc_calc(entry)
    entry.update(score_dict)
    # Keep only the first run of digits of the replicate label
    rep_num = re.split(r'(\d+)', entry["replicate"])
    entry["replicate"] = rep_num[1]

# Everything is stringified before deduplication and comparison
for entry in samp.data:
    for key in entry:
        entry[key] = str(entry[key])
accum_unique = samp.unique(dbaccum_full_keys)
accum_to_add = entry_update("sampleAccum", dbaccum_full_keys, accum_unique)

if len(accum_to_add) > 0:
    # Undo the stringification on the rows that are actually inserted:
    # flag columns back to booleans, "None" back to null. This loop runs
    # over accum_to_add (the insert payload) so the conversion holds
    # even if entry_update returns copies rather than the same dicts.
    for entry in accum_to_add:
        for key in ["rcomp", "expt_unusable", "timecourse"]:
            if entry[key] == '1':
                entry[key] = True
            elif entry[key] == '0':
                entry[key] = False
        for key in entry:
            if entry[key] == 'None':
                entry[key] = None
    # NOTE(review): the recorded run of this cell ended in a
    # duplicate-primary-key IntegrityError on this insert, which
    # suggests entry_update did not filter out rows already present in
    # sampleAccum -- verify its comparison logic.
    dbconnect.engine.execute(sampleAccum.__table__.insert(), accum_to_add)

IntegrityError: (pymysql.err.IntegrityError) (1062, "Duplicate entry '13' for key 'PRIMARY'")
[SQL: INSERT INTO `sampleAccum` (sample_id, replicate, single_paired, rcomp, expt_unusable, timecourse, baseline_control_expt, notes, raw_read_depth, trim_read_depth, raw_read_length, duplication_picard, single_map, multi_map, map_prop, rseqc_tags, rseqc_cds, rseqc_five_utr, rseqc_three_utr, rseqc_intron, cds_rpk, intron_rpk, exint_ratio, distinct_tenmillion_prop, genome_prop_cov, avg_fold_cov, samp_qc_score, samp_data_score) VALUES (%(sample_id)s, %(replicate)s, %(single_paired)s, %(rcomp)s, %(expt_unusable)s, %(timecourse)s, %(baseline_control_expt)s, %(notes)s, %(raw_read_depth)s, %(trim_read_depth)s, %(raw_read_length)s, %(duplication_picard)s, %(single_map)s, %(multi_map)s, %(map_prop)s, %(rseqc_tags)s, %(rseqc_cds)s, %(rseqc_five_utr)s, %(rseqc_three_utr)s, %(rseqc_intron)s, %(cds_rpk)s, %(intron_rpk)s, %(exint_ratio)s, %(distinct_tenmillion_prop)s, %(genome_prop_cov)s, %(avg_fold_cov)s, %(samp_qc_score)s, %(samp_data_score)s)]
[parameters: ({'sample_id': '13', 'replicate': '1', 'single_paired': 'single', 'rcomp': 1, 'expt_unusable': 0, 'timecourse': 1, 'baseline_control_expt': 'control', 'notes': '', 'raw_read_depth': '56094956', 'trim_read_depth': '36189411', 'raw_read_length': '75', 'duplication_picard': '0.39838', 'single_map': '26645914', 'multi_map': '2995616', 'map_prop': '0.8191', 'rseqc_tags': '25639716', 'rseqc_cds': '1653249', 'rseqc_five_utr': '2577663', 'rseqc_three_utr': '1589094', 'rseqc_intron': '15608076', 'cds_rpk': '42.94', 'intron_rpk': '11.61', 'exint_ratio': '3.69854', 'distinct_tenmillion_prop': '0.69112', 'genome_prop_cov': '0.11552', 'avg_fold_cov': '0.5171', 'samp_qc_score': '1', 'samp_data_score': '3'}, {'sample_id': '14', 'replicate': '2', 'single_paired': 'single', 'rcomp': 1, 'expt_unusable': 0, 'timecourse': 1, 'baseline_control_expt': 'control', 'notes': '', 'raw_read_depth': '44129264', 'trim_read_depth': '18573270', 'raw_read_length': '75', 'duplication_picard': '0.32614', 'single_map': '13446032', 'multi_map': '1540694', 'map_prop': '0.8069', 'rseqc_tags': '12987278', 'rseqc_cds': '687701', 'rseqc_five_utr': '1054746', 'rseqc_three_utr': '823928', 'rseqc_intron': '8328891', 'cds_rpk': '17.86', 'intron_rpk': '6.2', 'exint_ratio': '2.88065', 'distinct_tenmillion_prop': '0.69283', 'genome_prop_cov': '0.07505', 'avg_fold_cov': '0.22618', 'samp_qc_score': '2', 'samp_data_score': '4'}, {'sample_id': '15', 'replicate': '1', 'single_paired': 'single', 'rcomp': 1, 'expt_unusable': 0, 'timecourse': 1, 'baseline_control_expt': 'experimental', 'notes': '', 'raw_read_depth': '84710660', 'trim_read_depth': '41087789', 'raw_read_length': '75', 'duplication_picard': '0.43535', 'single_map': '29047411', 'multi_map': '3393104', 'map_prop': '0.7895', 'rseqc_tags': '28050940', 'rseqc_cds': '1797694', 'rseqc_five_utr': '2492047', 'rseqc_three_utr': '1759399', 'rseqc_intron': '17424953', 'cds_rpk': '46.69', 'intron_rpk': '12.96', 'exint_ratio': '3.60262', 
'distinct_tenmillion_prop': '0.68799', 'genome_prop_cov': '0.11505', 'avg_fold_cov': '0.50455', 'samp_qc_score': '1', 'samp_data_score': '3'}, {'sample_id': '16', 'replicate': '2', 'single_paired': 'single', 'rcomp': 1, 'expt_unusable': 0, 'timecourse': 1, 'baseline_control_expt': 'experimental', 'notes': '', 'raw_read_depth': '59525596', 'trim_read_depth': '39172480', 'raw_read_length': '75', 'duplication_picard': '0.38753', 'single_map': '29129825', 'multi_map': '3122930', 'map_prop': '0.8234', 'rseqc_tags': '27916474', 'rseqc_cds': '1717499', 'rseqc_five_utr': '2541045', 'rseqc_three_utr': '1731260', 'rseqc_intron': '17312138', 'cds_rpk': '44.61', 'intron_rpk': '12.88', 'exint_ratio': '3.46351', 'distinct_tenmillion_prop': '0.71163', 'genome_prop_cov': '0.12578', 'avg_fold_cov': '0.565', 'samp_qc_score': '1', 'samp_data_score': '2'}, {'sample_id': '17', 'replicate': '1', 'single_paired': 'single', 'rcomp': 1, 'expt_unusable': 0, 'timecourse': 1, 'baseline_control_expt': 'experimental', 'notes': '', 'raw_read_depth': '60523507', 'trim_read_depth': '39951956', 'raw_read_length': '75', 'duplication_picard': '0.39059', 'single_map': '29195629', 'multi_map': '3405099', 'map_prop': '0.816', 'rseqc_tags': '28295177', 'rseqc_cds': '1792066', 'rseqc_five_utr': '2849165', 'rseqc_three_utr': '1723644', 'rseqc_intron': '17329426', 'cds_rpk': '46.55', 'intron_rpk': '12.89', 'exint_ratio': '3.61133', 'distinct_tenmillion_prop': '0.71284', 'genome_prop_cov': '0.12519', 'avg_fold_cov': '0.58644', 'samp_qc_score': '1', 'samp_data_score': '2'}, {'sample_id': '18', 'replicate': '2', 'single_paired': 'single', 'rcomp': 1, 'expt_unusable': 0, 'timecourse': 1, 'baseline_control_expt': 'experimental', 'notes': '', 'raw_read_depth': '52641132', 'trim_read_depth': '35630012', 'raw_read_length': '75', 'duplication_picard': '0.37848', 'single_map': '26569716', 'multi_map': '2999461', 'map_prop': '0.8299', 'rseqc_tags': '25721784', 'rseqc_cds': '1644545', 'rseqc_five_utr': '2529541', 
'rseqc_three_utr': '1584259', 'rseqc_intron': '15796934', 'cds_rpk': '42.72', 'intron_rpk': '11.75', 'exint_ratio': '3.63574', 'distinct_tenmillion_prop': '0.71142', 'genome_prop_cov': '0.12037', 'avg_fold_cov': '0.55256', 'samp_qc_score': '1', 'samp_data_score': '2'}, {'sample_id': '19', 'replicate': '1', 'single_paired': 'single', 'rcomp': 1, 'expt_unusable': 0, 'timecourse': 1, 'baseline_control_expt': 'experimental', 'notes': '', 'raw_read_depth': '59529059', 'trim_read_depth': '40781058', 'raw_read_length': '75', 'duplication_picard': '0.40753', 'single_map': '30122417', 'multi_map': '3261109', 'map_prop': '0.8186', 'rseqc_tags': '29030861', 'rseqc_cds': '1855487', 'rseqc_five_utr': '2788554', 'rseqc_three_utr': '1821217', 'rseqc_intron': '17886108', 'cds_rpk': '48.2', 'intron_rpk': '13.31', 'exint_ratio': '3.62134', 'distinct_tenmillion_prop': '0.70478', 'genome_prop_cov': '0.1235', 'avg_fold_cov': '0.57963', 'samp_qc_score': '1', 'samp_data_score': '2'}, {'sample_id': '20', 'replicate': '2', 'single_paired': 'single', 'rcomp': 1, 'expt_unusable': 0, 'timecourse': 1, 'baseline_control_expt': 'experimental', 'notes': '', 'raw_read_depth': '63176118', 'trim_read_depth': '31904393', 'raw_read_length': '75', 'duplication_picard': '0.43755', 'single_map': '21704085', 'multi_map': '2818342', 'map_prop': '0.7686', 'rseqc_tags': '21272991', 'rseqc_cds': '1466412', 'rseqc_five_utr': '2005314', 'rseqc_three_utr': '1346742', 'rseqc_intron': '13049900', 'cds_rpk': '38.09', 'intron_rpk': '9.71', 'exint_ratio': '3.92276', 'distinct_tenmillion_prop': '0.67936', 'genome_prop_cov': '0.09965', 'avg_fold_cov': '0.39567', 'samp_qc_score': '1', 'samp_data_score': '3'})]
(Background on this error at: http://sqlalche.me/e/13/gkpj)

### Add paper qc/data scores

In [None]:
# Fresh connection for this cell
dbconnect = dbnascentConnection(db_url, creds)

# Median sample scores for the paper, written to every exptMetadata row
# whose paper_id matches
paper_scores = paper_qc_calc(accum_unique)
_expt_table = exptMetadata.__table__
_score_update = _expt_table.update().where(
    _expt_table.c.paper_id == paper_id
)
dbconnect.engine.execute(_score_update, paper_scores)

### Add version data

In [49]:
# Fresh connection for this cell
dbconnect = dbnascentConnection(db_url, creds)

_nf_section = dict(config["nascentflow keys"])
nf_keys = list(_nf_section.values())
dbnf_keys = list(_nf_section)
_bf_section = dict(config["bidirflow keys"])
bf_keys = list(_bf_section.values())
dbbf_keys = list(_bf_section)
dirpath = config["file_locations"]["version_data"]

# Gather version rows for both pipelines; all values are stringified
nftab = add_version_info(paper_id, dirpath, "nascent", dbnf_keys)
for entry in nftab:
    for key, value in entry.items():
        entry[key] = str(value)

bidirtab = add_version_info(paper_id, dirpath, "bidir", dbbf_keys)
for entry in bidirtab:
    for key, value in entry.items():
        entry[key] = str(value)

# Nascentflow: deduplicate, then insert what entry_update returns
nf_table = Metatable(meta_path=None, dictlist=nftab)
nf_unique = nf_table.unique(dbnf_keys)
nf_to_add = entry_update("nascentflowMetadata", dbnf_keys, nf_unique)

if len(nf_to_add):
    # The string "None" is turned back into a null before insert
    for entry in nf_to_add:
        for key in dbnf_keys:
            if entry[key] == "None":
                entry[key] = None
    dbconnect.engine.execute(nascentflowMetadata.__table__.insert(), nf_to_add)

# Bidirflow: same steps
bidir_table = Metatable(meta_path=None, dictlist=bidirtab)
bidir_unique = bidir_table.unique(dbbf_keys)
bf_to_add = entry_update("bidirflowMetadata", dbbf_keys, bidir_unique)

if len(bf_to_add):
    for entry in bf_to_add:
        for key in dbbf_keys:
            if entry[key] == "None":
                entry[key] = None
    dbconnect.engine.execute(bidirflowMetadata.__table__.insert(), bf_to_add)

### Connect version data to samples

In [50]:
# Fresh connection for this cell
dbconnect = dbnascentConnection(db_url, creds)
# Version keys plus the id key of each version table
dbnf_add_keys = list(dict(config["nascentflow keys"]).keys())
dbnf_add_keys.append("nascentflow_id")
dbbf_add_keys = list(dict(config["bidirflow keys"]).keys())
dbbf_add_keys.append("bidirflow_id")

# Current nascentflow rows in the database, limited to those keys
dbnf_dump = dbconnect.reflect_table("nascentflowMetadata")
dbnf = Metatable(meta_path=None, dictlist=dbnf_dump)
dbnf_data = dbnf.key_grab(dbnf_add_keys)

# Current bidirflow rows in the database, limited to those keys
dbbf_dump = dbconnect.reflect_table("bidirflowMetadata")
dbbf = Metatable(meta_path=None, dictlist=dbbf_dump)
dbbf_data = dbbf.key_grab(dbbf_add_keys)

# Compare each per-sample version entry (built in the version data
# cell) with the database rows on the version keys.
# NOTE(review): presumably key_store_compare stores the matching row's
# id on the entry in place -- confirm against its definition.
for entry in nf_table.data:
    key_store_compare(entry, dbnf_data,
                      dbnf_keys, ["nascentflow_id"])

for entry in bidir_table.data:
    key_store_compare(entry, dbbf_data,
                      dbbf_keys, ["bidirflow_id"])

# Deduplicate sample/nascentflow pairs and insert what entry_update
# returns
exptnf = Metatable(meta_path=None, dictlist=nf_table.data)
exptnf_unique = exptnf.unique(["sample_id", "nascentflow_id"])
exptnf_to_add = entry_update("exptNascentflow",
                             ["sample_id", "nascentflow_id"],
                             exptnf_unique)

if len(exptnf_to_add) > 0:
    dbconnect.engine.execute(exptNascentflow.insert(), exptnf_to_add)

# Same for sample/bidirflow pairs
exptbf = Metatable(meta_path=None, dictlist=bidir_table.data)
exptbf_unique = exptbf.unique(["sample_id", "bidirflow_id"])
exptbf_to_add = entry_update("exptBidirflow",
                             ["sample_id", "bidirflow_id"],
                             exptbf_unique)

if len(exptbf_to_add) > 0:
    dbconnect.engine.execute(exptBidirflow.insert(), exptbf_to_add)