pubPrepGeneDir

#!/usr/bin/env python
# prepare the data for the mutation finder
# e.g. uniprot sequences, entrez mapping, etc

# in this file the prefix "up" always refers to "uniProt" not to the word "up" (like "down")

# load default python packages
from __future__ import print_function
import sys, logging, optparse, os, glob, shutil, gzip, collections, marshal, re, zlib, cPickle
import types, string, sqlite3
import struct, itertools, dumbdbm
import urllib, urllib2
import xml.etree.ElementTree as et
from os.path import *
from collections import defaultdict
from datetime import date

import bedLoci
import unidecode

# add <scriptDir>/lib/ to package search path
progFile = os.path.abspath(sys.argv[0])
progDir  = os.path.dirname(progFile)
pubToolsLibDir = os.path.join(progDir, "lib")
sys.path.insert(0, pubToolsLibDir)

# now load our own libraries
import pubConf, pubGeneric, util, maxbio, maxCommon, pslMapBed, pubEutils, geneFinder
import pubKeyVal
from maxCommon import runCommand, makeOrCleanDir
from os.path import *
from pm_pycbio.hgdata import Psl
from Bio import SeqIO
import fastFind

# target directory for all variant information
varDataDir = pubConf.varDataDir

# files not needed by end users, only at UCSC for building
varBuildDir = pubConf.varBuildDir

DBSNPVERSION = "142"
# all possible commands for this script
allSteps = ["genePmids", "entrez", "refseq", "seqs",
    "uniprot", "refseqMap", "uniprotMap", "snp",
    "omim",
    "geneNames", "geneDict",
    "lociNames", "bandLoci",
    "cells",
    "accsGenbank", "accsUniprot", "accsPdb", "accsSts",
    # not used anymore
    "oldRefseqProtGet", "oldRefseqProtMap",
    "oldRefseqGet", "oldRefseqMap",
    "genbankProtGet", "genbankProtParse", "genbankProtMap",
    ]

genbankDir = "/hive/data/outside/genbank/data/download/genbank.208.0"

# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
parser = optparse.OptionParser("""usage: %prog [step] - reformat various gene-centric databases into tables or DBM filesfor gene/mutation finding or resolution.
Output goes to directory {}.

Possible steps are:
{}

For accession finding:
- geneNames: do the raw parsing of the entrez/hugo/uniprot databases
- lociNames: split hg19 into separate loci around exons and assign them to gene symbols
- bandLoci: get cytogenetic bands from hg19 and using lociNames, assign them to genes in them
- accsGenbank: parse genbank accession prefixes + digit lengths into text file
- accsUniprot: index uniprot accessions into sqlite db
- accsPdb: index pdb accessions into text file

For the gene dictionary:
- geneNames: do the raw parsing of the entrez/hugo/uniprot databases
- geneDict: disambiguate and filter the gene names and put into for fastFind

for the disambiguation and sequence lookup steps of the gene finder:
- genePmids: convert entrez genes pmid <-> gene table to a DBM database and marshal file
- entrez: table (entrezId, refSeqId, refSeqProtId) from entrez for human genes
- refseq: from genome browser hg19: table (protId, transId, cdsStart)
- uniprot: using uniprot, write a table that links entrezId, uniprotId and genbankIds (also marshal)
- genbankProtGet: get genbank sequences linked from human uniprot
- omim: get list of OMIM gene ids
- lociNames: split hg19 into separate loci around exons and assign them to gene symbols
- bandLoci: get cytogenetic bands from hg19 and using lociNames, assign them to genes in them

for variant finding and resolution:
- cells: get a list of cell names from cellosaurus, used as a blacklist for mutations
- snp: index dbSNP data, coord -> identifier and identifier -> coord
- refseqMap: get refSeq PSL for hg19
- seqs: load human refseq sequences into a DBM database
- entrez: table with entrezId,symbol,refseqProtIds,refseqIds,cdsStart

variant finding, but not necessary currently as they didn't improve the recall:
- uniprotMap: get uniprot -> genome PSL for hg19
- oldRefseqProtGet: map archived refseqProtIds to new Ids
- oldRefseqProtMap: get archived refseq prots -> genome PSL for hg19
- genbankProtParse: write sequences from genbankProtGet into a DBM db
- genbankProtMap: get genbank -> genome PSL for hg19

""".format(pubConf.geneDataDir, "|".join(allSteps)))

parser.add_option("-t", "--runTests", dest="test", action="store_true", help="run tests")
pubGeneric.addGeneralOptions(parser)
(options, args) = parser.parse_args()

# ==== FUNCTIONS =====
taxToDb = {9606 : "hg19"}
geneDataDir = None

# variation data base directory
varDataDir = pubConf.varDataDir

def writeGzDict(data, fname, appendFnames=None, toLower=False):
    logging.info("Writing %d keys to %s" % (len(data), fname))
    ofh = gzip.open(fname, "w")

    for key, valList in data.iteritems():
        for val in valList:
            assert("|" not in val)
        if toLower:
            valList = [v.lower() for v in valList]
        ofh.write("%s\t%s\n" % (key, "|".join(valList)))
    if appendFnames:
        for fn, prefix in appendFnames.iteritems():
            logging.info("Appending %s to %s" % (fn, fname))
            for line in open(fn):
                ofh.write(prefix)
                ofh.write(line)
    ofh.close()

def getOmimGeneIds():
    """
    return list of omim gene IDs
    """
    tmpFname = join(geneDataDir, "omim.genemap2.tmp")
    if not isfile(tmpFname):
        #omimUrl = "ftp://grcf.jhmi.edu/OMIM/genemap2.txt"
        omimUrl = pubConf.omimUrl
        logging.info("Reading OMIM genemap from %s" % omimUrl)
        data = urllib2.urlopen(omimUrl).read()
        #data = open(omimUrl).read()
        open(tmpFname, "w").write(data)
    else:
        data = open(tmpFname).read()
    lines = data.splitlines()
    geneIds = [int(l.split("|")[8]) for l in lines]
    logging.info("Got %d omim gene IDs" % len(geneIds))
    return geneIds

def appendToDict(data, idList, acc, stripVer=False, prefix=None):
    " add an entry acc -> for all IDs in idList to data, which is a defaultdict of set"
    assert(type(idList)==list)
    for id in idList:
        if stripVer:
            id = id.split(".")[0]
        if prefix:
            id = prefix+id
        #print id, acc
        data[str(id)].add(str(acc))

def parseUniprotLinks(uniprotDir, taxId):
    """ parse uniprot tab and return as dict.
        keys of dict:
            entrezToUp: entrezId -> upId
            upToSym: upId -> symbol
            upToIsos: upId -> list of sequence isoform IDs
            upToGbs: upId -> list of gbank accessions
            upToGbProts: upId -> list of protein gb accessions
            upToRefseqProt: upId -> list of refseq protein IDs
            accToUps: any accession -> list of uniprots.
                     versions are stripped from refseq and genbank, omim and entrez
                     are prefixd with "omim" and "entrez" to distinguish them
    """
    tabFname = join(uniprotDir, "uniprot.tab")
    logging.info("Parsing uniprot links from %s" % tabFname)
    entrezToUp = {}
    upToSym = {}
    upToIsos = {}
    upToGb = {}
    upToGbProts = {}
    upToRefseqProt = {}
    upToRefseq = {}
    accToUps = defaultdict(set)
    gbToGbProt = {}
    upToEntrez = {}

    allOmimIds = set(getOmimGeneIds())

    entrezCount = 0
    duplCount = 0
    u2eDuplCount =0
    for row in maxCommon.iterTsvRows(tabFname, encoding=None):
        acc = str(row.acc)

        if int(row.taxonId)==taxId:
            if row.geneName!="":
                upToSym[acc] = str(row.geneName)

            if row.hgncSym!="":
                hugo = row.hgncSym.split(",")
                appendToDict(accToUps, hugo, acc)

            if row.ec!="":
                ecIds = row.ec.split(",")
                appendToDict(accToUps, ecIds, acc)

            if row.refSeqProt!="":
                refseqProtIds = row.refSeqProt.split("|")
                upToRefseqProt.setdefault(row.acc, []).extend(refseqProtIds)
                appendToDict(accToUps, refseqProtIds, acc, stripVer=True)

            if row.refSeq!="":
                refseqIds = row.refSeq.split("|")
                upToRefseq.setdefault(row.acc, []).extend(refseqIds)
                appendToDict(accToUps, refseqIds, acc, stripVer=True)

            if row.pdb!="":
                pdbIds = row.pdb.split("|")
                appendToDict(accToUps, pdbIds, acc)

            if row.ensemblGene!="":
                ensIds = row.ensemblGene.split("|")
                appendToDict(accToUps, ensIds, acc)
            if row.ensemblProt!="":
                ensIds = row.ensemblProt.split("|")
                appendToDict(accToUps, ensIds, acc)

            omimIds = None
            if row.omimGene!="":
                omimIds = row.omimGene.split("|")
                appendToDict(accToUps, omimIds, acc, stripVer=True, prefix="omim")

            if row.entrezGene!="":
                ncbiGenes = row.entrezGene.split("|")
                appendToDict(accToUps, ncbiGenes, acc, stripVer=True, prefix="entrez")
                for ncbiGene in ncbiGenes:
                    ncbiGene = int(ncbiGene)
                    entrezCount += 1
                    if ncbiGene in entrezToUp:
                        duplCount +=1
                    entrezToUp.setdefault(ncbiGene, []).append(row.acc)
                    if row.acc in upToEntrez:
                        u2eDuplCount += 1
                    upToEntrez.setdefault(row.acc, []).append(int(ncbiGene))

            if row.isoIds!="":
                upToIsos.setdefault(row.acc, []).extend(row.isoIds.split("|"))

            if row.emblMrna!="":
                #emblIds = row.emblProt.split("|")
                emblIds = row.emblMrna.split("|")
                upToGb[row.acc] = emblIds
                appendToDict(accToUps, emblIds, acc, stripVer=True)

            if row.emblMrnaProt!="":
                protIds = row.emblMrnaProt.split("|")
                upToGbProts[row.acc] = protIds
                #print protIds, emblIds
                #print len(protIds), len(emblIds)
                assert(len(protIds)==len(emblIds))
                appendToDict(accToUps, protIds, acc, stripVer=True)

    logging.info("%d entrez-uniprot links (cases with several uniprots for a gene rec: %d)" % \
        (entrezCount, duplCount))

    logging.info("%d uniprot-entrez links (cases with several genes for a up rec: %d)" % \
        (len(upToEntrez), u2eDuplCount))

    # convert id -> set to id->list
    accToUpList = {}
    for acc, upSet in accToUps.iteritems():
        accToUpList[acc]=list(upSet)

    data = {}
    data["entrezToUp"] = entrezToUp
    data["upToEntrez"] = upToEntrez
    data["upToSym"] = upToSym
    data["upToGbs"] = upToGb
    data["upToGbProts"] = upToGbProts
    data["upToIsos"] = upToIsos
    data["upToRefseqProt"] = upToRefseqProt
    data["upToRefseq"] = upToRefseq
    data["accToUps"] = accToUpList
    return data

def parseEntrezGeneRefseq(outFname):
    " create a tab-sep file with human entrezGene, comma sep refseqIds, comma sep refseqProtIds"
    fname = join(pubConf.ncbiGenesDir, "gene2refseq.gz")
    logging.info("Parsing %s" % fname)
    # parse refseq into dicts
    refseqs = {}
    refprots = {}
    refsym = {}
    for line in gzip.open(fname):
       if not line.startswith("9606"):
           continue
       fs = line.strip("\n").split("\t")
       if not fs[0]=="9606":
           continue
       #print fs
       #if len(fs)<7:
            # some genes have no refseq info
            #continue
       #Format: tax_id GeneID status RNA_nucleotide_accession.version RNA_nucleotide_gi protein_accession.version protein_gi genomic_nucleotide_accession.version genomic_nucleotide_gi start_position_on_the_genomic_accession end_position_on_the_genomic_accession orientation assembly mature_peptide_accession.version mature_peptide_gi Symbol (tab is used as a separator, pound sign - start of a comment)
       tax, geneId, desc, refseqId, gir, refProtId, gip = fs[:7]
       sym = fs[15]
       if desc=="SUPPRESSED":
           continue
       if sym!="-":
           refsym[int(geneId)] = sym
       if refseqId!="-":
           refseqs.setdefault(int(geneId), set()).add(refseqId)
       if refProtId!="-":
           refprots.setdefault(int(geneId), set()).add(refProtId)

    # output dicts to tab sep file
    logging.info("tab output...")
    ofh = open(outFname, "w")
    ofh.write("\t".join(["entrezId", "sym", "refseqIds", "refseqProtIds"]))
    ofh.write("\n")
    for geneId, refseqIds in refseqs.iteritems():
        refseqProtIds = refprots.get(geneId, [])
        sym = refsym.get(geneId, "")
        row = [str(geneId), sym, ",".join(refseqIds), ",".join(refseqProtIds)]
        ofh.write("\t".join(row))
        ofh.write("\n")
    ofh.close()
    logging.info("Wrote %s" % outFname)

    # write to marshal file
    #outFname += ".marshal"
    #data = {}
    #data["entrez2refseqs"]  = refseqs
    #data["entrez2refprots"] = refprots
    #data["entrez2sym"] = refsym
    #marshal.dump(data, open(outFname, "w"))
    #logging.info("Wrote %s" % outFname)

def parseEntrezGenePmids(taxIds, dbm):
    " convert pmid <-> gene assignments from entrez genes to a dbm file "
    pmid2geneFname = join(pubConf.ncbiGenesDir, "gene2pubmed.gz")
    # at ucsc: /hive/data/outside/ncbi/genes/gene2pubmed.gz
    logging.info("Parsing %s" % pmid2geneFname)
    pmidToEntrez = {}
    for line in gzip.open(pmid2geneFname):
        if line.startswith("#"):
            continue
        row = line.rstrip("\n").split("\t")
        rowTax, entrezId, pmid = row
        if int(rowTax) in taxIds:
            pmidToEntrez.setdefault(int(pmid), []).append(entrezId)
    logging.info("Taxons %s: found entrez ids for %s pmids" % (taxIds, len(pmidToEntrez)))

    logging.info("Writing to dbm file")
    count = 0
    #data = {}
    for pmid, entrezList in pmidToEntrez.iteritems():
        if count%10000==0:
            print(count)
        pmid = str(pmid)
        entrezStr = ",".join(entrezList)
        dbm[pmid] = entrezStr
        #dbm2[pmid] = entrezStr
        #data[int(pmid)] = entrezStr
        count += 1

def faToDbm(faName, dbm):
    logging.info("indexing %s into dbm as seqs" % faName)
    faSizeOfh = open(faName+".size", "w")
    logging.info("Loading %s" % faName)
    for seqId, seq in maxbio.parseFasta(faName):
        #dbm[seqId] = zlib.compress(seq)
        dbm[seqId] = seq
    #logging.info("Converted %s to dbm" % (faName))

def parseRaWriteToTab(raName, tabName):
    " write refseqId.version, refProt ID and cds Start to a tabular file "
    logging.info("Parsing ra")
    ofh = open(tabName, "w")
    ofh.write("refSeq\trefProt\tcdsStart\n")
    id = None
    cds = None
    prt = None
    data = {}
    skipRec = False
    skipCount = 0
    accList = []
    for line in open(raName):
        if line.startswith("acc"):
            id = line.rstrip("\n").split()[1]
            continue
        if line.startswith("ver"):
            ver = line.rstrip("\n").split()[1]
            continue
        if line.startswith("prt"):
            prt = line.rstrip("\n").split()[1]
            continue
        if line.startswith("cds"):
            cds = line.rstrip("\n").split()[1].split(".")[0]
            if "join" in line:
                skipRec = True
            continue
        if line=="\n" and id!=None and cds!=None and prt!=None:
            if skipRec:
                skipCount += 1
            else:
                acc = id+"."+ver
                row = [acc, prt, cds]
                ofh.write("\t".join(row))
                ofh.write("\n")
                accList.append(acc)
            id = None
            cds = None
            prt = None
            skipRec = False
    ofh.close()
    logging.info("Wrote cds and pep/refseq assignment to %s" % tabName)
    logging.info("Skipped %d records" % skipCount)
    return accList

def makeOldToNewAccs(accs):
    """ given a list of new things like NM_000325.5,
    return dict with mapping old -> new, like
    "NM_000325.4": "NM_000325.5", "NM_000325.3" : "NM_000325.5", etc.
    """
    oldToNew = {}
    for newAcc in accs:
        prefix,suffix = newAcc.split(".")
        version = int(suffix)-1
        if version!=0:
            oldVersions = range(0, version)
            oldVersions = [ov+1 for ov in oldVersions]
            for oldVersion in range(0, version):
                oldVersion = oldVersion+1
                oldAcc = prefix+"."+str(oldVersion)
                oldToNew[oldAcc] = newAcc
    return oldToNew

def parseRefseq(taxId, geneDataDir):
    """ get prot <-> trans assignment and cdsStart for hg19 """
    assert(taxId==9606)
    raName =  join(geneDataDir, "refseq.%s.ra"  % str(taxId))
    logging.info("Getting ra to %s" % raName)
    cmd = "gbGetSeqs -gbRoot=/hive/data/outside/genbank RefSeq mrna %s -get=ra -db=hg19 -inclVersion -native" % raName
    maxCommon.runCommand(cmd)
    refseqInfoFname =  join(geneDataDir, "refseqInfo.tab")
    accList = parseRaWriteToTab(raName, refseqInfoFname)
    os.remove(raName)

def getRefseqs(transFaName, protFaName):
    " extract refseq sequences from UCSC genome browser database "
    cmdTemp = "gbGetSeqs -gbRoot=/hive/data/outside/genbank RefSeq %s %s -db=hg19 -inclVersion"
    for seqType, fname in [("mrna", transFaName), ("pep", protFaName)]:
        logging.info("Getting data for %s" % seqType)
        cmd = cmdTemp % (seqType, fname)
        maxCommon.runCommand(cmd)
    logging.info("Wrote fastas to %s and %s" % (transFaName, protFaName))

def loadSeqs(taxId, uniprotDir):
    """ parse refseq sequences as values to sqlite file """
    assert(taxId==9606)

    # get old refseqs, too
    #oldRefseqFname = join(geneDataDir, "oldRefseq.%s.gb")
    #logging.info("Downloading old refseqs to %s" % oldRefseqFname)
    #oldAccs = makeOldAccs(accList)
    #outFh = open(oldRefseqFname, "w")
    #chunkedDownloadFromEutils(oldAccs, outFh)
    #outFh.write("\n".join(oldAccs[:1000]))
    #assert(False)

    # get fastas for refseq sequences
    transFaName = join(varBuildDir, "refseq.%s.trans.fa" % str(taxId))
    protFaName =  join(varBuildDir, "refseq.%s.prot.fa"  % str(taxId))
    getRefseqs(transFaName, protFaName)

    # index fastas
    seqFname = join(pubConf.varDataDir, "seqs")
    #upFaName = join(uniprotDir, "uniprot.%d.var.fa.gz" % taxId)
    #gbFaName = join(varBuildDir, "genbank.%d.prot.fa" % taxId)
    #oldRefseqFaName = join(varBuildDir, "oldRefseq.%d.prot.fa" % taxId)

    db = pubKeyVal.SqliteKvDb(seqFname, singleProcess=True, newDb=True, \
        tmpDir=pubConf.getFastTempDir(), onlyUnique=True)
    #faToDbm(oldRefseqFaName, db)
    #faToDbm(upFaName, db)
    #faToDbm(gbFaName, db)
    faToDbm(transFaName, db)
    faToDbm(protFaName, db)

    db.close()

    #dbmFname = join(geneDataDir, "seqs.dbm")
    #shutil.copy(dbmTmpFname, dbmFname)
    #os.remove(dbmTmpFname)
    logging.info("Finished writing all seqs to %s" % seqFname)

def writeUpRefseqPairs(taxId, uniprotDir, proteinType, pairFname):
    " write a list of tuples uniprot-protein id, refseqId"
    upTabFname = join(uniprotDir, "uniprot.tab")
    ret = []
    ofh = open(pairFname, "w")
    upData = parseUniprotLinks(uniprotDir, taxId)
    upToRefseq = upData["upToRefseq"]
    if proteinType=="uniprot":
        upToProts = upData["upToIsos"]
    elif proteinType=="genbank":
        upToProts = upData["upToGbProts"]

    for upId, refseqIds in upToRefseq.iteritems():
        for refseq in refseqIds:
            for protId in upToProts.get(upId, []):
                ofh.write("%s\t%s\n" % (protId, refseq))
    ofh.close()
    return ofh.name

def mapProtToRefseqIndex(taxId, protType, protFname, uniprotDir, stripVersion=False):
    """ map from proteins given protein fa and pairs to refseq.
    create psl and compressed dbm of psls
    protType can be genbank or uniprot
    """
    outPrefix = "%sToRefseq" % protType
    tmpDir    = join(pubConf.mapReduceTmpDir, "protRefseqMap-"+outPrefix)
    makeOrCleanDir(tmpDir)

    pairFname = join(tmpDir, "%s.%d.pairs" % (outPrefix, taxId))
    writeUpRefseqPairs(taxId, uniprotDir, protType, pairFname)

    dbmFname  = join(varDataDir, "%s.%d" % (outPrefix, taxId))
    mapFname  = join(varBuildDir, "%s.%d.psl" % (outPrefix, taxId))

    mapProtToRefseq(taxId, tmpDir, protFname)
    filterPsls(tmpDir, pairFname, mapFname)
    loadPslToDbm(mapFname, dbmFname, stripVersion=stripVersion)

def filterPsls(tmpDir, pairFname, mapFname):
    """ pick the best alignment for each protein """
    pslDir = join(tmpDir, "psl")
    cmd = """ find %(pslDir)s -name '*.psl' | xargs cat | pslSelect -qtPairs=%(pairFname)s stdin stdout | sort -k 10,10 | pslCDnaFilter stdin -minQSize=20 -ignoreNs -globalNearBest=0 -bestOverlap -filterWeirdOverlapped stdout | sort | uniq > %(mapFname)s""" % locals()
    runCommand(cmd)
    logging.info("Wrote results to %s" % mapFname)

def mapDnaToRefseq(taxId, tmpDir, dnaFaName):
    logging.debug("mapping %s to refseq, tmpdir %s" % (dnaFaName, tmpDir))
    refseqFname = join(varBuildDir, "refseq.%s.trans.fa" % str(taxId))
    targetFname = join(tmpDir, "target.fa")
    logging.info("Copying %s to %s" % (refseqFname, targetFname))
    shutil.copy(refseqFname, targetFname)

    # split query into pieces
    queryDir = join(tmpDir, "queries")
    makeOrCleanDir(queryDir)
    if dnaFaName.endswith(".gz"):
        cmd = "gunzip %s -c | faSplit about stdin 500 %s/" % (dnaFaName, queryDir)
    else:
        cmd = "faSplit about %s 500 %s/" % (dnaFaName, queryDir)
    maxCommon.runCommand(cmd)

    pslDir = join(tmpDir, "psl")
    makeOrCleanDir(pslDir)

    jbl = open(join(tmpDir, "jobList"), "w")
    logging.info("Created %s" % jbl.name)
    faFnames = glob.glob(join(queryDir, "*.fa"))
    logging.debug("Found %d query split files" % len(faFnames))
    runner = pubGeneric.makeClusterRunner("pubPrepMutDir-mapDnaRefseq-%s" % basename(dnaFaName))
    for fname in faFnames:
        outPslName = join(pslDir, splitext(basename(fname))[0]+".psl")
        cmd = "blat %(targetFname)s %(fname)s {check out exists %(outPslName)s} -noHead" % locals()
        runner.submit(cmd)
    runner.finish(wait=True)

def mapProtToRefseq(taxId, tmpDir, protFaName):
    """ create a psl file with the best mapping protein -> refseq
    input is protein fa , output goes into tmpDir/psl
    """
    logging.debug("mapping %s to refseq, tmpdir %s" % (protFaName, tmpDir))
    refseqFname = join(geneDataDir, "refseq.%s.prot.fa" % str(taxId))
    BLASTDIR="/cluster/bin/blast/x86_64/blast-2.2.16/bin"
    targetFname = join(tmpDir, "refseq.prot.fa")
    logging.info("Copying %s to %s" % (refseqFname, targetFname))
    shutil.copy(refseqFname, targetFname)

    # split uniprot into pieces
    queryDir = join(tmpDir, "queries")
    makeOrCleanDir(queryDir)
    if protFaName.endswith(".gz"):
        cmd = "gunzip %s -c | faSplit about stdin 2500 %s/" % (protFaName, queryDir)
    else:
        cmd = "faSplit about %s 2500 %s/" % (protFaName, queryDir)
    runCommand(cmd)
    # index refseq for blast
    cmd = "%s/formatdb -i %s -p T" % (BLASTDIR, targetFname)
    runCommand(cmd)

    # make dir for the output psl files
    pslDir = join(tmpDir, "psl")
    makeOrCleanDir(pslDir)

    # create joblist
    jbl = open(join(tmpDir, "jobList"), "w")
    logging.info("Created %s" % jbl.name)
    faFnames = glob.glob(join(queryDir, "*.fa"))
    logging.debug("Found %d part files" % len(faFnames))
    jobScriptFname = join(pubConf.ucscScriptDir, "mapUniprot_doBlast")
    runner = pubGeneric.makeClusterRunner("pubPrepMutDir-mapProtRefseq-%s" % basename(protFaName))
    for fname in faFnames:
        outPslName = join(pslDir, splitext(basename(fname))[0]+".psl")
        cmd = "%(jobScriptFname)s %(targetFname)s blastp %(fname)s {check out exists %(outPslName)s}" % locals()
        runner.submit(cmd)
    runner.finish(wait=True)

class FastSqlite:
    " a wrapper around sqlite to load data in the fastest way possible "

    def __init__(self, fname, tableCreateSql, batchSize=100000):
        """ open sqlite db in the fastest mode possible, insert row in batches
        The db contains only one table and it must be called "data".
        """
        self.batch = []
        self.batchSize = batchSize
        self.tmpFname = join(pubConf.getFastTempDir(), basename(fname))
        self.dbFname = fname
        if isfile(self.tmpFname):
            os.remove(self.tmpFname)
        isolLevel = "exclusive"

        self.con = sqlite3.connect(self.tmpFname, isolation_level=isolLevel)
        maxCommon.delOnExit(self.tmpFname)

        self.con.execute("PRAGMA synchronous=OFF") # recommended by
        self.con.execute("PRAGMA count_changes=OFF") # http://blog.quibb.org/2010/08/fast-bulk-inserts-into-sqlite/
        self.con.execute("PRAGMA page_size=4096") # http://stackoverflow.com/questions/788568/sqlite3-disabling-primary-key-index-while-inserting
        self.con.execute("PRAGMA cache_size=1000000") # http://web.utk.edu/~jplyon/sqlite/SQLite_optimization_FAQ.html NOTE: this is the number of cached pages, so 4GB!
        self.con.execute("PRAGMA journal_mode=OFF") # http://www.sqlite.org/pragma.html#pragma_journal_mode
        self.con.execute("PRAGMA temp_store=memory")
        self.con.commit()

        self.con.execute(tableCreateSql)
        self.con.commit()

    def _insertBatch(self):
        " write the current batch to the db "
        if len(self.batch)>0:
            sql = "INSERT INTO data VALUES (%s)" % ",".join(["?"]*self.batchRowLen)
            self.con.executemany(sql, self.batch)
            self.con.commit()
            self.batch = []

    def insert(self, row):
        " insert into table data as batches of rows "
        self.batchRowLen = len(row)
        self.batch.append( row )
        if len(self.batch) > self.batchSize:
            self._insertBatch()

    def close(self):
        " close db and move temp file over to final fname "
        if len(self.batch)>0:
            self._insertBatch()
        self.con.commit()
        self.con.close()

        logging.info("Copying %s to %s" % (self.tmpFname, self.dbFname))
        shutil.copy(self.tmpFname, self.dbFname)
        os.remove(self.tmpFname)

def dbSnp(taxId, geneDataDir):
    """ index dbSnp data, most of these won't be used as they're in intergenic
    regions but we keep them for now
    """
    assert(taxId==9606)
    tmpDir = varBuildDir

    snpFname = join(tmpDir, "snp%s.tab" % DBSNPVERSION)
    logging.info("dumping snp table into file %s" % snpFname)
    if not isfile(snpFname):
        cmd = '''hgsql hg19 -NB -e 'select chrom, chromStart, chromEnd, name from snp%s' > %s''' % \
            (DBSNPVERSION, snpFname)
        maxCommon.runCommand(cmd)
    else:
        logging.info("%s already exists, delete if you want to restart" % snpFname)

    sql = "CREATE TABLE data (chrom TEXT, start INT, end INT, rsId INT PRIMARY KEY)"
    dbName = join(varDataDir, "dbSnp.sqlite")
    db = FastSqlite(dbName, sql)
    count = 1
    doneRsIds = set()

    for line in open(snpFname):
        if count % 1000000 == 0:
            print("%d" % count)
        chrom, start, end, rsId = line.rstrip("\n").split("\t")
        # ignore haps and _gl
        if "hap" in chrom or "_gl" in chrom:
            continue
        intRsId = int(rsId[2:])

        if intRsId in doneRsIds:
            logging.warn("RsId %d already done" % intRsId)
            continue
        doneRsIds.add(intRsId)

        row = (chrom, int(start), int(end), intRsId)
        db.insert(row)
        count += 1

    # index by chrom pos
    print("adding chrom position index")
    db.con.execute("CREATE INDEX chromIdx ON data (chrom, start, end);")
    db.con.commit()

    db.close()

def loadPslToDbm(pslFname, dbmFname, isProt=False, stripVersion=False):
    " load psl file into a sqlite file, ignore haplotypes and unplaced seqs"
    qNamePsls = defaultdict(list)
    count = 0
    for line in open(pslFname):
        line = line.rstrip("\n")
        fields = line.split("\t")
        qName = fields[9]
        tName = fields[13]
        if "hap" in tName or "_gl" in tName:
            continue
        count += 1
        if isProt:
            p = Psl(line.split("\t"))
            p.protToNa()
            line = str(p)
        if stripVersion:
            qName = qName.split(".")[0]
        qNamePsls[qName].append(line)

    # write to dbm as \n separated psl lines indexed by qName
    #dbm = gdbm.open(dbmFname, "nf")
    #dbm = pubKeyVal.SqliteKvDb(dbmFname, singleProcess=True, newDb=True, \
                    #tmpDir=pubConf.getFastTempDir(), compress=True)
    logging.info("Writing psls to %s" % dbmFname)
    dbm = pubKeyVal.SqliteKvDb(dbmFname, singleProcess=True)
    for qName, pslLines in qNamePsls.iteritems():
        dbm[qName]= "\n".join(pslLines)
    dbm.close()
    logging.info("Indexed %d psls (%d qNames) from %s to %s" % \
        (count, len(qNamePsls), pslFname, dbm.dispName()))

def refseqMap(taxId, geneDataDir):
    " get refseq -> genome map from browser "
    db = taxToDb[taxId]
    refPslFname = join(varBuildDir, "refGenePsls.%d" % taxId)
    cmd = 'hgsql -NB hg19 -e "select * from refSeqAli" | grep -v hap | grep -v _gl | cut -f2- > %s' % refPslFname
    runCommand(cmd)

    refPslDbmFname = join(varDataDir, "refGenePsls.%s" % taxId)
    loadPslToDbm(refPslFname, refPslDbmFname)

def gbToFa(inGbFname, faFname):
    logging.info("Converting %s to %s" % (inGbFname, faFname))
    seqs = []
    outfh = open(faFname, "w")
    for rec in SeqIO.parse(open(inGbFname, "rU"), "genbank") :
        #seqs.append(record)
        outfh.write(">%s\n%s\n" % (rec.id, rec.seq.tostring()))

    #SeqIO.write(seqs, output_handle, "fasta")
    outfh.close()

def mapOldRefseqToNewRefseqIndex(seqType, faFname, taxId, dbmFname):
    " map old prot refseqs to new refseqs "
    # convert gb to fa - this route does not work - see below
    #raFname = join(geneDataDir, "oldRefseq.%d.ra" % taxId)
    #taFname = join(geneDataDir, "oldRefseq.%d.ta" % taxId)
    #cmd = "gbToFaRa -faInclVer /dev/null %(faFname)s %(raFname)s %(taFname)s %(oldRefseqFname)s" % locals()
    # gbToFaRa doesn't work - takes only the first version
    # gbtofa doesn't work - ignores peptides
    # gbtocdi doesn't work
    #runCommand(cmd)

    # create a pair file oldAcc, newAcc
    tmpDir    = join(pubConf.mapReduceTmpDir, "oldRefseqMap_"+seqType)
    makeOrCleanDir(tmpDir)
    oldAccs = constructOldRefseqAccs(taxId, seqType)
    pairFname = join(tmpDir, "oldRefseq.%s.pairs" % taxId)
    pairFh = open(pairFname, "w")
    for old, new in oldAccs.iteritems():
        pairFh.write("%s\t%s\n" % (old, new))
    pairFh.close()
    logging.debug("Wrote pairs to %s" % pairFname)

    # create the mapping psl file
    mapFname = join(geneDataDir, "oldRefseq.%d.%s.psl" % (taxId, seqType))
    if seqType=="prot":
        mapProtToRefseq(taxId, tmpDir, faFname)
    else:
        mapDnaToRefseq(taxId, tmpDir, faFname)

    filterPsls(tmpDir, pairFname, mapFname)
    loadPslToDbm(mapFname, dbmFname)

def constructOldRefseqAccs(taxId, seqType):
    refseqInfoFname =  join(geneDataDir, "refseqInfo.tab")
    logging.debug("Parsing %s" % refseqInfoFname)
    accs = []
    for row in maxCommon.iterTsvRows(refseqInfoFname):
        if seqType=="prot":
            accs.append(row.refProt)
        else:
            accs.append(row.refSeq)
    oldAccs = makeOldToNewAccs(accs)
    logging.info("Found %d old accessions" % len(oldAccs))
    return oldAccs

def downloadFromGenbank(oldAccs, oldRefseqFname, oneByOne=True, db="protein"):
    " download oldRefseq or genbank protein accessions "
    logging.info("Writing old refseq data to %s" % oldRefseqFname)
    outFh = open(oldRefseqFname, "w")
    pubEutils.downloadFromEutils(db, oldAccs, outFh, retType="gb", \
        retMax=1000, oneByOne=oneByOne)
    return oldRefseqFname

#def downloadFromGenbankFast(accs, outFname):
    #" download genbank protein accessions "
    #logging.info("Writing genbank data to %s" % outFname)
    #outFh = open(outFname, "w")
    #pubEutils.chunkedDownloadFromEutils("protein", accs, outFh, retType="gb", chunkSize=1000)

def writeUniprotEntrezSymGbLinks(taxId, uniprotDir):
    """ write a tab and .marshal file with the follwing links:
    - uniprot base ID -> list of uniprot isoforms
    - uniprot -> symbol
    - uniprot -> list of genbank
    - uniprot -> list of genbank prot
    - genbank -> uniprot
    - uniprot sequences
    """
    global geneDataDir
    assert(taxId==9606)
    data = {}
    data[taxId] = {}
    # parse uniprot seqs (get all variants)
    #faFname = join(uniprotDir, "uniprot.%s.var.fa.gz" % str(taxId))
    #seqDict = maxbio.parseFastaAsDict(faFname)
    #data[taxId]["upSeqs"] = seqDict
    #logging.info("Found %s sequences" % len(seqDict))

    # parse entrez -> uniprot id and up -> symbol and up->genbank
    #entrezToUp, upToSym, upToGb, upToRefseq = parseUniprot(uniprotDir, taxId)
    upData = parseUniprotLinks(uniprotDir, taxId)
    entrezToUp = upData["entrezToUp"]
    upToSym = upData["upToSym"]
    upToGbProts = upData["upToGbProts"]
    upToIsos = upData["upToIsos"]
    upToGbs = upData["upToGbs"]

    gbToUp = {}
    for up, gbList in upToGbs.iteritems():
        for gbId in gbList:
            gbToUp[gbId] = up

    # write to tab file
    mutDataFname = join(geneDataDir, "uniprot.tab")
    logging.info("Writing uniprot links to %s" % mutDataFname)
    ofh = open(mutDataFname,"w")
    ofh.write("geneId\tuniprotId\tuniprotIsoIds\tuniprotSym\tgbAcc\tuniprotGbProtAcc\n")
    noSym = 0
    for geneId, upIds in entrezToUp.iteritems():
        for upId in upIds:
            sym = upToSym.get(upId, None)
            if sym==None:
                sym=""
                noSym +=1

            gbAccs = ""
            if upId in upToGbProts:
                gbAccs = "|".join(upToGbs[upId])

            gbProtAccs = ""
            if upId in upToGbProts:
                gbProtAccs = "|".join(upToGbProts[upId])
            isoIds = upData["upToIsos"][upId]
            row = [str(geneId), upId, "|".join(isoIds), sym, gbAccs, gbProtAccs]
            ofh.write("\t".join(row)+"\n")

    ofh.close()
    logging.info("No sym: %d" % noSym)
    logging.info("Wrote to %s" % mutDataFname)

    # write to marshal file (faster and easier to parse)
    data[taxId] = upData
    data[taxId]["gbToUp"] = gbToUp
    geneDataDir = join(pubConf.staticDataDir, "mutFinder")
    mutDataFname = join(geneDataDir, "uniprot.tab.marshal")
    del data[taxId]["upToGbs"] # don't need these
    marshal.dump(data, open(mutDataFname, "w"))
    logging.info("Wrote to %s" % mutDataFname)

def cleanGbAccs(gbDict):
    " get only unique values from dict, flatten and clean "
    gbLists = gbDict.values()
    gbAccs = list(itertools.chain.from_iterable(gbLists))
    gbAccs = set([str(x) for x in gbAccs])
    if 'na' in gbAccs:
        gbAccs.remove("na")
    if '' in gbAccs:
        gbAccs.remove("")
    gbAccs = list(gbAccs)
    return gbAccs

def getGenbankAccs(taxId, outFname):
    " get all genbank accessions linked from uniprot for a given taxId, and write to file "
    upLinkFname = join(geneDataDir, "uniprot.tab.marshal")
    upData = marshal.load(open(upLinkFname))
    upToGbs = upData[taxId]["upToGb"]
    gbAccs = cleanGbAccs(upToGbs)

    ofh = open(outFname, "w")
    ofh.write("\n".join(gbAccs))
    ofh.close()
    logging.info("Wrote %d uniprot genbank DNA/RNA accs to %s" % (len(gbAccs), outFname))
    #return gbAccs


def getGenbankSeqs(gbIdFname, gbOutFname):
    " get genbank seqs with markd's tool "
    #cmd = "gbGetSeqs -accFile=%s -allowMissing -gbRoot=/hive/data/outside/genbank genbank mrna %s -get=seq -db=hg19 -inclVersion -native" % (gbIdFname, gbFname)
    gbFnames = glob.glob(join(genbankDir, "gbpri*.seq.gz"))
    open(gbOutFname, "w") # truncate gbOutFname
    for gbFname in gbFnames:
        cmd = "gbGetEntries -accFile=%s %s -missingOk >> %s" % (gbIdFname, gbFname, gbOutFname)
        runCommand(cmd)

def parseGenbankProts(gbFname, protFaFname, protIds):
    " returns translations of CDS in genbank file as dict protId -> sequence "
    logging.info("Parsing %s, trying to get %d prot ids" % (gbFname, len(protIds)))
    record_iterator = SeqIO.parse(gbFname, "genbank")
    res = {}
    protIds = set(protIds)
    for rec in record_iterator:
        cdsFts = [f for f in rec.features if f.type=='CDS']
        for cdsFt in cdsFts:
            if "protein_id" in cdsFt.qualifiers and "translation" in cdsFt.qualifiers:
                ftProtIds = cdsFt.qualifiers["protein_id"]
                assert(len(ftProtIds)==1)
                protId = ftProtIds[0]
                if protId in protIds:
                    protSeqs = cdsFt.qualifiers["translation"]
                    assert(len(protSeqs)==1)
                    res[protId] = protSeqs[0]
                else:
                    logging.debug("protId %s skipped, not target" % protId)
    logging.info("Got %d protein sequences from genbank file" % len(res))
    return res

def writeFa(data, outFname):
    logging.info("Writing %s" %outFname)
    ofh = open(outFname, "w")
    for id, seq in data.iteritems():
        ofh.write(">%s\n%s\n" % (id, seq))
    ofh.close()

def loadFaSeqs(protFaFname):
    " load (append) seqs in a fasta file into a dbm file as compressed sequences "
    dbmFname = join(geneDataDir, "seqs")
    #dbm = gdbm.open(dbmFname, "w")
    #dbm = pubKeyVal.SqliteKvDb(dbmFname, singleProcess=True, compress=True)
    dbm = pubKeyVal.SqliteKvDb(dbmFname, singleProcess=True)
    faToDbm(protFaFname, dbm)
    dbm.close()

def removeBrackets(s):
    """ remove nested brackets from string s
    >>> removeBrackets("hello world (no way)")
    'hello world'
    """
    newS = []
    brackLevel = 0
    for c in s:
        if c=="(":
            brackLevel+=1
            continue
        if c==")":
            brackLevel-=1
            continue
        if brackLevel==0:
            newS.append(c)
    return "".join(newS).strip()

def updateDict(big, small):
    " given two dicts -> list, add all entries from small to big "
    for key, vals in small.iteritems():
        big[key].update(vals)
    return big

# set with lowercase terms that shall not be added to the dictionary
blackList = set(['dynamin', 'superoxide dismutase', 'renal cell carcinoma', 'collagenase', 'intercellular adhesion molecule', 'hormone receptor', 'serine threonine kinase', 'differentiation', 'cyclin dependent kinase', 'epidermal growth factor', 'mitogen activated protein kinase', 'aquaporin', 'calmodulin', 'toll like receptor', 'transmembrane protein', 'adenylyl cyclase', 'poly adp ribose polymerase', 'heat shock protein', 'platelet derived growth factor', 'complement component', 'angiotensin ii', 'ifn gamma', 'adenylate cyclase', 'caspase 3', 'proteoglycan', 'g protein', 'aurora', 'polypeptide', 'atp binding cassette', 'insulin like', 'argonaute', 'neuronal differentiation', 'kinesin', 'tumor necrosis factor alpha', 'nf kappab', 'chloride channel', 'transforming growth factor beta', 'tropomyosin', 'growth hormone', 'serine protease', 'transferrin', 'receptor tyrosine kinase', 'cyclooxygenase 2', 'rna binding protein', 'dna ligase', 'proline rich', 'tyrosine kinase', 'calpain', 'glycoprotein', 'nadph oxidase', 'aspartate aminotransferase', 'interferon alpha', 'amyotrophic lateral sclerosis', 'neurofilament', 'claudin', 'fibronectin', 'estrogen receptor', 'metabotropic glutamate receptor', 'cell adhesion molecule', 'adiponectin', 'prostate cancer', 'interleukin 6', 'midline', 'thrombocytopenia', 'kallikrein', 'gtp binding protein', 'vascular endothelial growth factor', 'zinc finger protein', 'interferon gamma', 'endothelin', 'p glycoprotein', 'vimentin', 'peroxiredoxin', 'epidermal growth factor receptor', 'insulin like growth factor', 'zinc finger', 'neurotrophin', 'cysteine protease', 'splicing factor', 'integral membrane protein', 'glutathione s transferase', 'nitric oxide synthase', 'keratin', 'proto oncogene', 'atp synthase', 'heavy chain', 'glutamate receptor', 'potassium channel', 'death receptor', 'ribonuclease', 'glycogen synthase', 'angiotensin converting enzyme', 'trypsin', 'protein can', 'dynactin', 'thrombospondin', 'tnf alpha', 'transferrin receptor', 'histone deacetylase', 'calcium binding protein', 'aldolase', 'glutathione peroxidase', 'hydroxysteroid dehydrogenase', 'progesterone receptor', 'myosin heavy chain', 'alzheimer disease', 'protease inhibitor', 'a protein kinase', 'alanine aminotransferase', 'transglutaminase', 'beta catenin', 'c reactive protein', 'hepatocellular carcinoma', 'cadherin', 'albumin', 'programmed cell death', 'ubiquitin like', 'fibroblast growth factor', 'a kinase', 'carbonic anhydrase', 'parathyroid hormone', 'g protein coupled receptor', 'matrix metalloproteinase', 'parkinson disease', 'tumor necrosis factor', 'muscle specific', 'exonuclease', 'pierce', 'homeobox', 'microtubule associated protein', 'salvador', 'map kinase', 'tumor suppressor', 'lysozyme', 'interferon regulatory factor', 'syntaxin', 'c jun n terminal kinase', 'tetraspanin', 'modifier', 'signal transducer and activator of transcription', 'insulin receptor substrate', 'nucleolar protein', 'catenin', 'extracellular signal regulated kinase', 'osteosarcoma', 'f box protein', 'macrophage inflammatory protein', 'bone morphogenetic protein', 'aldehyde dehydrogenase', 'interleukin 2', 'protein tyrosine kinase', 'breast cancer cell', 'proteinase', 'abc transporter', 'thioredoxin', 'membrane bound', 'acute myeloid leukemia', 'interleukin', 'retinoblastoma', 'activator', 'dna methyltransferase', 'transcription factor', 'surface antigen', 'alcohol dehydrogenase', 'catalase', 'cytochrome c', 'neuraminidase', 'importin', 'cytochrome p450', 'cyclin d1', 'e cadherin', 'cytokeratin', 'mitogen activated protein kinase kinase', 'cell cycle progression', 'nuclear protein', 'hexokinase', 'nadh dehydrogenase', 'serum albumin', 'focal adhesion kinase', 'nerve growth factor'])

def cleanupNames(descToAccs):
    """
    given a dict with name -> list of accs, clean up the names by various means
    and separate the names into LONG names and SHORT symbols.
    Also keep track which symbols/names are not unique for a gene.

    return:
    - accToNames: dictionary with entrez gene to list of (long) gene names
    - symToSyns: dictionary with entrez gene to list of (short) gene synonyms
                 (for ambiguous symbols, can contain multiple IDs separated by a slash)
    - nonUniqueSyms: list of symbol synonyms that are used by several genes

    """
    # only the X most common English words
    bncCutoff = 5000
    bncWords = set(open(pubConf.bncFname).read().splitlines()[:bncCutoff])

    # add alternative names without greek symbols
    greekConv = {
        'alpha' : 'a',
        'beta' : 'b',
        'gamma' : 'g',
        'delta' : 'd',
        'epsilon' : 'e',
        'theta' : 't',
        'kappa' : 'k',
        'sigma' : 's',
    }

    newDescs = {}
    for name, accs in descToAccs.iteritems():
        newDescs[name] = accs
        newName = name
        for greek, nonGreek in greekConv.iteritems():
            newName = newName.replace(greek, nonGreek)
        if newName!=name:
            if len(newName)>4: # random cutoff to avoid really short new names
                newDescs[newName] = accs
    descToAccs = newDescs

    # split into symbols and names by length
    nameToAccs = defaultdict(set)
    symToAccs = defaultdict(set)

    # containers for removed gene names and symbol for logging purposes
    tooShortSyms = set()
    commonWords = set()
    blackListed = set()

    maxLen = 2
    for n, accs in descToAccs.iteritems():
        if n=="" or len(n)<=maxLen:
            tooShortSyms.add(n)
            continue
        # distinguish between gene symbols (short or all-uppercase) and gene names
        if len(n)<8 or (n.upper()==n and not " " in n and not "," in n):
            symToAccs[n].update(accs)
        else:
            # lowercase all gene names and remove bracketed stuff
            n = removeBrackets(n)
            n = n.replace("  ", " ")
            n = n.replace(" , ", ", ")
            nameToAccs[n.lower()].update(accs)

    # A) Clean up gene names
    # remove disease names, non-unique names and names that are too short
    # and create dict name -> acc string
    diseaseLikeNames = {}
    nonUniqueNames = {}
    tooShortNames = {}
    okNames = defaultdict(set)
    for name, accs in nameToAccs.iteritems():
        if len(accs)>1:
            nonUniqueNames[name] = accs
        elif name.endswith("disease") or name.endswith("syndrome"):
            diseaseLikeNames[name] = accs
        elif len(name)<7:
            tooShortNames[name] = accs
        else:
            okNames[str(name)] = accs
    nameToAcc = okNames

    # create ambiguous names without the final number
    # e.g. palmitoyl-protein thioesterase 2 and palmitoyl-protein thioesterase 1
    # add "palmitoyl-protein thioesterase" -> both accessions
    noNumbers = defaultdict(set)
    numCount = 0
    for name, accs in nameToAcc.iteritems():
        lastNum = name.split()[-1]
        if not lastNum.isdigit():
            continue
        if int(lastNum)>9:
            continue
        newName = " ".join(name.split()[:-1])
        if newName.lower() in bncWords:
            commonWords.add(newName)
            continue
        noNumbers[newName].update(accs)
        numCount +=1
    logging.info("Found %d names with trailing number, created %d new names without numbers" % \
        (numCount, len(noNumbers)))
    nameToAcc = updateDict(okNames, noNumbers)

    # reverse name dict
    accToNames = defaultdict(list)
    for name, accs in nameToAcc.iteritems():
        # split and join words in name like done in fastFind -> results in same form of the name as it appears later in lex -> helps filter out all blacklisted words
        words = fastFind.WORDRE.findall(name)
        joinedWords = ' '.join(words)
        # remove common words (in BNC) and blacklisted words
        if joinedWords.lower() in bncWords:
            commonWords.add(name)
            continue
        if joinedWords.lower() in blackList:
            logging.debug("name %s is blacklisted" % n)
            blackListed.add(name)
            continue
        accToNames["/".join(accs)].append(name)

    # B) Clean up gene symbols
    # symbol table is: synonym -> list of official symbols
    # ERV46 -> set('ERGIC3')
    # now inverse this symbol table
    # for non-unique symbols, add all possible symbols as /-separated
    # (slash is not a valid character in official hgnc symbols)
    nonUniqueSyms = {}
    symToSyns = defaultdict(list)
    for name, accs in symToAccs.iteritems():
        # find non-unique symbols
        if len(accs)>1:
            nonUniqueSyms[name] = accs
        else:
            symStr = "/".join(accs)
            #if altSym not in offSyms and altSym!="":
            caseSyns = set()
            # add a mixed-case and an uppercase version of all symbols
            # e.g. CDK1 -> Cdk1
            caseSyns.add(name)
            # the uppercase
            caseSyns.add(name.upper())
            # the mix case
            if name[0].isalpha():
                mixSyn = name[0].upper()+name[1:].lower()
                caseSyns.add(mixSyn)

            # remove common and blacklisted symbols from dictionary
            caseSynsClean = set()
            for name in caseSyns:
                if name.lower() in bncWords:
                    commonWords.add(name)
                    continue
                if name.lower() in blackList:
                    logging.debug("name %s is blacklisted" % name)
                    blackListed.add(name)
                    continue
                caseSynsClean.add(name)

            symToSyns[symStr].extend(caseSynsClean)

    logging.info("Removed black listed: %s" % ",".join(blackListed))
    logging.info("Removed very short symbols: %s" % ",".join(tooShortSyms))
    logging.info("Removed common English words: %s" % ",".join(commonWords))
    logging.info("Found %d names for %d accs" % (len(nameToAcc), len(accToNames)))
    logging.info("Found %d symbols for %d official symbols" % (len(symToAccs), len(symToSyns)))
    logging.info("Removed %d non-unique names" % len(nonUniqueNames))
    logging.info("Removed %d too-short names" % len(tooShortNames))
    logging.info("Removed %d names that look like disease names" % len(diseaseLikeNames))
    return accToNames, symToSyns, nonUniqueSyms

def parseUniprotNames(taxIds, uniprotDir, nameToAccs, hgncSyms):
    " return dict with longname -> uniprot IDs and dict with entrez -> uniprot"
    #uniprotFname = join(uniprotDir, "uniprot.tab")
    uniprotFname = join(uniprotDir, "uniprot.9606.tab")
    logging.info("Parsing %s" % uniprotFname)
    #entrezToUniprot = defaultdict(set)
    #nameToAccs = defaultdict(set)
    upIgnored = []
    dupSyms = []
    for row in maxCommon.iterTsvRows(uniprotFname):
        if int(row.taxonId) not in taxIds:
            continue
        # skip if no symbol
        names = []
        acc = row.acc
        # remove illegal hgnc symbols
        syms = set(row.hgncSym.split("|"))
        syms = syms.intersection(hgncSyms)
        if len(syms)==0:
            upIgnored.append(acc)
            continue

        if len(syms)>1:
            dupSyms.append((acc, syms))
            continue
        sym = list(syms)[0]

        # get all names
        # dataset acc     mainIsoAcc      orgName orgCommon       taxonId name    accList protFullNames   protShortNames  protAltFullNames       protAltShortNames       geneName        geneSynonyms    isoNames        geneOrdLocus    geneOrf hgncSym hgncId  refSeq  refSeqProt      ncbiGene        ensemblGene     ensemblProt     emblMrna        emblMrnaProt  emblDna  emblDnaProt     pdb     ec      uniGene omimGene        omimPhenotype   isoIds  isoSeqs
        names.extend(row.protFullNames.split("|"))
        names.extend(row.protShortNames.split("|"))
        names.extend(row.protAltFullNames.split("|"))
        names.extend(row.protAltShortNames.split("|"))
        # symbols
        names.extend(row.geneName.split("|"))
        names.extend(row.geneSynonyms.split("|"))
        for name in names:
            for sym in syms:
                nameToAccs[name].add(sym)

        #entrezIds = set([int(x) for x in row.ncbiGene.split("|") if x!=""])
        #entrezIds = entrezIds.intersection(goodEntrezIds)
        #for eid in entrezIds:
            #entrezToUniprot[int(eid)].add(acc)

    logging.info("%d uniprot records ignored because of no valid hgnc symbol" % len(upIgnored))
    logging.info("%d uniprot records ignored because of >1 hgnc symbol" % len(dupSyms))
    logging.info("total name count: %d" % len(nameToAccs))
    return nameToAccs


def parseEntrezNames(taxIds, symToEntrez, nameToAccs):
    " fill dict with name -> official symbols"
    headers = "tax_id GeneID Symbol LocusTag Synonyms dbXrefs chromosome map_location description type_of_gene Symbol_from_nomenclature_authority Full_name_from_nomenclature_authority Nomenclature_status Other_designations Modification_date".split()
    fname = join(pubConf.ncbiGenesDir, "gene_info.9606.gz")
    logging.info("Parsing %s" % fname)
    data = {}
    likeSym = defaultdict(list)
    entrezSpecific = set()
    phenoOnly = set()
    #nameToAccs = defaultdict(set)
    # example line:
    # 9606    5308    PITX2   -       ARP1|Brx1|IDG2|IGDS|IGDS2|IHG2|IRID2|Otlx2|PTX2|RGS|RIEG|RIEG1|RS       HGNC:9005|MIM:601542|Ensembl:ENSG00000164093|HPRD:03328|Vega:OTTHUMG00000132837        4       4q25    paired-like homeodomain 2     protein-coding   PITX2   paired-like homeodomain 2       O       ALL1-responsive protein ARP1|all1-responsive gene 1|homeobox protein PITX2|paired-like homeodomain transcription factor 2|pituitary homeobox 2|rieg bicoid-related homeobox transcription factor 1|solurshin  20130416
    for row in maxCommon.iterTsvRows(fname, headers=headers, skipLines=1):
        if int(row.tax_id) not in taxIds:
            continue
        names = []
        sym = row.Symbol
        if sym not in symToEntrez:
            #logging.info("Entrez specific symbol %s" % sym)
            entrezSpecific.add(sym)
            continue
        if row.type_of_gene=="unknown":
            phenoOnly.add(sym)
            continue

        acc = row.GeneID
        #upAccs = entrezToUp.get(int(acc), None)
        #if upAccs==None:
            #continue
        # symbols
        if row.Synonyms!="":
            names.extend(row.Synonyms.split("|"))
        # long names
        if row.description!="":
            names = [row.description]
            names = [x for x in names if " " in x]
        if row.Other_designations!="":
            altNames = row.Other_designations.split("|")
            altNames = [x for x in altNames if x!="-"]
            names.extend(altNames)

        for n in names:
            #assert("," not in n)
            nameToAccs[n].add(sym)

    logging.info("Ignored %d entrez-specific symbols" % len(entrezSpecific))
    logging.info("Ignored %d phenotype-only symbols" % len(phenoOnly))
    return nameToAccs

def splitQuote(name, isSym=False):
    """ try to split quoted names on , """
    if '"' in name:
        # first split quoted names
        names = name.split('", "')
    else:
        # if there are no quotes, also try to split on just commas
        names = name.split(",")
    names = [n.strip('"') for n in names]
    names = [n.strip() for n in names]
    names = [n for n in names if n!=""]
    names = [unidecode.unidecode(n) for n in names]
    # make sure there are no commas left, if symbol
    if isSym:
        for n in names:
            assert("," not in n)
    return names

def parseHgncNames(data):
    " fill dict with mapping gene name -> official symbols"
    #fname = join(pubConf.hgncDir, "hgnc.txt")
    fname = pubConf.hgncFname
    logging.info("Parsing %s" % fname)
    #data = defaultdict(set)
    symToEntrez = {}
    noEntrez = set()
    # headers:
    # HGNC_ID Approved_Symbol Approved_Name   Status  Locus_Type      Locus_Group     Previous_Symbols        Previous_Names Synonyms Name_Synonyms   Chromosome      Date_Approved   Date_Modified   Date_Symbol_Changed     Date_Name_Changed     Accession_Numbers        Enzyme_IDs      Entrez_Gene_ID  Ensembl_Gene_ID Mouse_Genome_Database_ID        Specialist_Database_Links      Specialist_Database_IDs Pubmed_IDs      RefSeq_IDs      Gene_Family_Tag Gene_family_description Record_Type    Primary_IDs     Secondary_IDs   CCDS_IDs        VEGA_IDs        Locus_Specific_Databases        GDB_ID_(mapped_data)   Entrez_Gene_ID_(mapped_data_supplied_by_NCBI)   OMIM_ID_(mapped_data_supplied_by_NCBI)  RefSeq_(mapped_data_supplied_by_NCBI)  UniProt_ID_(mapped_data_supplied_by_UniProt)    Ensembl_ID_(mapped_data_supplied_by_Ensembl)    UCSC_ID_(mapped_data_supplied_by_UCSC) Mouse_Genome_Database_ID_(mapped_data_supplied_by_MGI)  Rat_Genome_Database_ID_(mapped_data_supplied_by_RGD)
    phenoOnly = set()
    for row in maxCommon.iterTsvRows(fname, makeHeadersUnique=True, encoding="latin1"):
        if "withdrawn" in row.Approved_Name:
            continue
        entrez = row.Entrez_Gene_ID
        sym = row.Approved_Symbol
        if row.Locus_Type=="phenotype only":
            phenoOnly.add(sym)
            continue

        if entrez=="":
            #entrez = row.Entrez_Gene_ID_2
            entrez = row.Entrez_Gene_ID__supplied_by_NCBI_

        if entrez=="":
            noEntrez.add(sym)
            continue

        entrez = int(entrez)
        symToEntrez[sym]=entrez

        names = [sym]
        names.append(row.Approved_Name)
        names.extend(splitQuote(row.Previous_Names))
        names.extend(splitQuote(row.Name_Synonyms))
        # symbols
        #names.extend(row.Previous_Symbols.split(", "))
        #names.extend(row.Synonyms.split(", "))
        names.extend(splitQuote(row.Previous_Symbols, isSym=True))
        names.extend(splitQuote(row.Synonyms, isSym=True))

        for n in names:
            data[n].add(sym)

    logging.info("Ignored %d HGNC symbols with only phenotype" % (len(phenoOnly)))
    logging.info("Ignored %d HGNC symbols without entrez ID" % (len(noEntrez)))
    logging.info("Read %d names" % len(data))
    logging.info("Got %d symbol->entrez mappings for %d entrez IDs" % (len(symToEntrez), len(set(symToEntrez.values()))))
    return data, symToEntrez

symBlackList = set(['SDS'])

def readDict(fname, symToEntrez, nameDict, justPass=None, ambiguous=None):
    " parse dict.gz file, resolve symbols to entrezIds and optionally prefix with ? "
    logging.info("Loading %s" % fname)
    notFound = 0
    nameCount = 0
    geneCount = 0
    blackListCount = 0
    for line in gzip.open(fname):
        symStr, nameStr = line.strip("\n").split("\t")
        syms = symStr.split("/")
        eIds = []
        for sym in syms:
            if sym in symBlackList:
                blackListCount +=1
                continue
            eIds.append(str(symToEntrez[sym]))

        isAmb, isPass = False, False
        entrezId = "/".join(eIds)

        geneCount += 1
        for name in nameStr.split("|"):
            prefix = ""
            if justPass is not None:
                if name in justPass:
                    isPass=True
                if name in ambiguous:
                    isAmb = True
                    prefix = "?"
                if not isPass and not isAmb:
                    #logging.warning("name %s is neither pass nor ambiguous, skipping" % str(syms))
                    notFound += 1

            nameDict[prefix+entrezId].add(name)
            nameCount +=1
    logging.info("%d symbols were blacklisted" % blackListCount)
    logging.info("%d genes, %s names (names not found in filters: %d)" % (geneCount, nameCount, notFound))
    return nameDict

def readList(fname):
    logging.info("Reading %s" % fname)
    for line in open(fname):
        yield line.strip().split()[0]

def makeLociNames():
    """ split human genome into separate regions around genes and assign them to a gene symbol """
    hgncFn = pubConf.hgncFname
    symFname = join(pubConf.getTempDir(), "allSym.txt")
    filterSymCmd = "cut -f2 %(hgncFn)s | grep -v withdrawn > %(symFname)s" % locals()
    maxCommon.runCommand(filterSymCmd)

    exonFname = join(pubConf.getTempDir(), "exons.bed")
    cmd = """
    hgsql hg19 -NB -e 'select geneSymbol, knownGene.* from knownGene JOIN knownCanonical ON (name=transcript) JOIN kgXref ON (name=kgId);' |
    grep -v hap | grep -v _gl | lstOp filter stdin %(symFname)s |
    tawk '{$2=$1; print}' | cut -f2- | cut -f-10 |
    genePredToBed stdin stdout | bedToExons stdin stdout |
    tawk '($4~/^[A-Z0-9]+$/)' |
    sort -k1,1 -k2,2n > %(exonFname)s
    """ % locals()
    # remove haplotype chromosomes
    # final awk removes -AS and fusion gene symbols
    maxCommon.runCommand(cmd)

    lociFname = join(geneDataDir, "loci.bed")
    chromSizesFname = join(pubConf.genomeDataDir, "hg19", "chrom.sizes")
    outf = open(lociFname, "w")
    mids = bedLoci.parseBedMids(open(exonFname))
    chromSizes = bedLoci.slurpdict(chromSizesFname)
    bedLoci.outputLoci(mids, chromSizes, outf)
    outf.close()
    logging.info("Loci coords and names written to %s" % lociFname)

def overlapBeds(selectFname, inFname):
    " overlap in with select and return a dict inBedNames -> list of selectFnames "
    tempFn = join(pubConf.getTempDir(), "overlapMerged.tab")
    cmd = "overlapSelect %(selectFname)s %(inFname)s %(tempFn)s -mergeOutput -overlapThreshold=0.5" \
        % locals()
    maxCommon.runCommand(cmd)
    inToSel = defaultdict(set)
    for l in open(tempFn):
        fs = l.strip().split("\t")
        inId = fs[3]
        selId = fs[-1]
        inToSel[inId].add(selId)
    return inToSel

def makeBandLoci():
    " extract cytogenetic bands from human genome and assign it to gene symbols via lociNames "
    humanBandFn = join(pubConf.markerDbDir, "hg19.band.bed")
    lociFn = join(pubConf.geneDataDir, "loci.bed")
    geneToBand = overlapBeds(humanBandFn, lociFn)
    bandToGenes = defaultdict(set)
    for gene, bands in geneToBand.iteritems():
        for b in bands:
            bandToGenes[b].add(gene)

    # convert to tab and dict band -> entrezIds
    symToEntrez = marshal.loads(gzip.open(join(geneDataDir, "hgnc.symToEntrez.marshal.gz")).read())
    bandLociTab = join(pubConf.geneDataDir, "bandGenes.tab")
    bandToEntrezList = {}
    ofh = open(bandLociTab, "w")
    for band, syms in bandToGenes.iteritems():
        entrezList = []
        entrezToSym = {}
        for sym in syms:
            #entrezList = [str(symToEntrez[g]) for g in genes]
            if sym not in symToEntrez:
                logging.warn("no entrez ID for symbol %s" % sym)
            else:
                eId = symToEntrez[sym]
                entrezList.append(str(eId))
                entrezToSym[eId] = sym

        row = [band, "|".join(entrezList), "|".join(syms)]
        bandToEntrezList[band] = entrezToSym
        ofh.write("\t".join(row))
        ofh.write("\n")
    ofh.close()

    logging.info("band-gene mapping written as table to %s" % bandLociTab)

    #ascData = marshal.dumps(bandToEntrezList)
    #outFname = join(pubConf.geneDataDir, "bandToEntrez.marshal.gz")
    #gzip.open(outFname, "w").write(ascData)
    #logging.info("band->entrez mapping written as binary to %s" % outFname)

def makeGeneDict():
    """ create fastFind dict for genes:
    long names AND unambigous AND rare symbols are taken as they are
    short and ambiguous symbols are prefixed with "?"
    Also convert all HGNC symbols to entrez IDs

    Write two versions of the dict files, normal case and all lowercased.
    """
    fname = join(geneDataDir, "hgnc.symToEntrez.marshal.gz")
    logging.info("Loading %s" % fname)
    symToEntrez = marshal.loads(gzip.open(fname).read())

    # read and translate long gene names to entrez
    nameDict = defaultdict(set)
    fname = join(geneDataDir, "geneNames.hgnc.dict.gz")
    readDict(fname, symToEntrez, nameDict)
    fname = join(geneDataDir, "geneNames.dict.gz")
    writeGzDict(nameDict, fname)
    fastFind.compileDict(fname)
    # not ignoring case anymore
    #fnameLow = fname.replace(".dict.gz", "Low.dict.gz")
    #writeGzDict(nameDict, fnameLow, toLower=True)
    #fastFind.compileDict(fnameLow)

    # read symbol classification into two sets
    nameDict = defaultdict(set)
    ambSymbols = set()
    fname = join(geneDataDir, "ambiguousSymbols.txt")
    ambSymbols.update(readList(fname))
    fname = join(geneDataDir, "rareSymbols.txt")
    ambSymbols.update(readList(fname))
    logging.info("Got %d ambiguous symbols " % len(ambSymbols))

    goodSymbols = set()
    fname = join(geneDataDir, "easySymbols.txt")
    goodSymbols.update(readList(fname))
    logging.info("Got %d un-ambiguous symbols " % len(goodSymbols))

    # parse symbol dict using the two sets
    fname = join(geneDataDir, "symbols.hgnc.dict.gz")
    readDict(fname, symToEntrez, nameDict, ambiguous=ambSymbols, justPass=goodSymbols)

    #bandLociFname = join(geneDataDir, "bandGenes.dict")
    fname = join(geneDataDir, "symbols.dict.gz")
    #writeGzDict(nameDict, fname, appendFnames={bandLociFname:"*"})
    writeGzDict(nameDict, fname)
    fastFind.compileDict(fname)
    #fnameLow = fname.replace(".dict.gz", "Low.dict.gz")
    #writeGzDict(nameDict, fnameLow, toLower=True)
    #fastFind.compileDict(fnameLow)

def mapRefseqOldToNewLoad(seqType, taxId):
    " create psl from old to new refseqs and load into dbm, load old refseqs into seq dbm "
    oldRefseqFname = join(varBuildDir, "oldRefseq.%d.%s.gb" % (taxId, seqType))
    pslDbmFname = join(varDataDir, "oldRefseqToRefseq.%d.%s.psl" % (taxId, seqType))
    faFname = join(geneDataDir, "oldRefseq.%d.%s.fa" % (taxId, seqType))
    gbToFa(oldRefseqFname, faFname)
    mapOldRefseqToNewRefseqIndex(seqType, faFname, taxId, pslDbmFname)
    loadFaSeqs(faFname)

def downloadToTmp(url, tmpFname):
    if not isfile(tmpFname):
        logging.info("Downloading %s to %s" % (url, tmpFname))
        cmd = "wget %s -O %s" % (url, tmpFname)
        maxCommon.runCommand(cmd)
        maxCommon.delOnExit(tmpFname)

def makeCellNames():
    " parse cellosaurus to get cell line names, need to copy/paste into source file "
    url = "ftp://ftp.nextprot.org/pub/current_release/controlled_vocabularies/cellosaurus.txt"
    tmpFname = join(pubConf.getTempDir(), "cellosaurus.txt")
    downloadToTmp(url, tmpFname)

    names = []
    for line in open(tmpFname):
        if line.startswith("ID"):
            name = line.strip()[5:]
            names.append(name)
        if line.startswith("SY"):
            syns = line.strip()[5:]
            syns = syns.split("; ")
            for s in syns:
                names.append(s)

    likeMuts = []
    mutRe = re.compile("^([ABCDEFGHIKLMNPQRSTVWXY])([0-9]{1,5})([ABCDEFGHIKLMNPQRSTVWXY])$")
    for n in names:
        m = mutRe.match(n)
        if m!=None:
            let1, num, let2 = m.groups()
            num = int(num)
            likeMuts.append( (let1, num, let2) )
    print(likeMuts)

# ========= IMPORTING ACCESSIONS =======

def getLivelistUrl():
    """
    return most current genbank livelist url
    >>> u = livelistUrl()
    >>> # check if url is valid
    >>> urllib2.urlopen(u).url[-1]
    'z'
    """
    d = date.today().toordinal()
    last = d - 6
    sunday = last - (last % 7)
    #saturday = sunday + 6
    sunday = date.fromordinal(sunday)
    url = "http://ftp.ncbi.nih.gov/genbank/livelists/GbAccList.%02d%02d.%d.gz" % (sunday.month, sunday.day, sunday.year)
    return url

def indexAccsGenbank():
    " parse the current genbank livelist and write a file with the number of digits for each prefix "
    #liveListFname = "/hive/data/outside/genbank.133/GbAccList.0621.2015.gz"
    liveListUrl = getLivelistUrl()
    tmpFname = join(pubConf.getTempDir(), "GbAccList.gz")
    logging.info("Downloading %s to %s and parse, takes a few hours" % liveListUrl, tmpFname)
    downloadToTmp(liveListUrl, tmpFname)

    # parse input file
    accLens = defaultdict(set)
    accs = []
    i = 0
    for line in gzip.open(tmpFname):
        i +=1
        if i%100000==0:
            print("Read %d accessions" % i)
            accs = []
        acc = line.split(",")[0]
        accChars, accDigits, _ = geneFinder.splitGenbankAcc(acc)
        accLens[accChars].add(len(accDigits))

    # now write to text file
    dbInfoFname = join(pubConf.accDataDir, "genbankFormat.txt")
    ofh = open(dbInfoFname, "w")
    for prefix, digits in accLens.iteritems():
        row = [prefix, "|".join([str(x) for x in digits])]
        ofh.write("\t".join(row))
        ofh.write("\n")
    ofh.close()
    logging.info("Wrote genbank format for %d accessions to %s" % (i, dbInfoFname))

def indexAccsUniprot():
    " read uniprot ID file and write accessions to sqlite database "
    # cannot use this file, it doesn't include old accessions, only the current ones
    #idUrl = "/hive/data/outside/uniProtCurrent/idmapping_selected.tab.gz"
    #idFname = join(pubConf.getTempDir(), "uniprotIdMap.tab.gz")
    #downloadToTmp(idUrl, idFname)

    dbFname = join(pubConf.accDataDir, "uniprot")

    db = pubKeyVal.SqliteKvDb(dbFname, singleProcess=True, newDb=True, \
        tmpDir=pubConf.getFastTempDir())
    logging.info("writing to %s, a tempfile on ram disk" % db.dbName)
    lastAcc = None
    lastChunk = 0
    fnames = ["uniprot.all.tab", "uniprotTrembl.all.tab"]
    #fnames = ["uniprot.9606.tab"]
    doneAccs = set()
    for fname in fnames:
        path = join(pubConf.dbRefDir, fname)
        logging.info("Reading %s" % path)
        for row in maxCommon.iterTsvRows(path):
            accs = set(row.accList.split("|"))
            refs = row.refSeq
            for acc in accs:
                if acc in doneAccs:
                    continue
                db[acc]= refs
            doneAccs.update(accs)
            if len(doneAccs)/50000!=lastChunk:
                print("%d accessions done" % len(doneAccs))
                lastChunk = len(doneAccs)/50000
    db.close()
    logging.info("wrote %d accessions to %s.sqlite" % (len(doneAccs), dbFname))

def indexAccsPdb():
    " download list of PDB accessions into a key-val tab file, adding obsolete accessions, format oldAcc->newAcc "
    outFname = join(pubConf.accDataDir, "pdb.txt")
    ofh = open(outFname, "w")
    url = "http://ftp.wwpdb.org/pub/pdb/derived_data/pdb_entry_type.txt"
    i = 0
    for line in urllib2.urlopen(url):
        acc = line.split()[0]
        ofh.write("%s\t%s\n" % (acc, acc))
        i+=1

    url = "ftp://ftp.wwpdb.org/pub/pdb/data/status/obsolete.dat"
    for line in urllib2.urlopen(url):
        if "LIST OF" in line:
            continue
        oldAcc, newAcc = line.split()[-2:]
        ofh.write("%s\t%s\n" % (oldAcc.lower(), newAcc.lower()))
        i+=1

    ofh.close()
    logging.info("wrote %d accessions to %s" % (i, outFname))

def indexAccsSts():
    " NOT USED: index STS accessions from UCSC db hg19 "
    sql = "select alias from stsAlias"
    tmpFname = join(pubConf.getTempDir(), "sts.txt")
    maxCommon.delOnExit(tmpFname)
    cmd = "hgsql hg19 -NB -e '%s' > %s" % (sql, tmpFname)
    maxCommon.runCommand(cmd)

    dbPath = join(pubConf.accDataDir, "sts")
    db = pubKeyVal.SqliteKvDb(dbPath, singleProcess=True, newDb=True, \
        tmpDir=pubConf.getFastTempDir())
    i = 0
    doneIds = set()
    for line in open(tmpFname):
        acc = line.rstrip("\n")
        if acc in doneIds:
            continue
        doneIds.add(acc)
        db[acc] = ""
        i+=1
    db.close()
    logging.info("Wrote %d sts accessions to %s" % (i, db.dispName()))


def main(args, options):
    pubGeneric.setupLogging(progFile, options)
    step = args[0]
    uniprotDir = pubConf.dbRefDir

    global geneDataDir
    geneDataDir = pubConf.geneDataDir
    if not isdir(geneDataDir):
        os.makedirs(geneDataDir)

    taxId = 9606
    taxIds = set([9606])

    if step not in allSteps:
        logging.error("Unknown step %s" % step)
        logging.info("Available steps are: %s" % ",".join(allSteps))
        sys.exit(1)

    if step=="genePmids" or step=="all":
        # parse entrez gene into dict pmid -> list of entrez ids
        # we also create a marshal file for jython
        entrezFname = join(geneDataDir, "pmid2entrez")
        #dataFname = join(geneDataDir, "pmid2entrez.marshal.gz")
        #dbm = gdbm.open(entrezFname, "nf")
        dbm = pubKeyVal.SqliteKvDb(entrezFname, singleProcess=True, newDb=True, \
            tmpDir=pubConf.getFastTempDir())
        parseEntrezGenePmids(taxIds, dbm)
        dbm.close()
        #open(dataFname, "w").write(zlib.compress(marshal.dumps(data)))

        logging.info("Wrote pmid -> entrez to %s" % (entrezFname))
        #dbm2.close()

    # ==== VARIANT DATA =====

    if step=="entrez" or step=="all":
        # entrez -> sym + refseq + refseqProt
        outFname = join(geneDataDir, "entrez.tab")
        parseEntrezGeneRefseq(outFname)

    if step=="refseq" or step=="all":
        # refseq ID -> refseqProt ID + cdsStart
        parseRefseq(taxId, varDataDir)

    if step=="uniprot" or step=="all":
        # uniprot -> uniprot isoforms + genbank, genbankProt, symbol
        writeUniprotEntrezSymGbLinks(taxId, uniprotDir)

    if step=="refseqMap" or step=="all":
        " refseq -> hg19 psls "
        refseqMap(taxId, varDataDir)

    if step=="uniprotMap" or step=="all":
        # create PSLs uniprot -> refseq
        upFaName  = join(uniprotDir, "uniprot.%d.var.fa.gz" % taxId)
        mapProtToRefseqIndex(taxId, "uniprot", upFaName, uniprotDir)

    # these two go together, but the first one takes really long, ~1 hour
    if step=="oldRefseqProtGet":
        oldRefseqFname = join(varBuildDir, "oldRefseq.%d.prot.gb" % taxId)
        oldAccs = constructOldRefseqAccs(taxId, "prot")
        downloadFromGenbank(oldAccs.keys(), oldRefseqFname)

    if step=="oldRefseqProtMap":
        mapRefseqOldToNewLoad("prot", taxId)

    # same as above, but for DNA
    if step=="oldRefseqGet":
        oldRefseqFname = join(geneDataDir, "oldRefseq.%d.dna.gb" % taxId)
        oldAccs = constructOldRefseqAccs(taxId, "dna")
        downloadFromGenbank(oldAccs.keys(), oldRefseqFname, db="nucleotide")
    if step=="oldRefseqMap":
        mapRefseqOldToNewLoad("dna", taxId)


    # requires the "uniprot" step
    if step=="genbankProtGet":
        gbIdFname = join(geneDataDir, "genbankUniprotIds.%d.txt" % taxId)
        gbFname = join(geneDataDir, "genbank.%d.gb" % taxId)
        getGenbankAccs(taxId, gbIdFname)
        getGenbankSeqs(gbIdFname, gbFname)
        #downloadFromGenbankFast(accs, gbFname) doesn't work

    if step=="genbankProtParse":
        gbFname = join(geneDataDir, "genbank.%d.gb" % taxId)
        #dnaFaFname = join(geneDataDir, "genbank.%d.dna.fa" % taxId)

        upData = parseUniprotLinks(uniprotDir, taxId)
        gbProtIds = upData["upToGbProt"]
        gbProtIds = cleanGbAccs(gbProtIds)

        # genbank to fasta
        protFaFname = join(geneDataDir, "genbank.%d.prot.fa" % taxId)
        prots = parseGenbankProts(gbFname, protFaFname, gbProtIds)
        writeFa(prots, protFaFname)

        # add seqs to dbm (duplicates?)
        loadFaSeqs(protFaFname)

    if step=="genbankProtMap":
        protFaFname = join(geneDataDir, "genbank.%d.prot.fa" % taxId)
        mapProtToRefseqIndex(taxId, "genbank", protFaFname, uniprotDir, stripVersion=True)

    if step=="seqs":
        loadSeqs(taxId, uniprotDir)

    if step=="snp":
        " mapping coordinates -> rsId and the reverse"
        # gdbm gave malloc errors if not done in two separate steps
        dbSnp(taxId, varDataDir)

    if step=="geneNames":
        # create dict with mapping gene name -> official symbols
        nameDict = defaultdict(set)
        nameDict, symToEntrez = parseHgncNames(nameDict)
        nameDict = parseUniprotNames(taxIds, uniprotDir, nameDict, symToEntrez)
        nameDict = parseEntrezNames(taxIds, symToEntrez, nameDict)

        accToNames, symToSyns, ambigousSyms  = cleanupNames(nameDict)

        dictFname = join(geneDataDir, "geneNames.hgnc.dict.gz")
        writeGzDict(accToNames, dictFname)

        symDictFname = join(geneDataDir, "symbols.hgnc.dict.gz")
        writeGzDict(symToSyns, symDictFname)

        fname = join(geneDataDir, "hgnc.symToEntrez.marshal.gz")
        fh = gzip.open(fname, "w")
        fh.write(marshal.dumps(symToEntrez))
        fh.close()
        logging.info("Wrote %s" % fname)

    if step=="lociNames":
        makeLociNames()

    if step=="bandLoci":
        makeBandLoci()

    if step=="geneDict":
        makeGeneDict()

    if step=="cells":
        makeCellNames()

    if step=="accsGenbank":
        indexAccsGenbank()

    if step=="accsUniprot":
        indexAccsUniprot()

    if step=="accsPdb":
        indexAccsPdb()

    #if step=="accsSts":
        #indexAccsSts()

    #if step=="ncbi" or step=="all":
        #entrezTabFname = join(pubConf.ncbiGenesDir, "gene2refseq.gz")
        #logging.info("Parsing gene<->refseq ids out of %s" % entrezTabFname)
        #rnaIds = []
        #protIds = []
        #for line in gzip.open(entrezTabFname):
        #    if line.startswith("#"):
        #        continue
        #    fields = line.rstrip("\n").split("\t")
        #    if int(fields[0])!=taxId:
        #        continue
        #    rnaId = fields[3]
        #    protId = fields[5]
        #    rnaIds.append("*%s*"%rnaId)
        #    protIds.append("*%s*" % protId)

        #rnaIds = list(set(rnaIds))
        #protIds = list(set(protIds))
        #logging.info("RefSeq: Got %d rnaIds, %d protIds" % (len(rnaIds), len(protIds)))

        # get the genbank acc # of all uniprot sequences
        #entrezToUp, upToSym, upToGb = parseUniprot(uniprotDir, taxId)
        #gbIds = set()
        #for gbList in upToGb.values():
        #    gbIds.update(gbList)
        #gbIds = list(gbIds)

        #idFname = "/tmp/ncbiIds.txt"

        #ofh = open(idFname, "w")
        #ofh.write("\n".join(protIds))
        #ofh.write("\n")
        #ofh.write("\n".join(rnaIds))
        #ofh.write("\n")
        #ofh.write("\n".join(gbIds))
        #ofh.close()


if options.test:
    import doctest
    doctest.testmod()
    sys.exit(0)

if len(args)==0:
    parser.print_help()
    exit(1)

main(args, options)