In [1]:
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
import sys
import json
from DB_setup import Protein_sequence, DBD, Experiment, PFM, TF_info, Base, DBD_TF_Maps
from sqlalchemy.orm import sessionmaker


# Location of the SQLite database file (currently tf2.db).
db_directory = 'C:\\Users\\Alex\\Documents\\fordyce_rotation\\tf2.db'

# Build the SQLite connection URL from that path and open an engine on it.
db_url = 'sqlite:///' + db_directory
engine = create_engine(db_url)

# Ask the metadata attached to Base (imported from DB_setup) to create its
# tables through this engine.
Base.metadata.create_all(engine)

# Names of the two JSON inputs; relative paths, so they are resolved against
# the current working directory.
jaspar_file = 'jaspar_success.json'
uniprobe_file = 'uniprobe_final.json'

# Parse each file into a Python object. Both are iterated as dicts further
# down (jaspar.items() / uniprobe.values()).
with open(jaspar_file, 'r') as handle:
    jaspar = json.load(handle)

with open(uniprobe_file, 'r') as handle:
    uniprobe = json.load(handle)

import re

# Five independent empty lists. Nothing in the visible part of this script
# appends to or reads from them.
protein_seqs, experiments, pfms, protein_info, dbds = ([] for _ in range(5))

# Session factory bound to the engine, plus the single session instance that
# the import loops below add objects to.
Session = sessionmaker(bind=engine)
session = Session()

# Import every JASPAR entry: one Protein_sequence together with its TF_info,
# Experiment, DNA-binding domains (DBD + DBD_TF_Maps link) and one PFM row per
# motif position. Objects are only added to the session here; nothing is
# committed inside this loop.

# A non-negative number with an optional fractional part. The dot is escaped
# on purpose: an unescaped '.' matches any character, so whitespace-separated
# counts such as "4 19" would be captured as one token that float() rejects.
# Compiled once so the pattern is not re-parsed for every row of every motif.
motif_number = re.compile(r'\d+(?:\.\d+)?')

for tf, info in jaspar.items():
    # Names containing '::' are stored with '_' as the separator instead.
    protein_name = '_'.join(tf.split('::'))
    # A falsy sequence (e.g. '') is stored as None.
    sequence = info['sequence'] or None
    protein_seq_to_add = Protein_sequence(protein_name, sequence)

    # All accession ids are collapsed into one '_'-joined string.
    uniprot_ids = str('_'.join(info['acc']))
    protein_infor = TF_info(
        tf_class=info['class'],
        family=info['family'],
        species=str(info['species']),
        uniprot=uniprot_ids,
    )

    exp = Experiment(pubmed_id=info['pubmed'], assay=info['type'])
    protein_seq_to_add.info = [protein_infor]
    protein_seq_to_add.experiments = [exp]

    # 'dbds' may be falsy (None / empty); treat that as "no domains".
    for dbd in info['dbds'] or []:
        # Missing or empty domain sequences are stored as None, matching how
        # the UniPROBE import normalises them.
        dbd_seq = dbd.get('domain_sequence') or None

        dbd_to_add = DBD(dbd_seq, dbd['dbd_type'])
        # The link object is attached to both sides of the association.
        link = DBD_TF_Maps(protein_seq_to_add, dbd_to_add)
        session.add(link)
        protein_seq_to_add.protein_dbd_maps.append(link)
        dbd_to_add.protein_dbd_maps.append(link)
        session.add(dbd_to_add)

    session.add(protein_infor)

    # The motif text must split into exactly five newline-separated pieces:
    # the A, C, G and T rows followed by one trailing piece that is ignored.
    A, C, G, T, _ = info['motif'].split('\n')
    A, C, G, T = (motif_number.findall(row) for row in (A, C, G, T))
    # Fail with a descriptive error instead of an IndexError part-way through
    # (or silently dropping columns) when the rows are ragged.
    if not len(A) == len(C) == len(G) == len(T):
        raise ValueError(
            'motif rows for %s have different lengths: A=%d C=%d G=%d T=%d'
            % (tf, len(A), len(C), len(G), len(T)))
    for pos, (a, c, g, t) in enumerate(zip(A, C, G, T)):
        pfmpos = PFM(pos, float(a), float(c), float(g), float(t), db='jaspar')
        exp.pfms.append(pfmpos)
        session.add(pfmpos)
    session.add(protein_seq_to_add)
    session.add(exp)

# Import every UniPROBE entry: one Protein_sequence together with its TF_info,
# a 'PBM' Experiment, DNA-binding domains (DBD + DBD_TF_Maps link) and one PFM
# row per PWM position. The single commit after the loop persists everything
# that has been added to the session up to that point.
for protein in uniprobe.values():
    protein_name = protein['protein_mut']
    # Progress output: one line per imported protein.
    print(protein_name)
    # A falsy sequence (e.g. '') is stored as None.
    sequence = protein['sequence'] or None
    protein_seq_to_add = Protein_sequence(protein_name, sequence)

    # Class and family are always stored as None for these entries.
    protein_infor = TF_info(
        tf_class=None,
        family=None,
        species=protein['species'],
        uniprot=protein['uniprot'],
    )

    exp = Experiment(pubmed_id=protein['pubmed'], assay='PBM')

    protein_seq_to_add.info.append(protein_infor)
    protein_seq_to_add.experiments.append(exp)
    pwm = protein['pwm']

    # 'dbds' may be falsy (None / empty); treat that as "no domains", the same
    # way the JASPAR import guards its own 'dbds' field.
    for dbd in protein['dbds'] or []:
        # Missing/empty sequence and falsy type are both stored as None.
        dbd_seq = dbd.get('domain_sequence') or None
        dbd_type = dbd['dbd_type'] or None
        dbd_to_add = DBD(dbd_seq, dbd_type)
        # The link object is attached to both sides of the association.
        link = DBD_TF_Maps(protein_seq_to_add, dbd_to_add)
        session.add(link)
        protein_seq_to_add.protein_dbd_maps.append(link)
        dbd_to_add.protein_dbd_maps.append(link)
        session.add(dbd_to_add)

    # Look the four rows up once instead of on every position.
    A, C, G, T = (pwm[base] for base in 'ACGT')
    # Fail with a descriptive error instead of an IndexError part-way through
    # (or silently dropping columns) when the rows are ragged.
    if not len(A) == len(C) == len(G) == len(T):
        raise ValueError(
            'pwm rows for %s have different lengths: A=%d C=%d G=%d T=%d'
            % (protein_name, len(A), len(C), len(G), len(T)))
    for pos, (a, c, g, t) in enumerate(zip(A, C, G, T)):
        pfmpos = PFM(pos, float(a), float(c), float(g), float(t), db='uniprobe')
        exp.pfms.append(pfmpos)
        session.add(pfmpos)
    session.add(protein_seq_to_add)
    session.add(exp)
    session.add(protein_infor)

# Persist everything in one transaction. On failure, roll back so the session
# is usable again, then re-raise so the error is not hidden.
try:
    session.commit()
except Exception:
    session.rollback()
    raise

Cbf1
CEH-22
Oct-1
Rap1
Zif268
Zfp691
Spdef
Jundm2
Eomes
Zfp740
Tcf7
Bcl6b
Plagl1
Zfp281
Klf7
Foxj3
Zic1
Sp4
E2F2
Gata3
Sfpi1
Rxra
Myb
Zbtb3
Rara
Zbtb7b
Bhlhb2
Mybl1
Sox13
Osr2
Sox8
Gmeb1
Gm397
Glis2
Esrra
Nr2f2
Hbp1
Sox11
Six6
Foxk1
Tcfap2a
Mtf1
Egr1
Zfp187
Sox1
Zfp105
Hnf4a
Ascl2
Tcf1
Zscan4
Sox14
Sox30
Hoxa3
Zfp128
Foxl1
Hic1
Gabpa
Foxa2
Sry
Sox17
Isgf3g
Zfp161
Zic2
Tcfap2c
Mafb
Nkx3-1
Zbtb12
Sox4
Sp100
Atf1
Osr1
Zfp410
Ehf
Mafk
Zic3
Foxj1
Sox21
Sox15
Sox5
Sox7
Tcf7l2
Irf4
Lef1
Sox18
Srf
Rfxdc2
Tcfap2e
Gcm1
E2F3
Bbx
Rfx4
Gata5
Gata6
Arid5a
Tcf3
Irf6
Smad3
Myf6
Max
Max
Tcfe2a
Irf5
Arid3a
Elf3
Sox12
Rfx3
Irf3
Tcfap2b
TBP
Irx2
Tcf2
Irx3
Six1
En2
Lhx2
Cart1
Dlx3
Hoxa6
Homez
Cart1
Lhx4
Hoxb3
Titf1
Vsx1
Pou6f1
Six3
Arx
Dlx1
Hoxd3
Alx4
Rhox11
Rhox11
Hoxa11
Irx3
Lmx1a
Lhx1
Irx4
Cdx1
Mrg1
Lhx8
Lhx6
Dlx2
Pitx2
Dmbx1
Lhx5
Uncx4.1
Obox5
Mrg2
Six2
Meox1
Pitx1
Hoxa10
Otx1
Gsc
Meis1
Tgif1
Bapx1
Hlx1
Hoxd13
Pknox1
Hoxc9
Hoxd10
Irx5
Barhl1
Hoxa9
Irx6
Hoxb4
Hoxc5
Shox2
Hoxd8
Evx2
Hoxa7
Hmbox1
Hoxc10
P