In [6]:
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
import sys
import json
from DB_setup import Protein_sequence, DBD, Experiment, PFM, TF_info, Base, DBD_TF_Maps
from sqlalchemy.orm import sessionmaker


# Location of the SQLite database file; it is named tf.db for now.
db_directory = 'C:\\Users\\Alex\\Documents\\fordyce_rotation\\tf.db'

# SQLAlchemy connection URL for that file, and the engine bound to it.
db_url = 'sqlite:///' + db_directory
engine = create_engine(db_url)

# Emit CREATE TABLE for the tables registered on Base's metadata.
Base.metadata.create_all(engine)

# Input files: one JSON document with the JASPAR records and one with the
# UniPROBE records, both resolved relative to the working directory.
jaspar_file = 'jaspar_success.json'
uniprobe_file = 'uniprobe_final.json'

# Parse each file straight from its handle.
with open(jaspar_file, 'r') as jaspar_handle:
    jaspar = json.load(jaspar_handle)

with open(uniprobe_file, 'r') as uniprobe_handle:
    uniprobe = json.load(uniprobe_handle)

import re

# Accumulator lists, one per table. They stay empty in the loading code shown
# here and are only read by the later de-duplication cell.
protein_seqs, experiments, pfms, protein_info, dbds = [], [], [], [], []

# Session factory bound to the engine, and the single session through which
# every object below is added.
Session = sessionmaker(bind=engine)
session = Session()

# Load every JASPAR record into the session: one Protein_sequence with its
# TF_info and Experiment, one DBD (plus DBD_TF_Maps link) per listed domain,
# and one PFM row per motif position.

# Matches an integer or decimal count. The decimal point is escaped: an
# unescaped '.' matches any character, including the whitespace between two
# counts, which glues neighbouring numbers into one unparsable match.
motif_number = re.compile(r'\d+(?:\.\d+)?')

for tf, info in jaspar.items():
    # '::' in the record key is replaced by '_' to form the stored name.
    protein_name = '_'.join(tf.split('::'))
    sequence = info['sequence']
    if not sequence:
        # store a missing/empty sequence as NULL rather than ''
        sequence = None
    protein_seq_to_add = Protein_sequence(protein_name, sequence)

    # all accession ids of the record are stored as one '_'-joined string
    uniprot_ids = str('_'.join(info['acc']))
    protein_infor = TF_info(tf_class = info['class'], family = info['family'],
                            species = str(info['species']), uniprot = uniprot_ids)

    exp = Experiment(pubmed_id = info['pubmed'], assay = info['type'])
    protein_seq_to_add.info = [protein_infor]
    protein_seq_to_add.experiments = [exp]

    if info['dbds']:
        for dbd in info['dbds']:
            # a domain entry may lack its sequence; store None in that case
            dbd_seq = dbd.get('domain_sequence')

            dbd_to_add = DBD(dbd_seq, dbd['dbd_type'])
            # association object tying this protein to this domain
            link = DBD_TF_Maps(protein_seq_to_add, dbd_to_add)
            session.add(link)
            protein_seq_to_add.protein_dbd_maps.append(link)
            dbd_to_add.protein_dbd_maps.append(link)
            session.add(dbd_to_add)

    session.add(protein_infor)

    # The motif text must split into exactly five pieces on '\n': the A, C, G
    # and T rows followed by one trailing piece that is discarded.
    A, C, G, T, _ = info['motif'].split('\n')
    A = motif_number.findall(A)
    C = motif_number.findall(C)
    G = motif_number.findall(G)
    T = motif_number.findall(T)
    # one PFM row per position; the A row determines the number of positions
    for pos, a_count in enumerate(A):
        pfmpos = PFM(pos, float(a_count),
                     float(C[pos]), float(G[pos]), float(T[pos]), db = 'jaspar')
        exp.pfms.append(pfmpos)
        session.add(pfmpos)
    session.add(protein_seq_to_add)
    session.add(exp)

# Load every UniPROBE record into the session: one Protein_sequence with its
# TF_info and a 'PBM' Experiment, one DBD (plus DBD_TF_Maps link) per listed
# domain, and one PFM row per matrix position. Everything added to the
# session so far is committed once, after the loop.
for record in uniprobe.values():
    # falsy sequences are stored as None
    tf_entry = Protein_sequence(record['protein_mut'], record['sequence'] or None)

    uniprot_id = record['uniprot']
    organism = record['species']
    pubmed_ref = record['pubmed']
    # class and family are not read from the record; both are stored as None
    tf_details = TF_info(tf_class = None, family = None,
                         species = organism, uniprot = uniprot_id)

    pbm_experiment = Experiment(pubmed_id = pubmed_ref, assay = 'PBM')

    tf_entry.info.append(tf_details)
    tf_entry.experiments.append(pbm_experiment)
    matrix = record['pwm']

    for domain in record['dbds']:
        # absent or empty domain sequences are stored as None
        domain_seq = domain.get('domain_sequence')
        if domain_seq == '':
            domain_seq = None
        # falsy domain types are stored as None
        domain_kind = domain['dbd_type'] or None

        new_dbd = DBD(domain_seq, domain_kind)
        # association object tying this protein to this domain
        mapping = DBD_TF_Maps(tf_entry, new_dbd)
        session.add(mapping)
        tf_entry.protein_dbd_maps.append(mapping)
        new_dbd.protein_dbd_maps.append(mapping)
        session.add(new_dbd)

    # the 'A' row determines how many positions are stored
    for pos in range(len(matrix['A'])):
        counts = [float(matrix[base][pos]) for base in 'ACGT']
        matrix_row = PFM(pos, *counts, db = 'uniprobe')
        pbm_experiment.pfms.append(matrix_row)
        session.add(matrix_row)

    session.add(tf_entry)
    session.add(pbm_experiment)
    session.add(tf_details)

session.commit()

In [None]:
# For each accumulator list, drop duplicate entries with set() and build one
# ORM object per remaining entry by unpacking it into the constructor.
dbds2 = [DBD(*fields) for fields in set(dbds)]

pfms2 = [PFM(*fields) for fields in set(pfms)]

experiments2 = [Experiment(*fields) for fields in set(experiments)]

protein_info2 = [TF_info(*fields) for fields in set(protein_info)]

protein_seqs2 = [Protein_sequence(*fields) for fields in set(protein_seqs)]