This notebook is for plotting alignments of the 1.1.3.15 data.<br/><br/>Copyright (C) 2019  Martin Engqvist Lab<br/>This program is free software: you can redistribute it and/or modify<br/>it under the terms of the GNU General Public License as published by<br/>the Free Software Foundation, either version 3 of the License, or<br/>(at your option) any later version.<br/>This program is distributed in the hope that it will be useful,<br/>but WITHOUT ANY WARRANTY; without even the implied warranty of<br/>MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the<br/>GNU General Public License for more details.<br/>You should have received a copy of the GNU General Public License<br/>along with this program.  If not, see <http://www.gnu.org/licenses/>.

In [13]:
import os
import subprocess
from os.path import join, dirname, basename, exists, isdir

from dotenv import load_dotenv, find_dotenv

### Load environmental variables from the project root directory ###
# find_dotenv() walks up the directory tree until it finds a .env file
dotenv_path = find_dotenv()
# NOTE(review): find_dotenv() returns '' when no .env file is found; PROJ then
# becomes '' and every path below is relative to the current working
# directory -- confirm that this fallback is acceptable.

# load up the entries as environment variables
load_dotenv(dotenv_path)

# Check whether a network drive has been specified.
# os.environ.get() yields None when NETWORK_URL is unset; the original check
# compared against the string 'None' (presumably how "no drive" is written in
# .env -- TODO confirm). Both are treated as "no network drive" here.
DATABASE = os.environ.get("NETWORK_URL")
if DATABASE not in (None, 'None'):
    pass  # placeholder: mount network drive here

# set up directory paths
CURRENT_DIR = os.getcwd()
PROJ = dirname(dotenv_path) # project root directory

DATA = join(PROJ, 'data') #data directory
RAW_EXTERNAL = join(DATA, 'raw_external') # external data raw directory
RAW_INTERNAL = join(DATA, 'raw_internal') # internal data raw directory
INTERMEDIATE = join(DATA, 'intermediate') # intermediate data directory
FINAL = join(DATA, 'final') # final data directory

RESULTS = join(PROJ, 'results') # output directory
FIGURES = join(RESULTS, 'figures') # figure output directory
PICTURES = join(RESULTS, 'pictures') # picture output directory


# make folders specific for certain data
folder_name = 'alignment_vis_data'
if folder_name != '':
    # Only a sub-folder of FINAL is created by this notebook; no matching
    # folder is made under RAW_EXTERNAL or INTERMEDIATE.
    # exist_ok=True makes the call safe to re-run and avoids the race between a
    # separate exists() check and the directory creation.
    os.makedirs(join(FINAL, folder_name), exist_ok=True)

print('Standard variables loaded, you are good to go!')

Standard variables loaded, you are good to go!


In [10]:
from Bio import SeqIO
import vislib

In [26]:
def load_fasta(filepath, id_list):
    '''
    Collect the sequences of selected records from a fasta file.

    Each record's description line is searched for the identifiers in
    id_list (plain substring test, in list order); the first identifier
    found becomes the key under which that record's sequence is stored.

    Returns a dictionary mapping identifier -> sequence. Identifiers that
    match no record are absent from the result, and a later matching record
    overwrites an earlier one stored under the same identifier.
    '''
    selected = {}
    for rec in SeqIO.parse(filepath, format='fasta'):
        description = rec.description
        # first identifier (in id_list order) contained in the description
        matched = next((uid for uid in id_list if uid in description), None)
        if matched is not None:
            selected[matched] = rec.seq
    return selected


def load_aln_fasta(filepath):
    '''
    Load all sequences in an alignment fasta file.

    Returns a dictionary mapping each record's full description line to its
    sequence; when a description occurs more than once the last record wins.
    '''
    return {
        rec.description: rec.seq
        for rec in SeqIO.parse(filepath, format='fasta')
    }

#### First visualize the green 2-hydroxyglutarate cluster

In [39]:
# Name used for every file and for the figure title of this cluster.
cluster_name = 'green_2HG_cluster'

group = ['A0A0K2AU21', 'S2DJ52', 'A0A0U6K8E5', 'R0EVG9', 'A0A021XIZ9', 'B7N6P4', 'A0A0J1FVG5', 'A0A0U5JSS4', 'A0A0L6Z980', 'A0A0R3K2G2', 'A0A0D6I3R9', 'A0A0J6V898', 'A0A0N7HBH8']
# alternative grouping by activity, kept for reference (not used below):
#group_activity = ['inactive', 'active', 'active', 'active', 'inactive', 'active', 'inactive', 'inactive', 'inactive', 'inactive', 'inactive', 'inactive', 'inactive']
group_activity = ['left', 'left', 'left', 'left', 'left', 'left', 'mid', 'mid', 'mid', 'right', 'right', 'right', 'right']
# zip() silently drops trailing items on a length mismatch, so check first
assert len(group) == len(group_activity), 'group and group_activity differ in length'
group_grouping = dict(zip(group, group_activity))

# now get the sequences
filepath = join(INTERMEDIATE, 'BRENDA', '1_1_3_15_BRENDA_sequences_filtered.fasta')
group_seqs = load_fasta(filepath, group)

# fail before any output file is opened if an identifier was not found,
# instead of raising midway through writing and leaving a truncated fasta file
missing = [uid for uid in group if uid not in group_seqs]
if missing:
    raise KeyError('identifiers not found in %s: %s' % (filepath, ', '.join(missing)))

# save as a separate fasta file
fasta_filepath = join(FINAL, 'alignment_vis_data', '%s.fasta' % cluster_name)
with open(fasta_filepath, 'w') as f:
    for uid in group:
        f.write('>%s\n%s\n' % (uid, group_seqs[uid]))

# align them
aln_filepath = join(FINAL, 'alignment_vis_data', '%s_aln.fasta' % cluster_name)
infile = fasta_filepath
outfile = aln_filepath
# An argument list (no shell) keeps paths containing spaces intact, and
# check=True raises if muscle exits with an error rather than letting a stale
# or missing alignment file be picked up below.
mycmd = ['muscle', '-in', infile, '-out', outfile, '-quiet']
subprocess.run(mycmd, check=True)

# load up the alignment
aln_data = load_aln_fasta(aln_filepath)

# now make the visualization
x = vislib.alignment(data=aln_data,
                 filepath=join(FIGURES, '%s_aln.svg' % cluster_name),
                 main=cluster_name,
                 group_dict=group_grouping,
                 group_colors=None,
                 text_color='#424242')

#### Now visualize the blue cluster, which contains some specific and some promiscuous enzymes

In [41]:
# Name used for every file and for the figure title of this cluster.
cluster_name = 'blue_cluster'

group = ['U2ENB5', 'A0A085FLP7', 'A0A073CBY9', 'Q5WFZ3', 'A0A0P7Z009', 'A0A0P7WDX3', 'K4LGZ0', 'M7MTF7', 'M4KHX7', 'F9UTP6', 'B9NTK7', 'V6AP96', 'Q5WIP4', 'A0A087RXW1', 'K0NHF1', 'D4MUV9', 'D4N087', 'A0A077SBA9', 'D4XIR1']
# alternative grouping by activity, kept for reference (not used below):
#group_activity = ['inactive', 'inactive', 'inactive', 'inactive', 'inactive', 'inactive', 'inactive', 'inactive', 'inactive', 'inactive', 'inactive', 'inactive', 'active', 'active', 'inactive', 'active', 'active', 'active', 'active']
group_activity = ['left', 'left', 'left', 'left', 'left', 'left', 'left', 'left', 'mid', 'mid', 'right', 'right', 'right', 'right', 'right', 'right', 'right', 'right', 'right']
# zip() silently drops trailing items on a length mismatch, so check first
assert len(group) == len(group_activity), 'group and group_activity differ in length'
group_grouping = dict(zip(group, group_activity))

# now get the sequences
filepath = join(INTERMEDIATE, 'BRENDA', '1_1_3_15_BRENDA_sequences_filtered.fasta')
group_seqs = load_fasta(filepath, group)

# fail before any output file is opened if an identifier was not found,
# instead of raising midway through writing and leaving a truncated fasta file
missing = [uid for uid in group if uid not in group_seqs]
if missing:
    raise KeyError('identifiers not found in %s: %s' % (filepath, ', '.join(missing)))

# save as a separate fasta file
fasta_filepath = join(FINAL, 'alignment_vis_data', '%s.fasta' % cluster_name)
with open(fasta_filepath, 'w') as f:
    for uid in group:
        f.write('>%s\n%s\n' % (uid, group_seqs[uid]))

# align them
aln_filepath = join(FINAL, 'alignment_vis_data', '%s_aln.fasta' % cluster_name)
infile = fasta_filepath
outfile = aln_filepath
# An argument list (no shell) keeps paths containing spaces intact, and
# check=True raises if muscle exits with an error rather than letting a stale
# or missing alignment file be picked up below.
mycmd = ['muscle', '-in', infile, '-out', outfile, '-quiet']
subprocess.run(mycmd, check=True)

# load up the alignment
aln_data = load_aln_fasta(aln_filepath)

# now make the visualization
x = vislib.alignment(data=aln_data,
                 filepath=join(FIGURES, '%s_aln.svg' % cluster_name),
                 main=cluster_name,
                 group_dict=group_grouping,
                 group_colors=None,
                 text_color='#424242')