## This is a debugging IPython notebook

In [None]:
%load_ext autoreload
import argparse
import warnings
warnings.filterwarnings('ignore')

# Rebuild the command-line parser inside the notebook so that a hand-written
# argument list can be parsed instead of sys.argv.
arg_parser = argparse.ArgumentParser()

# Every entry is (option strings, keyword arguments); entries are registered
# in the order listed. The defaults are absolute paths on the author's machine.
argument_specs = [
    # # Add standard arguments
    # if not is_standalone:
    #     # If standalone, set in parser.
    (('--output_path',),
     dict(default='.',
          help='[Default is current directory] Path to output directory.')),

    # Arguments to generate the reference
    (('-r', '--reference'),
     dict(action='store_true',
          help='Just generate the reference dataset for mapping')),
    # The backslash continuation keeps the indentation of the second line
    # inside the help text, exactly as in the original literal.
    (('--min_species',),
     dict(type=int, default=30,
          help='Min number of species in selected orthologous groups. \
                            If not selected it will be estimated such that around 1000 OGs are available.')),
    (('--dna_reference',),
     dict(default='/Volumes/Untitled/eukaryotes.cdna.fa',
          help='Reference fasta file that contains nucleotide sequences.')),

    # Arguments to map the reads
    (('--ref_folder',),
     dict(default=None,
          help='Folder containing reference files with sequences sorted by species.')),
    (('--reads',),
     dict(default='/Users/daviddylus/Research/pore2tree/fritz_scripts/pipeline/Nano_reads/ERR1877969.fastq',
          help='Reads to be mapped to reference.')),
    # arg_parser.add_argument('--reads', nargs='2',default=None, help='Reads to be mapped to reference.')
]
for option_strings, option_kwargs in argument_specs:
    arg_parser.add_argument(*option_strings, **option_kwargs)


# Parse the arguments.
# reference = ['--ref_folder', '/Users/daviddylus/Research/pore2tree/pore2tree/tests/mapper/test1/db/']
argv = [
    '--output_path',
    '/Users/daviddylus/mnt/read2tree/mouse/read2tree_out/SRR5171076/wm_bg26',
    # Both read files are handed over as ONE space-separated string, so
    # args.reads is a single str rather than a list.
    '--reads',
    '/Users/daviddylus/mnt/read2tree/mouse/reads/illumina_hiseq_encode/SRR5171076_02X_0_0.fq '
    '/Users/daviddylus/mnt/read2tree/mouse/reads/illumina_hiseq_encode/SRR5171076_02X_1_0.fq',
]
args = arg_parser.parse_args(argv)
print(args)

In [None]:
import glob
# Root directory of one read2tree run; every file below it whose name ends in
# 'consensus.fa' is printed.
output_path = '/Users/daviddylus/mnt/read2tree/mouse/read2tree_out/SRR5171076/wm_bg26/'
consensus_pattern = output_path + '/**/*consensus.fa'
# '**' combined with recursive=True descends into all subdirectories.
for consensus_file in glob.iglob(consensus_pattern, recursive=True):
    print(consensus_file)

In [None]:
from Bio import AlignIO
import numpy as np
import re

# For every '05_*' folder below output_path, read each relaxed-PHYLIP alignment
# and, for records whose id contains 'SRR51', compute the fraction of non-gap
# characters that are not 'X'. The mean and standard deviation of these
# fractions are printed per folder.
# Relies on `glob` and `output_path` set by an earlier cell.
for folder in glob.iglob(output_path+'/05_*', recursive=True):
    print(folder)
    all_coverages = []

    for file in glob.iglob(folder+'/*.phy'):
        align = AlignIO.read(file, "phylip-relaxed")
        for record in align:
            if 'SRR51' not in record.id:
                continue
            # Plain replace is enough for a literal '-'; no regex required.
            seq = str(record.seq).replace('-', '')
            if not seq:
                # The row holds only gaps: its coverage is undefined and the
                # division below would raise ZeroDivisionError, so skip it.
                continue
            resolved = len(seq) - seq.count("X")
            all_coverages.append(resolved / len(seq))
    np_all_coverages = np.array(all_coverages)
    print(np.mean(np_all_coverages))
    print(np.std(np_all_coverages))


In [None]:
from Bio import SeqIO
import numpy as np
import re
import glob
# For every '04_*' folder of the nanopore run, compute per record (id contains
# 'merge') the fraction of characters that are not 'X', then print the mean and
# standard deviation for the folder.
output_path = '/Users/daviddylus/mnt/read2tree/mouse/read2tree_out/nanopore/wm_bg26/'
for folder in glob.iglob(output_path + '/04_*', recursive=True):
    print(folder)
    # Gap characters are not stripped here: any '-' stays in the sequence
    # length and counts as a non-'X' position.
    all_coverages = [
        (len(record.seq) - str(record.seq).count("X")) / len(record.seq)
        for fasta_path in glob.iglob(folder + '/*.fa')
        for record in SeqIO.parse(fasta_path, "fasta")
        if 'merge' in record.id
    ]
    np_all_coverages = np.array(all_coverages)
    print(np_all_coverages.mean())
    print(np_all_coverages.std())

In [None]:
from Bio import SeqIO
from zoo.wrappers.aligners import Mafft
from tqdm import tqdm
import glob
import numpy as np
import os
import re
# For each '04_*' mapping folder: collect, per OG FASTA file, the mapped
# records (id contains 'SRR51') followed by the records of the same-named file
# in `real` whose id contains 'MOUSE'; align each group with MAFFT and count
# the differing positions with get_align_diff.
# NOTE: get_align_diff is defined in a later cell of this notebook; that cell
# has to be executed before this one.
output_path='/Users/daviddylus/mnt/read2tree/mouse/read2tree_out/SRR5171076/wm_bg26/'
mapping='/Users/daviddylus/mnt/read2tree/mouse/read2tree_out/SRR5171076/wm_bg26/04_ogs_map_SRR5171076_20X_0_0/'
real='/Users/daviddylus/mnt/read2tree/mouse/read2tree_out/SRR5171076/m_bg25/01_ref_ogs_aa/'
all_diff = {}
for folder in glob.iglob(output_path+'/04_*', recursive=True):
    out_dict = {}
    print(folder)
    for file in tqdm(glob.iglob(folder+"/*.fa"), desc='Loading OGs ', unit=' ogs'):
        og_name = os.path.basename(file)
        to_check = [record for record in SeqIO.parse(file, "fasta")
                    if 'SRR51' in record.id]
        real_file = real + og_name
        if os.path.exists(real_file):
            for record in SeqIO.parse(real_file, "fasta"):
                if 'MOUSE' in record.id:
                    to_check.append(record)
                    # Only OGs that have a MOUSE reference record are aligned.
                    out_dict[og_name] = to_check
    align_dict = {}
    for key, value in tqdm(out_dict.items(), desc='Aligning OGs ', unit=' alignments'):
        mafft_wrapper = Mafft(value, datatype="PROTEIN")
        mafft_wrapper.options.options['--localpair'].set_value(True)
        mafft_wrapper.options.options['--maxiterate'].set_value(1000)
        alignment = mafft_wrapper()
        align_dict[key] = alignment

    # Compute every difference count exactly once and reuse it for both the
    # per-OG dict and the flat list used for the summary statistics.
    differences_dict = {key: get_align_diff(alignment)
                        for key, alignment in align_dict.items()}
    differences = list(differences_dict.values())
    all_diff[folder] = differences_dict
    diff_np = np.array(differences)
    print(diff_np.mean())
    print(diff_np.std())

In [None]:
# Dump all_diff to 'csvfile.csv', one row per (folder, OG):
#   <text after the last '_map_' in the folder path>,<OG key>,<difference count>
with open('csvfile.csv','w') as csv_out:
    for folder_path, per_og_diff in all_diff.items():
        run_label = folder_path.split('_map_')[-1]
        for og_name, n_diff in per_og_diff.items():
            row = ','.join([run_label, og_name, '{}'.format(n_diff)])
            csv_out.write(row + '\n')
    

In [None]:
def get_align_diff(alignment):
    """Count the columns in which the first aligned sequence differs from the second.

    Columns where the first sequence holds 'X' or a gap ('-') are ignored.
    Any other column counts as a difference when the second sequence has a
    different character there (including a gap or 'X').

    :param alignment: indexable object whose items 0 and 1 expose their
        aligned sequence through ``.seq``; both are expected to have the
        same length.
    :return: number of differing columns.
    """
    first = alignment[0].seq
    second = alignment[1].seq
    # Compare by value (!= / not in). The previous `is not 'X'` tested object
    # identity, which only worked by accident of string interning and raises
    # a SyntaxWarning on Python >= 3.8.
    return sum(
        1
        for a, b in zip(first, second)
        if a not in ('X', '-') and a != b
    )

In [None]:
import pyopa
# Align the first two records of every entry in out_dict with
# pyopa.align_double, using one of pyopa's default environments.
defaults = pyopa.load_default_environments()
envs = defaults['environments']
# NOTE(review): index 515 is hard-coded; what this environment represents is
# not evident from this notebook - confirm.
env = envs[515]
align_dict_pyopa = {
    og_name: pyopa.align_double(
        pyopa.Sequence(str(records[0].seq)),
        pyopa.Sequence(str(records[1].seq)),
        env,
    )
    for og_name, records in out_dict.items()
}

In [None]:
# Display the per-OG difference counts; the variable is rebuilt for every
# folder in the alignment loop, so this shows the last folder processed.
differences_dict

In [None]:
# Exploratory: list the names available on the pyopa module.
dir(pyopa)

In [None]:
import os
from tqdm import tqdm
import glob
from Bio import SeqIO, Seq, SeqRecord
from Bio.Alphabet import SingleLetterAlphabet
from Bio.SeqIO.FastaIO import FastaWriter
# Collect every record whose id contains 'merge' from the FASTA files in
# `mapping` and write them, unwrapped, into a single FASTA file.
mapping='/Users/daviddylus/mnt/read2tree/mouse/read2tree_out/nanopore/wm_bg26/04_ogs_map_merge_2X_0_0/'
to_check_map = []
to_check_original = []
for file in tqdm(glob.iglob(mapping+"*.fa"), desc='Loading OGs ', unit=' ogs'):
    for record in SeqIO.parse(file, "fasta"):
        if 'merge' in record.id:
            to_check_map.append(record)
#     if os.path.exists(real+os.path.basename(file)):
#         ori_rec = SeqIO.parse(real+os.path.basename(file), "fasta")
#         for record in ori_rec:
#             if 'MOUSE' in record.id:
#                 to_check_original.append(record)

# Use a context manager so the output is flushed and the handle closed even if
# writing fails; the handle was previously left open.
with open('/Users/daviddylus/mnt/read2tree/mouse/read2tree_out/nanopore/blast/2X_map_nanopore.fa', "w") as handle:
    writer = FastaWriter(handle, wrap=None)
    writer.write_file(to_check_map)

# with open('/Users/daviddylus/Desktop/01X_wr_original.fa', "w") as handle:
#     writer = FastaWriter(handle, wrap=None)
#     writer.write_file(to_check_original)

In [None]:
import os
from tqdm import tqdm
import glob
from Bio import SeqIO, Seq, SeqRecord
from Bio.Alphabet import SingleLetterAlphabet
from Bio.SeqIO.FastaIO import FastaWriter
# Collect every record whose id contains 'MOUSE' from the reference FASTA
# files in `real` and write them, unwrapped, into a single FASTA file.
mapping='/Users/daviddylus/mnt/read2tree/mouse/read2tree_out/illumina/SRR5171076/wm_bg26/without_mapping_to_rat/04_ogs_map_SRR5171076_01X_0_0/'
real='/Users/daviddylus/mnt/read2tree/mouse/read2tree_out/illumina/SRR5171076/m_bg25/01_ref_ogs_aa/'
to_check_map = []
to_check_original = []
for file in tqdm(glob.iglob(real+"*.fa"), desc='Loading OGs ', unit=' ogs'):
    for record in SeqIO.parse(file, "fasta"):
        if 'MOUSE' in record.id:
            to_check_original.append(record)

# with open('/Users/daviddylus/Desktop/01X_wr_map.fa', "w") as handle:
#     writer = FastaWriter(handle, wrap=None)
#     writer.write_file(to_check_map)

# Use a context manager so the output is flushed and the handle closed even if
# writing fails; the handle was previously left open.
with open('/Users/daviddylus/Desktop/original.fa', "w") as handle:
    writer = FastaWriter(handle, wrap=None)
    writer.write_file(to_check_original)

In [None]:
from tables import *
from pyoma.browser import db
import glob
from Bio import SeqIO, Seq, SeqRecord

# Open the local OMA browser HDF5 file read-only and wrap it in the pyoma
# Database / OmaIdMapper objects. h5_file is left open because bdb is built on it.
bdb_source = 'h5'
oma_h5_path = "/Users/daviddylus/mnt/omabrowser/OmaServer.h5"
h5_file = open_file(oma_h5_path, mode="r")
bdb = db.Database(h5_file)
bdb_id_map = db.OmaIdMapper(bdb)

In [None]:
# Gather the 'UniProtSpeciesCode' of every row in the genome table, decoded
# from bytes to str.
species_list = []
for genome_row in bdb_id_map.genome_table:
    species_list.append(genome_row['UniProtSpeciesCode'].decode("utf-8"))

# Membership probe: prints 'bla' when 'CANAV' is among the codes, 'blub' otherwise.
print('bla' if 'CANAV' in species_list else 'blub')

# print(species_list)

In [None]:

# Walk over all marker-gene FASTA files and derive a species label for each
# record. The label is only computed here; it is neither stored nor printed.
marker_gene_pattern = "/Users/daviddylus/Research/read2tree/read2tree/tests/marker_genes/*.fa"
for fasta_path in glob.iglob(marker_gene_pattern):
    for record in SeqIO.parse(fasta_path, "fasta"):
        description = record.description
        # Text between the first '[' and the first ']' of the description.
        species = description[description.find("[") + 1:description.find("]")]
        name_parts = species.split(" ")
        if len(name_parts) > 1:
            # Upper-cased code built from 3 letters of the first word and
            # 2 letters of the second.
            new_id = name_parts[0][0:3] + name_parts[1][0:2]
            species = new_id.upper()


In [18]:
import os
from tqdm import tqdm
import glob
from Bio import SeqIO, Seq, SeqRecord
from Bio.Alphabet import SingleLetterAlphabet
from Bio.SeqIO.FastaIO import FastaWriter
# For every OG FASTA file in `folder` that also exists (same file name) in
# `real`, load the records from `real`, replace each record id by the text in
# square brackets of its description, and store the list under the file name.
folder = '/Users/daviddylus/mnt/read2tree/canva/read2tree/01_ref_ogs_aa/'
real = '/Users/daviddylus/mnt/read2tree/canva/oma/dataset1/Output_with_CANVA/OrthologousGroupsFasta/'
for_alignment = {}
for og_path in tqdm(glob.iglob(folder+"/*.fa"), desc='Loading OGs ', unit=' ogs'):
    og_name = os.path.basename(og_path)
    real_path = real + og_name
    if not os.path.exists(real_path):
        continue
    # Materialise the records so the id changes below persist in the stored list.
    ori_rec = list(SeqIO.parse(real_path, "fasta"))
    for record in ori_rec:
        desc = record.description
        record.id = desc[desc.find("[")+1:desc.find("]")]
    for_alignment[og_name] = ori_rec



Loading OGs : 0 ogs [00:00, ? ogs/s][A
Loading OGs : 1 ogs [00:02,  2.56s/ ogs][A
Loading OGs : 3 ogs [00:02,  1.81s/ ogs][A
Loading OGs : 5 ogs [00:02,  1.29s/ ogs][A
Loading OGs : 7 ogs [00:02,  1.08 ogs/s][A
Loading OGs : 9 ogs [00:03,  1.50 ogs/s][A
Loading OGs : 11 ogs [00:03,  2.05 ogs/s][A
Loading OGs : 13 ogs [00:03,  2.78 ogs/s][A
Loading OGs : 1604 ogs [01:51, 14.36 ogs/s]


In [None]:
from zoo.wrappers.aligners import Mafft

# Align the records of every OG in for_alignment with MAFFT
# (--localpair, --maxiterate 1000) and collect the alignments in order.
align_list = []
counter = 0
og_progress = tqdm(for_alignment.items(), desc='Loading OGs ', unit=' ogs')
for og_name, og_records in og_progress:
    mafft_wrapper = Mafft(og_records, datatype="PROTEIN")
    mafft_wrapper.options.options['--localpair'].set_value(True)
    mafft_wrapper.options.options['--maxiterate'].set_value(1000)
    align_list.append(mafft_wrapper())
    # Counted only after a successful alignment, so `counter` always equals
    # the number of finished alignments, even if the loop is interrupted.
    counter += 1
    if not counter % 100:
        print('{} of alignments done'.format(counter))

Loading OGs :   6%|▌         | 100/1604 [04:18<38:03,  1.52s/ ogs] 

100 of alignments done


Loading OGs :  12%|█▏        | 200/1604 [08:38<1:13:11,  3.13s/ ogs]

200 of alignments done


Loading OGs :  19%|█▊        | 300/1604 [14:53<1:16:00,  3.50s/ ogs]

300 of alignments done


Loading OGs :  25%|██▍       | 400/1604 [19:21<44:19,  2.21s/ ogs]  

400 of alignments done


Loading OGs :  31%|███       | 500/1604 [26:26<1:00:02,  3.26s/ ogs]

500 of alignments done


Loading OGs :  37%|███▋      | 600/1604 [30:55<40:00,  2.39s/ ogs]  

600 of alignments done


Loading OGs :  44%|████▎     | 700/1604 [36:17<48:13,  3.20s/ ogs]  

700 of alignments done


Loading OGs :  50%|████▉     | 800/1604 [41:12<29:21,  2.19s/ ogs]  

800 of alignments done


Loading OGs :  56%|█████▌    | 900/1604 [49:08<15:13,  1.30s/ ogs]  

900 of alignments done


Loading OGs :  62%|██████▏   | 1000/1604 [56:56<31:09,  3.10s/ ogs] 

1000 of alignments done


Loading OGs :  69%|██████▊   | 1100/1604 [1:04:02<18:30,  2.20s/ ogs]  

1100 of alignments done


Loading OGs :  75%|███████▍  | 1200/1604 [1:11:24<14:31,  2.16s/ ogs]  

1200 of alignments done


Loading OGs :  81%|████████  | 1300/1604 [1:17:42<37:48,  7.46s/ ogs]  

1300 of alignments done


Loading OGs :  87%|████████▋ | 1400/1604 [1:24:03<24:10,  7.11s/ ogs]

1400 of alignments done


Loading OGs :  94%|█████████▎| 1500/1604 [1:31:44<07:48,  4.50s/ ogs]

1500 of alignments done


Loading OGs :  96%|█████████▌| 1537/1604 [1:34:40<05:44,  5.15s/ ogs]