In [1]:
from mgkit.io import gff
from mgkit import kegg
import mgkit
import mgkit.plots
from collections import Counter
from glob import glob
import seaborn as sns
import pandas as pd
from mgkit.utils import dictionary
import itertools
import networkx as nx
from mgkit import graphs
import os, sys
from collections import Counter
from collections import defaultdict
import scipy as sp
import numpy as np
import scipy.stats
import datetime
import timeit
import re
import platform
import getpass
import argparse

mgkit.logger.config_log()

#On Server
#input_dir=""
#output_dir=""
user=getpass.getuser()
if "windows" in platform.platform().lower():
    windows=True
else:
    windows=False
if windows:
    core=os.path.join("C:\Users",user)
    g_drive="Google Drive\Honours"
else:
    core=os.path.join("/home",user)
    g_drive="grive/Honours"
print core

# All locations below are assembled with os.path.join so they work on any OS.

# GFF annotations (an earlier data set lived in "gff_bins-2016-06-14").
gff_dir=os.path.join(core,"Documents","Hons","Seaquence","francesco_data","gff_bins-2016-06-14b")

# Root of the metabolic analysis; most inputs and outputs live underneath it.
output_dir=os.path.join(core,g_drive,"metabolic_analysis")

# Bin ID / taxonomy tables.
tax_file=os.path.join(output_dir,"ID_TAX_BINS_TEMP.txt")
shortened_tax_file=os.path.join(output_dir,"Non_uniq_shortened_ID.txt")
uniq_tax_file=os.path.join(output_dir,"BIN_uniq_shortened_ID.txt")

# KO hit tables (an earlier microbial table was "Microbial_KO_mapping_protID.txt").
ko_hits_dir=os.path.join(output_dir,"KO_hits")
coral_kegg=os.path.join(ko_hits_dir,"plut.pathways.txt")
symbiodinium_kegg=os.path.join(ko_hits_dir,"SymbC15_firstpass_ko_mapping_protID_distinct_KO.txt")
microbial_kegg=os.path.join(ko_hits_dir,"TREMBL_SWISSPROT_Microbial_KO_mapping_protID.txt")
all_kegg=os.path.join(ko_hits_dir,"all_kos.txt")

# List of compounds read by load_cmn_cpds.
cmn_cpds=os.path.join(output_dir,"Automated_Network_Analyses","boring_cps.txt")

# Local KEGG tables and plot output.
database_dir=os.path.join(output_dir,"Databases")
plots_dir=os.path.join(output_dir,'Plots')

# Abundance, coverage and genome quality tables.
misc_dir=os.path.join(output_dir,"Misc_files")
abundance_file=os.path.join(misc_dir,"id_trimmed_relative_enriched_bin_abundance.tsv")
raw_coverage_contig=os.path.join(misc_dir,"bin_contig_sep_coverages.tsv")
completeness_contamination=os.path.join(misc_dir,"Completeness_Contamination_data.txt")

# HMM search results and gene hits for the eukaryote-like repeats.
hmm_dir=os.path.join(core,g_drive,"HMM_searches","Symbioses_test","euk_repeat_results")
gene_dir=os.path.join(core,g_drive,"eukaryote_like_repeats","gene_hits")

def load_cmn_cpds(cmn_cpds):
    '''Collects the first tab-separated field of every line of a file.

    Input:
        cmn_cpds - Path to a text file with one entry per line.
    Output:
        A set with the first column of each line (whitespace-stripped).'''
    with open(cmn_cpds,'r') as handle:
        return set(row.strip().split("\t")[0] for row in handle)
            
# Set of first-column entries from the cmn_cpds file, loaded once when this cell runs.
common_cpds=load_cmn_cpds(cmn_cpds)

# General Data loading

def store_local_kegg_item_keys(kegg_items,database_dir):
    kc=kegg.KeggClientRest()
    all_item_names={}
    illegal_pairs=[("compound","orthology"),("orthology","compound")]
    for kegg_item in kegg_items:
        key_name=os.path.join(database_dir,"{0}_readable_names.tsv".format(kegg_item))
        if not os.path.isfile(key_name):
            item_names=kc.get_ids_names(kegg_item)
            save_readable_key(key_name,item_names,kegg_item)
            all_item_names[kegg_item]=item_names.keys()
        else:
            all_item_names[kegg_item]=load_readable_names(database_dir,[kegg_item],False)[kegg_item].keys()
    for kegg_item_1, kegg_item_2 in itertools.permutations(all_item_names.iterkeys(),2):
        print "considering the pair: {0}, {1}".format(kegg_item_1,kegg_item_2)
        if (kegg_item_1,kegg_item_2) not in illegal_pairs:
            shared_key_name=os.path.join(database_dir,"{0}_linked_{1}_database.tsv").format(kegg_item_1, kegg_item_2)
            if not os.path.isfile(shared_key_name):
                print "The processing of pair: {0},{1} has begun.".format(kegg_item_1,kegg_item_2)
                kc=kegg.KeggClientRest()
                linked_ids=kc.link_ids(kegg_item_2,all_item_names[kegg_item_1])
                save_key_pairings(shared_key_name,linked_ids,(kegg_item_1,kegg_item_2))
        else:
            pass
    return

def _read_kegg_link_table(file_name):
    '''Parses one "<id><TAB><id;id;...>" link table (header skipped) into {id: [linked ids]}.'''
    links={}
    with open(file_name) as kegg_links:
        next(kegg_links,None) #Skip the header (tolerates an empty file)
        for line in kegg_links:
            line=line.strip()
            if not line:
                continue
            item_1,item_2=line.split("\t")
            links[item_1]=item_2.split(";")
    return links

def load_local_kegg_database_pairings(database_dir,kegg_item_pairs, process_all):
    '''Loads the local databases of kegg_item_1, kegg_item_2 pairings and return a dictionary of
    these pairings in the form kegg_item_1:kegg_items_2 (There can be more than one linked item). This
    loading is based on the earlier use of mgkits kc.link_ids to store all of the pairings needed.
    
    Input:
        database_dir   - The directory with the databases
        kegg_item_pairs- A list of kegg item pairs to load (tuples or lists of two names).
                         Pairs whose file does not exist are silently left out.
        process_all    - A boolean decision as whether to load all existing pairs. When True,
                         kegg_item_pairs is ignored and every "*_linked_*_database.tsv" is read.
        
    Output: A dictionary linking either all existing kegg item pairs or just those specified. It has the form
    dict[item_1,item_2]={kegg_item_1:kegg_2_items}. Keys are always tuples.'''
    linking_dictionary={}
    if process_all:
        for file_name in glob(os.path.join(database_dir,'*database.tsv')):
            db_file=os.path.basename(file_name)
            if "_linked_" not in db_file:
                # Not a pairing table; its name cannot be split into two items.
                continue
            name_parts=db_file.split("_linked_")
            kegg_1=name_parts[0]
            kegg_2=name_parts[1].split("_database")[0]
            linking_dictionary[(kegg_1,kegg_2)]=_read_kegg_link_table(file_name)
        return linking_dictionary
    
    for kegg_item_pair in kegg_item_pairs:
        # Lists are unhashable, so normalise every pair to a tuple before using it as a key.
        pair_key=tuple(kegg_item_pair)
        file_name=os.path.join(database_dir,"{0}_linked_{1}_database.tsv".format(pair_key[0], pair_key[1]))
        if os.path.isfile(file_name):
            linking_dictionary[pair_key]=_read_kegg_link_table(file_name)
    return linking_dictionary
                    
def _read_kegg_name_table(file_name):
    '''Parses one "<id><TAB><name>" table (header skipped) into {id: name}.'''
    names={}
    with open(file_name) as kegg_descriptions:
        next(kegg_descriptions,None) #Skip the header (tolerates an empty file)
        for line in kegg_descriptions:
            line=line.strip()
            if not line:
                continue
            item_1,item_2=line.split("\t")
            names[item_1]=item_2
    return names

def load_readable_names(database_dir,kegg_items,process_all):
    '''Loads in the readable names for a specified kegg item from a list of databases.
    Input:
        database_dir        -  The directory with the databases.
        kegg_items          -  The kegg items to get the readable mapping for.
        process_all         -  Boolean - Should the function retrieve all available databases.
                               When True, kegg_items is ignored.
    Output:
        readable_item_dict  -  A dictionary of the form {kegg_item: {KEGG_ID: Readable name}}
    Raises:
        IOError if a requested item has no "<item>_readable_names.tsv" file.'''
    suffix="_readable_names.tsv"
    readable_item_dict={}
    if process_all:
        for file_name in glob(os.path.join(database_dir,'*'+suffix)):
            desc_file=os.path.basename(file_name)
            kegg_item=desc_file.split(suffix)[0]
            readable_item_dict[kegg_item]=_read_kegg_name_table(file_name)
        return readable_item_dict
    
    for kegg_item in kegg_items:
        file_name=os.path.join(database_dir,'{0}_readable_names.tsv'.format(kegg_item))
        readable_item_dict[kegg_item]=_read_kegg_name_table(file_name)
    return readable_item_dict
            
            
def save_readable_key(key_name,item_names,kegg_item):
    '''Writes a {KEGG_ID: readable name} mapping to a two-column tab-separated file.

    Input:
        key_name    - Path of the file to write.
        item_names  - Dictionary of KEGG_ID: readable name pairs. May be empty, in
                      which case only the header line is written.
        kegg_item   - Name of the kegg item; used as the header of the first column.
    Output:
        None'''
    # Passing the column names to the constructor (rather than assigning them
    # afterwards) keeps an empty mapping from raising a length-mismatch error.
    df=pd.DataFrame(list(item_names.items()),columns=[kegg_item,"Description"])
    df.to_csv(key_name,sep="\t",index=False)
    return None

def save_key_pairings(shared_key_name,item_links,kegg_item_tuple):
    '''Writes a {KEGG_ID: [linked KEGG_IDs]} mapping to a two-column tab-separated file.

    Input:
        shared_key_name  - Path of the file to write.
        item_links       - Dictionary of KEGG_ID: iterable of linked IDs. The linked IDs
                           are joined with ";". May be empty, in which case only the
                           header line is written.
        kegg_item_tuple  - The two kegg item names; used as the column headers.
    Output:
        None'''
    rows=[[item,";".join(linked)] for item, linked in item_links.items()]
    # Passing the column names to the constructor keeps an empty mapping from
    # raising a length-mismatch error.
    df=pd.DataFrame(rows,columns=[kegg_item_tuple[0],kegg_item_tuple[1]])
    df.to_csv(shared_key_name,sep="\t",index=False)
    return None

def remove_ko_pth_hits(file_path):
    '''Copies a KO/pathway table while dropping the pathway IDs that start with "ko".

    Every input line must have the form "<KO><TAB><pathway;pathway;...>". The filtered
    table is written to "temp.tsv" in the same directory as file_path; the input file
    itself is left untouched.

    Input:
        file_path  - Path to the tab-separated KO/pathway file.
    Output:
        None'''
    out_path=os.path.join(os.path.dirname(file_path),"temp.tsv")
    # Both handles are managed by the with-statement so the output file is
    # closed even when a malformed line raises part way through.
    with open(file_path,'r') as KO_PTH_pairs, open(out_path,'w') as temp_file:
        for line in KO_PTH_pairs:
            KO,pathways=line.strip().split("\t")
            kept=[pathway for pathway in pathways.split(";") if not pathway.startswith("ko")]
            temp_file.write("{0}\t{1}\n".format(KO,";".join(kept)))
    
def make_local_rcn_eqn_database(database_dir):
    '''Downloads the equations of all locally known reactions and stores them as a TSV.

    The reaction IDs come from the local "reaction" readable-names table; the result
    is written to "reaction_equation_links.tsv" inside database_dir.

    Input:
        database_dir  - The directory with the databases.
    Output:
        None'''
    client=kegg.KeggClientRest()
    reaction_names=load_readable_names(database_dir,["reaction"],False)["reaction"]
    equations=client.get_reaction_equations(reaction_names.keys(),max_len=10)
    target=os.path.join(database_dir, "reaction_equation_links.tsv")
    rcn_eqn_pd_df(equations).to_csv(target,sep="\t",index=False)
    return

def rcn_eqn_pd_df(rcn_eqn_dict):
    '''Flattens a {reaction: {side: [compounds]}} dictionary into a three column table.

    Each reaction must map to a dictionary with exactly two values (the compound lists
    of the two sides, taken in the dictionary's own value order). Reactions for which
    both lists are empty are left out.

    Input:
        rcn_eqn_dict  - Dictionary of reaction ID: {side: list of compound IDs}.
    Output:
        A DataFrame with the columns Kegg_rcn_ID, side_1_cpds and side_2_cpds, where
        the compound lists are joined with ";".'''
    rows=[]
    for rcn, cpds in rcn_eqn_dict.items():
        in_cpds,out_cpds=cpds.values()
        if in_cpds==[] and out_cpds==[]:
            continue
        rows.append([rcn, ";".join(in_cpds), ";".join(out_cpds)])
    df=pd.DataFrame(rows)
    df.columns=["Kegg_rcn_ID","side_1_cpds","side_2_cpds"]
    return df

def load_local_rcn_eqn_database(database_dir):
    '''Loads the local reaction equation table as a nested dictionary.

    Input:
        database_dir  - The directory holding "reaction_equation_links.tsv".
    Output:
        A dictionary of the form {Kegg_rcn_ID: {column name: ";"-joined compounds}},
        with missing cells replaced by empty strings.'''
    table_path=os.path.join(database_dir,"reaction_equation_links.tsv")
    table=pd.read_csv(table_path,sep="\t").fillna('')
    by_reaction=table.set_index("Kegg_rcn_ID")
    return by_reaction.T.to_dict(orient='dict')

def load_local_rcn_eqn_database_set(database_dir):
    '''Loads the local reaction equation table with each side as a set of compounds.

    Input:
        database_dir  - The directory with the databases.
    Output:
        A dictionary of the form {Kegg_rcn_ID: {side: set of compound IDs}}. A side
        stored as an empty string becomes a set holding one empty string.'''
    rcn_sets={}
    for rcn, pairs in load_local_rcn_eqn_database(database_dir).items():
        sides={}
        for side, cpds in pairs.items():
            sides[side]=set(cpds.split(";"))
        rcn_sets[rcn]=sides
    return rcn_sets
            
#Load in the coral data
def load_bin_names(tax_file):
    '''Reads a two-column (taxonomy, bin ID) table into a {bin ID: taxonomy} dictionary.

    Input:
        tax_file  - Path to a tab-separated file with a header line followed by
                    "<taxonomy><TAB><bin ID>" rows.
    Output:
        A dictionary of bin ID: taxonomy pairs.'''
    bin_names={}
    with open(tax_file,'r') as bin_tax_pair:
        bin_tax_pair.readline() #Skip the header
        for line in bin_tax_pair:
            taxonomy,bin_id=tuple(line.strip().split("\t"))
            bin_names[bin_id]=taxonomy
    return bin_names

def make_local_complete_module_info_db(database_dir):
    '''Creates a local database of the module definitions.'''
    all_modules=load_readable_names(database_dir,["module"],False)["module"].keys()
    kc=kegg.KeggClientRest()
    entries={}
    max_len=10
    post_processed_defs={}
    print "There are a total of {0} modules to parse".format(len(all_modules))
    N_modules=len(all_modules)
    for i in xrange(0,N_modules,max_len):
        if N_modules-i<max_len:
            n_entries=N_modules-i
        else:
            n_entries=max_len      
        query="+".join(all_modules[i:i+n_entries])
        kegg_entries=kc.get_entry(query)
        hits=re.findall("\nDEFINITION(.*)\n",kegg_entries)
        print i, i+max_len-1,"n_hits:{0}".format(len(hits))
        #if len(hits)!=max_len:
        #    print m
        for module, definition in itertools.izip(all_modules[i:i+10],hits):
            new_def=definition.strip().replace(" --"," ").replace("-- "," ").replace("  "," ").strip()
            entries[module]=new_def
            if "M" in definition:
                print module,definition
                post_processed_defs[module]=new_def
        #These post_processed modules should be modules defined in terms of other modules.
    for module, definition in post_processed_defs.iteritems():
        new_def=definition
        print "This is the definition being considered.", new_def
        new_defs=re.split("[, +-]",definition)
        for item in new_defs:
            simp_item=item.strip(")").strip("(")
            if simp_item.startswith("M"):
                print "This is the current item",item
                new_def=new_def.replace(simp_item,"("+entries[simp_item]+")")
        print new_def
        entries[module]=new_def
        
    temp_entries=entries
    protein_complexes='(.)?([K][0-9]+[+]){1,}[K][0-9]+(.)?'
    for module, definition in temp_entries.iteritems():
        for match in re.finditer(protein_complexes, definition):
            match_str=match.group()
            if match_str.startswith("(") and match_str.endswith(")"):
                pass
            elif match_str[-1].isdigit() and match_str[0]=="K":
                match_str=match_str[:] #Trim random end characters
                new_match="("+match_str+")"
                entries[module]=entries[module].replace(match_str,new_match)
            elif match_str[0]=="K":
                match_str=match_str[0:-1] #Trim random end characters
                new_match="("+match_str+")"
                entries[module]=entries[module].replace(match_str,new_match)
            elif match_str[-1].isdigit():
                match_str=match_str[1:] #Trim random end characters
                new_match="("+match_str+")"
                entries[module]=entries[module].replace(match_str,new_match)
            else:
                match_str=match_str[1:-1] #Trim random end characters
                new_match="("+match_str+")"
                entries[module]=entries[module].replace(match_str,new_match)
              
        if i%100==0:
            kc=kegg.KeggClientRest()
            print module
    print "{0} modules were parsed".format(len(entries))
    df=pd.DataFrame([
            [module,entry] for module,entry in entries.iteritems()
        ])
    df.columns=["Kegg_id","Kegg_definition"]
    df.to_csv(os.path.join(database_dir,"Module_definitions_pairs_db.tsv"),sep="\t",index=None)
    
    return

def load_local_complete_module_info_db(database_dir):
    '''Loads the local module definition table.

    Input:
        database_dir  - The directory holding "Module_definitions_pairs_db.tsv".
    Output:
        A dictionary of module ID: KEGG definition pairs (the header line is skipped).'''
    db_path=os.path.join(database_dir,"Module_definitions_pairs_db.tsv")
    with open(db_path) as definitions:
        next(definitions) #Skip the header
        return dict(line.strip().split("\t") for line in definitions)

def make_new_trusted_database(database_dir):
    '''
    Definition:
        This function will take an entire kegg module definition file and will create
        a new local database with the expressions written so that they can simple be
        evaluated when loading the files.
    Input: 
        database_dir: str
            A directory containing the database of kegg module definitions.
    Output:
        None
    Calls:
        replacement: Turns kegg definitions in logical nested tuples of sets.
    '''
    old_module_def=load_local_complete_module_info_db(database_dir)
    #print "This is the old module information",old_module_def
    new_pd_df=[""]*len(old_module_def)
    i=0
    for module,definition in old_module_def.iteritems():
        try:
            logical_evaluation=replacement(definition,False)[1]
            new_pd_df[i]=[module,logical_evaluation]
            i+=1
                
        except TypeError:
            print "TypeError 2:",module, definition
        except SyntaxError:
            print "SyntaxError 2:",module, definition
        except NameError:
            print "NameError 2:",module ,definition
    #print new_pd_df      
    new_pd_df=pd.DataFrame(new_pd_df)
    #print "The second checkpoint."
    new_pd_df.columns=["ModuleID","KEGG_log_expr"]
    
    new_pd_df.to_csv(os.path.join(database_dir, "module_kegg_log_expr.tsv"),header=True,sep="\t",index=False)
    return None

def fix_module_orthology_pairs(database_dir,fixed_file_name="module_linked_orthology_fixed.tsv"):
    '''Replaces the occurences of modules in the module-orthology links to their corresponding KOs

    Entries of a module's link list that start with "m"/"M" are treated as module IDs
    and replaced by that module's own link list (one level only); the merged list is
    de-duplicated and sorted. The complete table is then written to fixed_file_name
    inside database_dir.

    Input:
        database_dir     - The directory with the databases.
        fixed_file_name  - Name of the output file. NOTE(review): the original code had
                           no file name at all (it tried to write to the directory), so
                           this default is a guess - confirm the intended name.
    Output:
        None
    Raises:
        KeyError if the module-orthology table is missing, or if a referenced module
        has no entry of its own.'''
    # The pair must be a tuple: it is used as a dictionary key by the loader.
    kegg_item_tuple=("module","orthology")
    all_pairs=load_local_kegg_database_pairings(database_dir,[kegg_item_tuple], False)[kegg_item_tuple]
    new_pairings=[]
    for module,kos in all_pairs.items():
        new_items=[]
        rand_module=False
        for item in kos:
            if item.lower().startswith("m"):
                new_items.extend(all_pairs[item])
                rand_module=True
            else:
                new_items.append(item)
        if rand_module:
            new_pairings.append((module, sorted(set(new_items))))
    # Applied after the scan so every expansion uses the original link lists.
    for (module, new_items) in new_pairings:
        all_pairs[module]=new_items
        
    df=pd.DataFrame(
        [[col1,";".join(col2)] for col1, col2 in all_pairs.items()],
        columns=[kegg_item_tuple[0],kegg_item_tuple[1]]
    )
    fixed_name=os.path.join(database_dir,fixed_file_name)
    df.to_csv(fixed_name,sep="\t",index=False)
    return
        
def load_local_cleaned_definition_db(database_dir):
    '''Loads the evaluated module expressions from "module_kegg_log_expr.tsv".

    Input:
        database_dir  - The directory with the databases.
    Output:
        A dictionary of module ID: tuple pairs. An expression that does not evaluate
        to a tuple is wrapped in a one-element tuple.'''
    expr_path=os.path.join(database_dir,"module_kegg_log_expr.tsv")
    cleaned_db={}
    with open(expr_path) as paired_exprs:
        next(paired_exprs) #Skip header
        for line in paired_exprs:
            module,expr=line.split("\t")
            # NOTE(security): eval executes whatever the file contains; only load
            # databases that were generated locally and are trusted.
            kegg_log=eval(expr)
            cleaned_db[module]=kegg_log if isinstance(kegg_log,tuple) else (kegg_log,)
            
    return cleaned_db

# For remaking graphs

# Named groups of KEGG pathway map IDs. Each key is a group label and each value is the
# list of pathway IDs in that group. The commented-out entries record the individual
# pathways that were merged into the combined groups below them.
pathways = {
    'carbon': ['map01200'],
    'nitrogen-sulfur-fatty_acid-photosynthesis': ['map00910', 'map00920', 'map01212', 'map00195'],
    'oxidative_phosphorylation': ['map00190'],
    'two-component': ['map02020'],
    'amino-acids':['map01230'],
    #'thiamine-metabolism':'map00730',
    #'riboflavin-metabolism':'map00740',
    #'Vitamin-B6-metabolism':'map00750',
    #'Nicotinate&Nicotinamide-metabolism':'map00760',
    #'PantoThenate and CoA Biosynthesis':'map00770',
    #'Biotin-Metabolism':'map00780',
    #'Lipoic-Acid-Metabolism':'map00785',
    #'Folate-Biosynthesis':'map00790',
    #'OneCarbonPoolByFolate':'map00670',
    #'retinol-metabolism-animals':'map00830',
    #'porphyrin&ChlorophyllMetabolism':'map00860',
    #'Ubiquinone&OtherTerpenoid-QuinoneBiosynthesis':'map00130',
    'vitamins&cofactors':['map00730','map00740','map00750','map00760','map00770','map00780','map00785','map00790','map00670','map00830','map00860','map00130'],
    #"Alanine, aspartate and glutamate metabolism":'map00250',
    #"Cysteine and methionine metabolism":'map00270',
    #"Glycine, serine and threonine metabolism":'map00260',
    #"Valine, leucine and isoleucine degradation":'map00280',
    #"Valine, leucine and isoleucine biosynthesis":'map00290',
    #"Lysine biosynthesis":'map00300',
    #"Lysine degradation":'map00310',
    #"Arginine biosynthesis":'map00220',
    #"Arginine and proline metabolism":'map00330',
    #"Histidine metabolism":'map00340',
    #"Tyrosine metabolism":'map00350',
    #"Phenylalanine metabolism":'map00360',
    #"Tryptophan metabolism":'map00380',
    #"Phenylalanine, tyrosine and tryptophan biosynthesis":'map00400',
    # NOTE(review): map00250 (listed above) is not part of the combined list below -
    # confirm whether that omission is intended.
    "AminoAcidMetabolism":['map00270','map00260','map00280','map00290','map00300','map00310','map00220',\
                           'map00330','map00340','map00350','map00360','map00380','map00400']
    #"beta-Alanine metabolism":'map00410',
    #"Taurine and hypotaurine metabolism":'map00430',
    #"Phosphonate and phosphinate metabolism":'map00440',
    #"Selenocompound metabolism":'map00450',
    #"Cyanoamino acid metabolism":'map00460',
    #"D-Glutamine and D-glutamate metabolism":'map00471',
    #"D-Arginine and D-ornithine metabolism":'map00472',
    #"D-Alanine metabolism":'map00473',
    #"Glutathione metabolism":'map00480',
   # "Metabolisms of other amino acids":['map00410','map00430','map00440','map00450','map00460','map00471','map00472','map00473','map00480']
    ,"Glycosaminoglycan degradation & Synthesis":["map00531","map00532","map00534"] ,
    "Bacterial Secretion Systems":["ko03070"],
    "phosphotransferase system (PTS)":["ko02060"],
    "ABC transporters":["ko02010"],
    "N-Glycan biosynthesis": ["map00510"],
    "CationicAntiomicrobialPeptide_CAMP_resistance":["map01503"],
    "Vancomycin_Beta-lactamResistance":["map01502","map01501"],
    "Sulfatases":[]

}

# Rewrite every ID so that "ko" is replaced by "map" (e.g. "ko03070" becomes "map03070"),
# giving all groups the same ID prefix.
pathways ={key:[item.replace("ko","map") for item in items] for key,items in pathways.iteritems()}

# {bin ID: taxonomy} mapping read from tax_file when this cell runs.
bin_names=load_bin_names(tax_file)

def pathway_to_modules(pathway_dict,database_dir):
    '''Translates groups of pathway IDs into groups of linked module IDs.

    Input:
        pathway_dict  - Dictionary of group name: list of pathway IDs.
        database_dir  - The directory with the local pathway-module link database.
    Output:
        A dictionary of group name: list of the unique module IDs linked to any of the
        group's pathways. Pathways without an entry in the link database are ignored.'''
    links=load_local_kegg_database_pairings(database_dir,[("pathway","module")], False)["pathway","module"]
    modules_by_group={}
    for path, pathway in pathway_dict.items():
        linked=[links[egx] for egx in pathway if egx in links]
        modules_by_group[path]=list(set(itertools.chain(*linked)))
    return modules_by_group

# Module IDs linked to each pathway group of `pathways`, resolved from the local database.
MO_pathways=pathway_to_modules(pathways,database_dir)

####################################
# Abundance parsing
####################################
def load_relative_abundance(file_name):
    '''Loads a tab-separated abundance table indexed by genome.

    The first column is used as the index and a trailing "_genomic" is removed from
    each index label. Only the exact suffix is removed: str.strip("_genomic") treats
    its argument as a set of characters and also eats matching characters at the
    start of the label, so it is deliberately not used here.

    Input:
        file_name  - Path to the tab-separated table (first row is the header).
    Output:
        A DataFrame with one row per genome.'''
    suffix="_genomic"
    # read_csv with index_col=0 replaces DataFrame.from_csv, which newer pandas
    # releases no longer provide.
    abundance_data=pd.read_csv(file_name,sep="\t",index_col=0)
    abundance_data.index=[
        name[:-len(suffix)] if hasattr(name,"endswith") and name.endswith(suffix) else name
        for name in abundance_data.index
    ]
    return abundance_data


def rel_abundance_to_dict(df):
    '''Converts an abundance table into a nested dictionary.

    Input:
        df  - DataFrame with one row per genome and one column per sample.
    Output:
        A dictionary of the form {genome: {column name: abundance}}, with every
        abundance rounded to 9 decimal places.'''
    abundance_dict={}
    column_names=df.columns.values
    for genome, rel_abund in df.iterrows():
        # Pair columns and values by position; indexing the row with an integer
        # would be a label lookup whenever the column labels are integers.
        abundance_dict[genome]=dict(
            (column_name,round(value,9))
            for column_name,value in zip(column_names,rel_abund.values)
        )
    return abundance_dict

def get_abundance(file_name):
    '''Loads the abundance table at file_name and returns it as a nested dictionary
    of the form {genome: {column name: abundance}}.'''
    abundance_frame=load_relative_abundance(file_name)
    return rel_abundance_to_dict(abundance_frame)

def only_key_abundance(abundance_dict,unique_key):
    '''Keeps, for every genome, the abundance of the sample whose name contains unique_key.

    Input:
        abundance_dict  - Dictionary of the form {genome: {sample: abundance}}.
        unique_key      - Substring identifying the sample of interest.
    Output:
        A dictionary of genome: abundance pairs. Genomes without a matching sample are
        left out; when several samples match, the one iterated last is kept.'''
    return {
        genome: abundance
        for genome, rel_abund_dict in abundance_dict.items()
        for sample, abundance in rel_abund_dict.items()
        if unique_key in sample
    }

def reduced_abundance(file_name,unique_key):
    '''Loads the abundance table at file_name and returns a genome: abundance dictionary
    restricted to the sample whose name contains unique_key.'''
    all_abundances=get_abundance(file_name)
    return only_key_abundance(all_abundances,unique_key)
    

def load_coverages(file_name):
    '''Loads in the coverage file with a separate coverage for each contig.

    Input:
        file_name  - Path to a tab-separated file whose first line holds the column names.
    Output:
        A numpy structured array with one field per column; the column names are
        available through its dtype.names. The function previously failed on two
        undefined names before it could return anything, so returning the data
        cannot break an existing caller.
    Notes:
        Values are parsed with genfromtxt's default (float) dtype.
        NOTE(review): a text column such as contig IDs would come back as nan -
        confirm the layout of the coverage file.'''
    return np.genfromtxt(file_name,delimiter="\t",names=True)

def normalised_coverages(read_counts=True):
    '''Placeholder that is not implemented yet: read_counts is ignored and None is returned.'''
    return None

def load_completeness_contamination(file_name):
    '''Loads the genome completeness / contamination table.

    Input:
        file_name  - Path to a tab-separated table with a header line; the first column
                     is used as the index.
    Output:
        A DataFrame with one row per index entry, exactly as stored in the file.
    Notes:
        The function has always returned the DataFrame. An earlier loop that built a
        dictionary was never returned (and keyed it incorrectly), so it was removed.'''
    # read_csv with index_col=0 replaces DataFrame.from_csv, which newer pandas
    # releases no longer provide.
    return pd.read_csv(file_name,sep="\t",index_col=0)

#def 

def estimate_binning_completeness(coverage_file,completeness_file, total_counts_file):
    '''Placeholder that is not implemented yet: all arguments are ignored and None is returned.'''
    return None

#################################
# KEGG Completeness
#################################

def replacement(definition,return_string=False):
    '''
    Description:
        Turns an irregular definition string into a set of KOs in nested tuples to indicate their relationship and to
        prepare them for processing. The definition is rewritten, step by step, into the text of a Python
        expression: comma separated KO groups become set([...]) calls, every KO ID is quoted, and the
        separators (",", " ", "-", "+") become quoted string elements. The text is then evaluated.
    Input:
        definition: string
            Module definition as defined in KEGG
        return_string: boolean
            When True the rewritten expression text is returned without being evaluated.
    output:
        When return_string is True: the rewritten expression as a string.
        Otherwise a (definition, new_nesting) tuple:
        definition: Same as above
        new_nesting: tuple of tuples of sets
            The new logical form of the definition to use in evaluating completeness.
    Raises:
        TypeError, SyntaxError or NameError from evaluating the rewritten expression. The intermediate
        steps are printed before the exception is re-raised.
        
    Notes: 
        This function uses eval, which is a security risk. Caution should be taken in using this function
        on definitions that do not come from a trusted source.
    '''
    
    logical_chars="[+ ,-]"
    pattern="K[0-9]{5}"
    new_expression=definition
    logical_groups="([K][0-9]+,){1,}[K][0-9]+" #Find any group of KOs (1 or more) separated by commas
    #non_extended_groups='(K[0-9]{5}[^,K0-9\n\]]*){1,}(K[0-9]{5}[^,])' #Get any none comma separated chunk of KOs
    non_extended_groups='([^,]|^)(K[0-9]{5}[^,K0-9\n\]\)\[]*){1,}([\n \+\"\]\[\(\)]|$)'
    end_KOs='(K[0-9]{5})$'
    repeated_set='(set\(\[){2,}(["][K][0-9]+["][^^ )(+.\"-]?).*?(\]\)){1,2}'
    set_in_set='set\(\[[K0-9",]*(set\(\[)"[K][0-9]+"\]\)'
    rear_match="([^0-9][,-]K[0-9]{5})"
    forward_match='(K[0-9]{5})[,-][\(]'
    # Normalise the spacing around "-" and "," so the patterns above see one form only.
    new_expression=new_expression.replace(" -","-")
    new_expression=new_expression.replace(", ",",")
    ko_set_matches=[]
        
    # Collect every comma separated run of KOs.
    for match in re.finditer(logical_groups,new_expression):
        #print match.group()
        ko_set_matches.append(match.group())
        #new_expression=new_expression.replace(match.group(),"set(["+match.group()+"])",1)
    set_matches=['']*len(ko_set_matches)
    cleaned_matches=[match.strip(",") for match in ko_set_matches]
    ko_set_matches=set(cleaned_matches)
    
    
    # Wrap each distinct comma separated run in set([...]).
    for i,match in enumerate(ko_set_matches):
        set_matches[i]=match
        new_expression=new_expression.replace(match,"set(["+match+"])")
        
    step_0_1=new_expression
    # Wrap the KOs found in chunks matched by non_extended_groups in their own set([...]).
    for match in re.finditer(non_extended_groups,new_expression):
        if match:
            new_match=match.group()
            #print type(new_match), new_match
            for sub_match in re.findall(pattern,new_match):
                new_match=new_match.replace(sub_match,"set(["+sub_match+"])")
            new_expression=new_expression.replace(match.group(),new_match)
            
    # Same wrapping for a KO that follows a "," or "-" preceded by a non-digit.
    for match in re.finditer(rear_match,new_expression):
        if match:
            new_match=match.group()
            #print type(new_match), new_match
            for sub_match in re.findall(pattern,new_match):
                new_match=new_match.replace(sub_match,"set(["+sub_match+"])")
            new_expression=new_expression.replace(match.group(),new_match)
            
    # Same wrapping for a KO that is followed by "," or "-" and an opening parenthesis.
    for match in re.finditer(forward_match,new_expression):
        if match:
            new_match=match.group()
            #print type(new_match), new_match
            for sub_match in re.findall(pattern,new_match):
                new_match=new_match.replace(sub_match,"set(["+sub_match+"])")
            #print new_match
            new_expression=new_expression.replace(match.group(),new_match)
                
    step_0_2=new_expression
    # Put double quotes around every KO ID so it evaluates as a string.
    for match in set(re.findall(pattern,new_expression)):
        new_expression=new_expression.replace(match,"\""+match+"\"")
    step_1=new_expression
    #print new_expression
    
    # Turn each separator into a quoted element of the surrounding tuple.
    new_expression=new_expression.replace(",",",\",\",")
    #non_set_comma='.{6}[^\"],[^\"].{6}' #Extends sides to try and ensure uniqueness
    #for match in set(re.findall(non_set_comma,new_expression)):
    #    new_expression=new_expression.replace(match,",\",\",".join(match.split(",")))
        
    new_expression=new_expression.replace(" ",",\" \",")
    #print new_expression
    new_expression=new_expression.replace("-",",\"-\",")
    new_expression=new_expression.replace("+",",\"+\",")
    step_2=new_expression
    #print new_expression
    # Remove the doubled quotes and stray commas produced by the replacements above.
    new_expression=new_expression.replace("\"\"","\"")
    new_expression=new_expression.replace(",,",",")
    new_expression=new_expression.replace(",]","]")
    new_expression=new_expression.replace(",)",")")
    new_expression=new_expression.replace("\"\"","\"")
    step_3=new_expression
    #No Longer needed due to fix in tests.

    # Collapse runs of repeated "set([" openers into a single set([...]).
    for match in re.finditer(repeated_set,new_expression):
        #print "This is the match", match.group()
        new_match=match.group().strip("set([")
        new_match=new_match.strip("])")
        new_match="set(["+new_match+"])"
        #print
        #print "This is the new match", new_match
        new_expression=new_expression.replace(match.group(),new_match)
    
    # Flatten a set([...]) nested directly inside another set([...]).
    for match in re.finditer(set_in_set,new_expression):
        new_match=match.group()
        #print new_match.split("set([")
        blank, section_1,section_2=new_match.split("set([")
        section_2=section_2.strip("])")
        #print section_2
        new_match="set(["+section_1+section_2
        new_expression=new_expression.replace(match.group(),new_match)
    step_4=new_expression
    # A quoted KO at the very start of the expression is wrapped in a one-element tuple.
    isolated_start="^\"K[0-9]{5}\""
    for match in re.findall(isolated_start,new_expression):
        new_expression=new_expression.replace(match,"("+match+",)")
        
    # The whole expression is parenthesised before evaluation.
    new_expression="("+new_expression+")"
    
    
        
    
    if return_string:
        return new_expression
    try:
        # Security: eval runs the generated text as Python code.
        new_nesting=eval(new_expression)
        return definition,new_nesting 
    

        
    # Each handler prints the intermediate steps for debugging and re-raises.
    except TypeError:
        print definition
        print "0_1",step_0_1
        print "0_2",step_0_2
        print "Step 1:", step_1
        print "Step 2:", step_2
        print "Step 3:", step_3
        print "Step 4:", step_4
        print ";".join(set_matches)
        print "Type error", new_expression
        raise
        
    except SyntaxError:
        print definition
        print "0_1",step_0_1
        print "0_2",step_0_2
        print "Step 1:", step_1
        print "Step 2:", step_2
        print "Step 3:", step_3
        print "Step 4:", step_4
        print ";".join(set_matches)
        print "Syntax error", new_expression
        raise
        
    except NameError:
        print definition
        print "0_1",step_0_1
        print "0_2",step_0_2
        print "Step 1:", step_1
        print "Step 2:", step_2
        print "Step 3:", step_3
        print "Step 4:", step_4
        print ";".join(set_matches)
        print "NameError", new_expression
        raise
        
    # Not reached: the try block returns and every handler re-raises.
    return
        
     

def alt_eval_kegg_bool(kegg_expr,ko_set):
    '''
    Description:
        Evaluates every block of a kegg expression on its own to get a list of True/False results
        summarising the module completeness. The separators between the blocks are kept as they are.
    Input: 
        kegg_expr: List of sets
            A kegg expression consisting of KOs in nested tuples. eg, (KO1 ((KO2,KO3-KO4),KO5).
            Blocks sit at the even positions and the kegg boolean separators at the odd positions.

        ko_set: set
            The set of KOs to be evaluated for completeness in this particular kegg expression.
    Output:
        A (number of blocks, results) tuple. results has one entry per element of kegg_expr: the
        boolean outcome for a block, or the original element for a separator (and for a block whose
        evaluation did not give a boolean).
    Calls:
        eval_kegg_bool: function
            The workhorse of this function - evaluates each tuple block of kegg_expr.
    '''
    n_elements=len(kegg_expr)
    results_vec=[]
    for position,element in enumerate(kegg_expr):
        outcome=element
        if position%2==0:
            if isinstance(element,tuple):
                block_result=eval_kegg_bool(element,ko_set)
            else:
                # A plain set counts as present when it shares a KO with ko_set.
                block_result=len(ko_set & element)>0
            if isinstance(block_result,bool):
                outcome=block_result
        results_vec.append(outcome)
    
    return (n_elements+1)//2,results_vec
    
    
def eval_kegg_bool(kegg_expr,ko_set):
    '''
    Description:
        Recursive evaluation of a KEGG boolean expression against a set of KOs.
        The expression alternates operand, operator, operand, ... and is folded strictly from left
        to right, so every operand contributes to the result. No operator precedence is applied:
        grouping has to be expressed through nested tuples.
    Input:
        kegg_expr: tuple (or list)
            Alternating operands and operator strings. An operand is either a set of KOs or a nested
            tuple that is evaluated recursively. Operators:
                " " or "+"      both sides are required (and)
                contains ","    either side is sufficient (or)
                contains "-"    the right hand side is ignored
        ko_set: set
            The set of KOs to be evaluated for completeness in this particular kegg expression.
    Output:
        full_result: bool
            True if the expression is satisfied by ko_set.
    Raises:
        ValueError: if kegg_expr does not hold an odd number of elements (empty, or a dangling operator).
    Calls:
        eval_kegg_bool: function
            Itself, for nested tuples.
    '''
    def _operand_result(operand):
        #Nested tuples are sub-expressions; every other operand is a set of KOs.
        if isinstance(operand,tuple):
            return eval_kegg_bool(operand,ko_set)
        return len(ko_set & operand)>0

    n_elements=len(kegg_expr)
    if n_elements%2==0:
        raise ValueError("A KEGG expression must alternate operand, operator, operand; got {0} elements.".format(n_elements))

    #Seed the running result with the first operand, then fold in one (operator, operand) pair at a time.
    full_result=_operand_result(kegg_expr[0])
    for i in range(1,n_elements,2):
        log_op=kegg_expr[i]
        if log_op==" " or log_op=="+":
            side_2_result=_operand_result(kegg_expr[i+1])
            full_result=full_result and side_2_result
        elif "," in log_op:
            side_2_result=_operand_result(kegg_expr[i+1])
            full_result=full_result or side_2_result
        elif "-" in log_op:
            #The right hand side of "-" never influences the result.
            pass
        else:
            #Unknown operator: report it and keep the result accumulated so far.
            print("{0} There seems to have been an error:".format(log_op))
    return full_result

def block_level_completeness(results_vector,correct_partial,nested_descr,ko_set):
    '''
    Description:
        Calculates the completeness of a KEGG module in one of two ways. It either looks at the number of
        logical blocks complete (block level completeness) or also adds a proportional adjustment for how
        complete the incomplete blocks are.
    Input:
        results_vector: list
            The result of evaluating the top level of a kegg expression: booleans separated by the
            operator strings (as returned by alt_eval_kegg_bool).
        correct_partial: Boolean
            Indicating whether to try and account for the partial completeness of some logical blocks.
        nested_descr: nested tuple of sets (or a single set)
            The expression results_vector was computed from; results_vector[i] must correspond to
            nested_descr[i].
        ko_set: set
            The KOs being tested.
    Output:
        completeness_perc: float in [0,1]
            Proportion of complete blocks, optionally adjusted for partially complete blocks.
            If nested_descr is a plain set a boolean is returned instead (True if any KO is present).
    Calls:
        make_logical_blocks, make_position_mapping, extract_logical_values,
        module_completeness_proportion (recursively, for incomplete blocks).
    '''
    if isinstance(nested_descr,set):
        return len(nested_descr & ko_set) > 0

    #Keep the original positions so incomplete entries can be looked up again in nested_descr.
    indexed_blocks=make_logical_blocks(results_vector,True)
    position_mapping=make_position_mapping(indexed_blocks)
    log_blocks=extract_logical_values(indexed_blocks)
    n_tot=len(log_blocks)

    if not correct_partial:
        n_filled_blocks=sum([any(block) for block in log_blocks])
        return float(n_filled_blocks)/n_tot

    adjustment=[]
    for i,block in enumerate(log_blocks):
        if any(block):
            adjustment.append(1)
            continue
        n_max_hits=len(block)
        if n_max_hits==0:
            #An empty block (empty results_vector) has nothing that could be partially complete.
            adjustment.append(0.0)
            continue
        #Every entry of this block is False, so each one is re-scored recursively from its definition.
        running_total=0
        for j in range(n_max_hits):
            kegg_bool=nested_descr[position_mapping[i][j]]
            running_total+=module_completeness_proportion(kegg_bool,ko_set,correct_partial)
        adjustment.append(float(running_total)/n_max_hits)
    completeness_perc=float(sum(adjustment))/n_tot
    return completeness_perc
    
def extract_logical_values(logical_blocks):
    '''Strips the positions from blocks of (position, value) pairs, keeping only the values.'''
    values=[]
    for block in logical_blocks:
        values.append([tagged[1] for tagged in block])
    return values

def make_position_mapping(log_blocks):
    '''
    Builds a lookup {block number: {position within block: first element of the entry}} from blocks
    of (position, value) pairs.
    '''
    mapping={}
    for block_number,block in enumerate(log_blocks):
        within_block={}
        for slot,tagged in enumerate(block):
            within_block[slot]=tagged[0]
        mapping[block_number]=within_block
    return mapping

def module_completeness_proportion(kegg_bool,ko_set,correct_partial):
    '''Returns the completeness of the current kegg_boolean.
    Input:
        kegg_bool: a set of KOs, or a nested expression accepted by alt_eval_kegg_bool.
        ko_set: set of KOs tested against kegg_bool.
        correct_partial: passed through to block_level_completeness.
    Output:
        For a plain set: True if kegg_bool and ko_set share at least one KO, else False.
        Otherwise: whatever block_level_completeness returns for the evaluated expression.
    Calls:
        alt_eval_kegg_bool: Evaluate the top level of the expression.
        block_level_completeness: Calculate the % of kegg blocks which are complete.'''
    if not isinstance(kegg_bool,set):
        results_vector=alt_eval_kegg_bool(kegg_bool,ko_set)[1]
        return block_level_completeness(results_vector,correct_partial,kegg_bool,ko_set)
    return len(kegg_bool & ko_set) > 0
    

def make_logical_blocks(results_vector,keep_indices):
    '''
    Description:
        Groups the top level results of a KEGG boolean into logical blocks. " " and "+" close the
        current block and start a new one, "," adds the next value to the current block and a value
        that follows "-" is dropped. E.g. [T, " ", F, " ", T, ",", F, ",", T] becomes [[T],[F],[T,F,T]].
    Input:
        results_vector: list
            Values separated by the operator strings " ", ",", "+" and "-".
        keep_indices: Boolean
            If true every value is stored as (position in results_vector, value) instead of the bare value.
    Output:
        log_blocks: list of lists
            The logical blocks. The last (possibly empty) block is always appended.'''
    operators=(" ",",","+","-")
    blocks=[]
    open_block=[]
    last_operator=""
    for position,entry in enumerate(results_vector):
        if entry in operators:
            last_operator=entry
            continue
        tagged=(position,entry) if keep_indices else entry
        if not open_block:
            #The very first value always opens a block, whatever came before it.
            open_block.append(tagged)
        elif last_operator in (" ","+"):
            blocks.append(open_block)
            open_block=[tagged]
        elif last_operator==",":
            open_block.append(tagged)
        #After "-" (or without any operator) the value is left out.
    blocks.append(open_block)

    return blocks

def test_all_local_modules(database_dir):
    '''
    Description:
        Sanity check: scores every module definition of the local database against the full set of KOs
        linked to that module and prints the modules that score below 1.
    Input:
        database_dir: passed to load_local_kegg_database_pairings and load_local_cleaned_definition_db.
    Output:
        completeness_dict: dict {module: completeness}
    '''
    #758 comparisons are to be made.
    pairing_key=("Module","orthology")
    #Load all possible KOs; these are used as the comparison set.
    raw_pairs=load_local_kegg_database_pairings(database_dir,[pairing_key], False)[pairing_key]
    module_kos={}
    for module_id,ko_list in raw_pairs.iteritems():
        module_kos[module_id]=set(ko_list)

    definitions=load_local_cleaned_definition_db(database_dir)
    #Screen every single module for completeness (should all be 1.0)
    completeness_dict={}
    for module_id,definition in definitions.iteritems():
        completeness_dict[module_id]=module_completeness_proportion(definition,module_kos[module_id],True)

    failures={}
    for module_id,score in completeness_dict.iteritems():
        if score<1:
            failures[module_id]=score
    print(failures)

    return completeness_dict

# Load the bin name tables with load_bin_names, one per taxonomy file (shortened, full and
# "uniq" variants); each result is kept in its own module-level variable.
# NOTE(review): uniq_tax_file is not defined in the part of the notebook visible here - confirm
# that it is set before this cell runs.
genome_taxonomy=load_bin_names(shortened_tax_file)
full_genome_taxonomy=load_bin_names(tax_file)
uniq_genome_taxonomy=load_bin_names(uniq_tax_file)


C:\Users\Baker


# Metabolic Graphs + repeats enrichment

In [19]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import matplotlib
import matplotlib.pyplot as plt
import matplotlib as mpl
from mpl_toolkits.axes_grid1 import make_axes_locatable
from matplotlib.colors import LinearSegmentedColormap
import sklearn.manifold as manifold
#from matplotlib import rcParams
from mpl_toolkits.axes_grid import inset_locator
#rcParams.update({'figure.autolayout': True})

def load_completeness_matrix(completeness_file):
    '''Reads a tab separated table into a DataFrame, using the first two columns as a two level row index.'''
    return pd.read_csv(completeness_file,sep="\t",index_col=[0,1],skipinitialspace=True)

def perform_PCA(complete_matrix):
    '''
    Description:
        Fits a PCA on the covariance matrix of the transposed input and projects the transposed input
        onto the fitted components.
    Input:
        complete_matrix: DataFrame; it is transposed before use, so its columns become the observations.
    Output:
        obs_df: DataFrame indexed like the columns of complete_matrix, with columns PC1, PC2, ...
        pca_values: the fitted PCA object.
    '''
    by_observation=complete_matrix.transpose()
    pca_values=PCA().fit(by_observation.cov())

    #Extract the component values for each observation
    obs_df=pd.DataFrame(pca_values.transform(by_observation))
    obs_df.index=by_observation.index
    obs_df.columns=["PC{0}".format(j) for j in range(1,obs_df.shape[1]+1)]

    return obs_df, pca_values

def perform_ISOMAP(complete_matrix):
    '''
    Description:
        Embeds the transposed input with Isomap (3 neighbours, 5 components).
    Input:
        complete_matrix: DataFrame; it is transposed before use, so its columns become the observations.
    Output:
        iso_val: DataFrame indexed like the columns of complete_matrix, with columns Dim1, Dim2, ...
    '''
    by_observation=complete_matrix.transpose()
    iso_model=manifold.Isomap(n_neighbors=3,n_components=5).fit(by_observation)

    iso_val=pd.DataFrame(iso_model.transform(by_observation))
    iso_val.index=by_observation.index
    iso_val.columns=["Dim{0}".format(j) for j in range(1,iso_val.shape[1]+1)]

    return iso_val

def perform_MDS(complete_matrix):
    '''
    Description:
        Embeds the transposed input into 3 dimensions with MDS. No random_state is passed to MDS.
    Input:
        complete_matrix: DataFrame; it is transposed before use, so its columns become the observations.
    Output:
        mds_val: DataFrame indexed like the columns of complete_matrix, with columns Dim1, Dim2, Dim3.
    '''
    by_observation=complete_matrix.transpose()
    mds_val=pd.DataFrame(manifold.MDS(n_components=3).fit_transform(by_observation))
    mds_val.index=by_observation.index
    mds_val.columns=["Dim{0}".format(j) for j in range(1,mds_val.shape[1]+1)]

    return mds_val

def grouped_microbes(complete_matrix):
    '''Placeholder: not implemented yet, ignores its argument and returns None.'''
    return None

def pathway_to_modules(pathway_dict,database_dir):
    '''
    Description:
        Replaces the members of every entry of pathway_dict by the modules linked to them.
    Input:
        pathway_dict: dict {name: iterable of identifiers}
            Identifiers without an entry in the pathway/module links are ignored.
        database_dir: passed to load_local_kegg_database_pairings.
    Output:
        pathways: dict {name: list of the unique linked modules (in no particular order)}
    '''
    links=load_local_kegg_database_pairings(database_dir,[("pathway","module")], False)["pathway","module"]
    pathways={}
    for path,members in pathway_dict.iteritems():
        linked_lists=[links[egx] for egx in members if egx in links]
        pathways[path]=list(set(itertools.chain.from_iterable(linked_lists)))
    return pathways

def plot_old_heatmap(subset_matrix,cmap,output_dir,path_name,core,colorbar_title="% pathway completeness"): #perc_steps,
    '''
    Description:
        Draws an unclustered heatmap of subset_matrix with a vertical colourbar placed in the (unused)
        row dendrogram area and saves it as eps, svg, pdf and png.
    Input:
        subset_matrix: DataFrame to plot; its index and columns are used as tick labels.
        cmap: colour map used for the heatmap and the colourbar.
        output_dir: directory the figures are written to.
        path_name, core: the files are named "<path_name>_<core>.<format>".
        colorbar_title: label of the colourbar.
    Output:
        None. Files are written and the eps file is post-processed with fix_eps.
    '''
    h2=sns.clustermap(subset_matrix, yticklabels=subset_matrix.index,row_cluster=False, col_cluster=False, xticklabels=subset_matrix.columns,cmap=cmap,linewidths=.5) #,cbar_kws={"boundaries":ticks}
    h2.cax.set_visible(False)

    for text in h2.ax_heatmap.get_yticklabels():
        text.set_rotation('horizontal')
    for text in h2.ax_heatmap.get_xticklabels():
        text.set_rotation('vertical')

    #Remove top axes where colorbar is by default and where the column dendrogram normally goes
    h2.fig.delaxes(h2.ax_col_dendrogram)
    h2.fig.delaxes(h2.cax)

    #Draw a colorbar where the row dendrogram normally is.
    inset_axes=inset_locator.inset_axes(h2.ax_row_dendrogram,width="20%",height="50%",loc=7) #10=centre, 7= centre right
    colourbar=mpl.colorbar.ColorbarBase(inset_axes,cmap=cmap,orientation="vertical",drawedges=True)
    colourbar.ax.yaxis.set_ticks_position('left') #Move ticks to left of colorbar
    colourbar.set_label(colorbar_title)
    colourbar.ax.yaxis.set_label_position('left') #Move title to left of colorbar

    #Relabel the ticks as percentages. One label per existing tick, spread evenly over 0-100
    #(assumes the colourbar ticks are evenly spaced from its minimum to its maximum).
    #With a single tick there is no range to spread over, so the labels are left alone.
    n_labels=len(colourbar.ax.get_yticklabels())
    if n_labels>1:
        new_perc_labels=["{0:g}".format(round(100.0*k/(n_labels-1),1)) for k in range(n_labels)]
        colourbar.ax.set_yticklabels(new_perc_labels)

    formats=["eps","svg","pdf","png"]
    core_name=os.path.join(output_dir,'{0}_{1}'.format(path_name,core))
    for img_format in formats:
        file_name=core_name+".{0}".format(img_format)
        print("Saving the file {0}".format(file_name))
        h2.savefig(file_name,format=img_format,bbox_inches="tight")

    fix_eps(core_name+".eps")

    return None

def calc_best_ticks(top,perc_steps):
    '''
    Description:
        Scales top by every fraction in perc_steps.
    Input:
        top: number, the value corresponding to a fraction of 1.
        perc_steps: iterable of fractions.
    Output:
        ticks: list with top*fraction rounded to 3 decimals, one entry per fraction.
    '''
    ticks=[round(top*perc,3) for perc in perc_steps]
    return ticks

def make_labels_from_ticks(ticks):
    '''
    Description:
        Builds one "start-end" label per tick. The first range starts at 0 and every following range
        starts at the previous tick, e.g. [10,20,30] gives ["0-10","10-20","20-30"].
    Input:
        ticks: iterable of upper range limits, in plotting order.
    Output:
        labels: list of strings, one per tick.
    '''
    labels=[]
    start=0
    for tick in ticks:
        labels.append("{0}-{1}".format(start,tick))
        #The next range begins where this one ended.
        start=tick
    return labels
        
def plot_all_old_heatmaps(complete_matrix, output_dir,MO_pathways,bin_file_short,bin_file_long,core,transpose=False):
    '''
    Description:
        Plots one heatmap per pathway, containing the rows of complete_matrix whose module belongs to
        that pathway. Columns are renamed with the bin names and ordered so that "Porites lutea" and
        "Symbiodinium strain C15" come first.
    Input:
        complete_matrix: DataFrame with a two level row index (module, module description).
        output_dir: directory handed to plot_old_heatmap.
        MO_pathways: dict {pathway name: modules of that pathway}
        bin_file_short: file read with load_bin_names; used to rename the columns.
            NOTE(review): the previous code read an undefined name "bin_file"; the short names are
            assumed to be the ones wanted on the plot - confirm.
        bin_file_long: accepted for backwards compatibility, currently unused.
        core: suffix of the output file names (see plot_old_heatmap).
        transpose: Boolean, plot modules as columns instead of rows.
    Output:
        None
    '''
    bin_name=load_bin_names(bin_file_short)
    bin_name=defaultdict(lambda: "No associated taxonomy.",bin_name)

    new_comp_mat=complete_matrix.copy()
    new_comp_mat=new_comp_mat.sort_index(axis='columns')
    new_comp_mat.index.names=['Module','ModuleDescription']
    new_comp_mat=new_comp_mat.rename(columns=bin_name)
    new_comp_mat=new_comp_mat.sort_index(axis='columns')

    #Host columns first, everything else afterwards. Reordering by position keeps this correct
    #even when several bins share the same name.
    is_host=new_comp_mat.columns.isin(["Porites lutea","Symbiodinium strain C15"])
    host_positions=[position for position,flag in enumerate(is_host) if flag]
    other_positions=[position for position,flag in enumerate(is_host) if not flag]
    new_comp_mat=new_comp_mat.iloc[:,host_positions+other_positions]

    module_ids=new_comp_mat.index.get_level_values('Module')
    for path_name,path_modules in MO_pathways.iteritems():
        in_pathway=module_ids.isin(list(path_modules))
        subset_matrix=new_comp_mat.loc[in_pathway].copy()
        if subset_matrix.shape[0]==0:
            #None of the pathway's modules is in the matrix, so there is nothing to draw.
            continue
        subset_matrix.index=subset_matrix.index.droplevel('Module')
        if transpose:
            subset_matrix=pd.DataFrame.transpose(subset_matrix)
        plot_old_heatmap(subset_matrix,"Blues",output_dir, path_name,core)

    return None

def fix_eps(fpath):
    """Rewrite an EPS file in place, replacing every b"\\r\\rHebrew" with b"Hebrew".

    Works around stray carriage returns in EPS files caused by the Arial font.
    """
    with open(fpath, "rb") as source:
        cleaned = [line.replace(b"\r\rHebrew", b"Hebrew") for line in source]
    with open(fpath, "wb") as target:
        target.write(b"".join(cleaned))


def extract_matrix_subset(complete_matrix, modules):
    '''Returns the rows of complete_matrix selected by label with .loc[modules].'''
    return complete_matrix.loc[modules]


def plot_scaling(MDS_data,y,x,file_name):
    '''
    Description:
        Scatter plot of two columns of an ordination result. Every point is annotated with its group
        id and the legend maps the group ids to their taxonomy. The figure is saved as pdf, svg, eps
        and png.
    Input:
        MDS_data: DataFrame holding the columns named by x and y plus 'group' (numeric id stored as
            text) and 'Tax' (taxonomy string).
        y, x: names of the columns plotted on the y and x axis (note the order: y comes first).
        file_name: output path without extension.
    Output:
        None. Four files are written and the eps file is post-processed with fix_eps.
    '''
    sns.set(style='darkgrid')
    fig,ax=plt.subplots(figsize=(15,10.5))
    cur_plot=sns.regplot(x=x,y=y,data=MDS_data,fit_reg=False,scatter=True,scatter_kws={'s':30},ax=ax)

    for index,row in MDS_data.iterrows():
        cur_plot.text(row[x],row[y],row['group'],fontsize=20)

    #One legend entry per (taxonomy, group id) pair, ordered numerically by the group id.
    tax_ind_data=MDS_data[['Tax','group']].copy()
    tax_ind_data=tax_ind_data.reset_index(drop=True)
    tax_ind_data=tax_ind_data.set_index(['Tax'])
    tax_ind_data=tax_ind_data.drop_duplicates()
    tax_ind_data['group']=pd.to_numeric(tax_ind_data['group'])
    tax_ind_data=tax_ind_data.sort_values(by='group')
    tax_ind_data['group']=tax_ind_data['group'].apply(str)

    labels=tax_ind_data['group']
    description=tax_ind_data.index
    #The group id itself is drawn as the legend marker.
    proxies=[create_proxy(item,'black') for item in labels]
    lgd=plt.legend(proxies,description, numpoints=1,markerscale=2,loc=2,bbox_to_anchor=(1.0,1), borderaxespad=0.)
    for img_format in ("pdf","svg","eps","png"):
        plt.savefig("{0}.{1}".format(file_name,img_format),format=img_format,bbox_extra_artists=(lgd,),bbox_inches='tight')
    fix_eps(file_name+".eps")
    plt.close(fig)
    return None

def create_proxy(label,colour):
    '''Returns a line-less Line2D whose marker is the given label rendered as mathtext, for use as a legend handle.'''
    #from http://stackoverflow.com/questions/28739608/completely-custom-legend-in-matplotlib-python
    marker_text=r'$\mathregular{{{}}}$'.format(label)
    return matplotlib.lines.Line2D([0],[0],linestyle='none',mfc=colour,mec='none',marker=marker_text)



def plot_scalings(orig_data,output_dir,tax_rel):
    '''
    Description:
        Runs PCA, ISOMAP and MDS on the data, labels every column of orig_data with a numeric id
        derived from its taxonomy and plots all pairwise combinations of the first three dimensions
        of each method, followed by a combined PCA/MDS two panel figure.
    Input:
        orig_data: DataFrame with a two level row index; the second level is dropped.
        output_dir: directory the plots are written to.
        tax_rel: mapping from the column names of orig_data to their taxonomy.
    Output:
        None
    '''
    #remove module descriptions
    module_matrix=orig_data.copy()
    module_matrix.index=module_matrix.index.droplevel(1)

    #Taxonomy per genome, sorted so that the factorised ids follow the taxonomy order.
    genome_ids=pd.Series(module_matrix.columns,index=module_matrix.columns).map(tax_rel)
    genome_ids=genome_ids.sort_values()
    codes=pd.factorize(genome_ids)[0]
    ID_df=pd.DataFrame(genome_ids)
    ID_df['ID']=pd.Series(codes,index=genome_ids.index).astype('string')
    ID_df.columns=['Tax','ID']

    PCA_data=perform_PCA(module_matrix)[0]
    ISOMAP_data=perform_ISOMAP(module_matrix)
    MDS_data=perform_MDS(module_matrix)

    plot_jobs=[]
    for plot_name,frame in [("PCA_Plot",PCA_data),("ISOMAP_plot",ISOMAP_data),('EUC_MDS_plots',MDS_data)]:
        frame['group']=ID_df['ID']
        frame['Tax']=ID_df['Tax']
        #The first three columns are the leading dimensions (taken before the columns are sorted).
        leading_cols=frame.columns[0:3]
        plot_jobs.append((plot_name,frame.sort_index(axis='columns'),leading_cols))

    for plot_name,frame,leading_cols in plot_jobs:
        base_name=os.path.join(output_dir, plot_name)
        for comp1,comp2 in itertools.combinations(leading_cols,2):
            plot_scaling(frame,comp1,comp2,base_name+comp1+comp2)

    sorted_PCA=plot_jobs[0][1]
    sorted_MDS=plot_jobs[2][1]
    plot_scaling_two_panel_plot(sorted_MDS,sorted_PCA,os.path.join(output_dir,"PCA_MDS_two_panel_plot"))

    return None

def plot_scaling_two_panel_plot(MDS_data,PCA_data,file_name):
    '''
    Description:
        Two panel figure: PC1 against PC2 on the left, Dim1 against Dim2 on the right. Every point is
        annotated with its group id and the legend maps the group ids to their taxonomy. The figure
        is saved as pdf, svg, eps and png.
    Input:
        MDS_data: DataFrame with the columns 'Dim1', 'Dim2', 'group' and 'Tax'.
        PCA_data: DataFrame with the columns 'PC1', 'PC2' and 'group'.
        file_name: output path without extension.
    Output:
        None. Four files are written and the eps file is post-processed with fix_eps.
    '''
    fig,(ax1,ax2)=plt.subplots(1,2)
    sns.set(style='darkgrid')
    plot_1=sns.regplot(x='Dim1',y='Dim2',data=MDS_data,fit_reg=False,scatter=True,scatter_kws={'s':30},ax=ax2)
    for index,row in MDS_data.iterrows():
        plot_1.text(row['Dim1'],row['Dim2'],row['group'],fontsize=10)

    plot_2=sns.regplot(x='PC1',y='PC2',data=PCA_data,fit_reg=False,scatter=True,scatter_kws={'s':30},ax=ax1)
    for index,row in PCA_data.iterrows():
        plot_2.text(row['PC1'],row['PC2'],row['group'],fontsize=10)

    #One legend entry per (taxonomy, group id) pair, ordered numerically by the group id.
    tax_ind_data=MDS_data[['Tax','group']].copy()
    tax_ind_data=tax_ind_data.reset_index(drop=True)
    tax_ind_data=tax_ind_data.set_index(['Tax'])
    tax_ind_data=tax_ind_data.drop_duplicates()
    tax_ind_data['group']=pd.to_numeric(tax_ind_data['group'])
    tax_ind_data=tax_ind_data.sort_values(by='group')
    tax_ind_data['group']=tax_ind_data['group'].apply(str)

    labels=tax_ind_data['group']
    description=tax_ind_data.index
    #The group id itself is drawn as the legend marker.
    proxies=[create_proxy(item,'black') for item in labels]
    lgd=plt.legend(proxies,description, numpoints=1,markerscale=1.25,loc=2,bbox_to_anchor=(1.0,1), borderaxespad=0.,prop={'size':8})
    for img_format in ("pdf","svg","eps","png"):
        plt.savefig("{0}.{1}".format(file_name,img_format),format=img_format,bbox_extra_artists=(lgd,),bbox_inches='tight')
    fix_eps(file_name+".eps")
    return None
def plotting_wf(completes_matrix, tax_file, relevant_pathways):
    '''Placeholder: not implemented yet, ignores its arguments and returns None.'''
    return None

def load_completeness_matrix(completeness_file):
    '''Reads a tab separated table into a DataFrame, using the first two columns as a two level row index.

    NOTE(review): a function with this name and body is already defined earlier in this cell; this
    later definition is the one that stays bound.'''
    return pd.read_csv(completeness_file,sep="\t",index_col=[0,1],skipinitialspace=True)

def load_euk_repeat_data_matrix(euk_repeat_file):
    '''Reads a tab separated table into a DataFrame, using the first three columns as a three level row index.'''
    return pd.read_csv(euk_repeat_file,sep="\t",index_col=[0,1,2],skipinitialspace=True)

def create_euk_repeat_heatmaps(euk_repeat_matrix,output_dir,cmap,names):
    '''
    Description:
        Plots one heatmap per group of repeat columns. The columns are grouped by name prefix with
        split_df_by_column_names; columns matching none of the prefixes end up in the group "other".
    Input:
        euk_repeat_matrix: DataFrame with a multi level row index; the first level is dropped.
        output_dir: directory handed to plot_heatmap.
        cmap: colour map handed to plot_heatmap. None falls back to "Blues".
        names: list of group names. A name can combine several prefixes with "__",
            e.g. 'Ank__VCBS__T2SSE__TAL_effector'.
    Output:
        None
    '''
    if cmap is None:
        cmap="Blues"
    repeat_data=euk_repeat_matrix.copy()
    repeat_data.index=repeat_data.index.droplevel(0)
    repeat_data=repeat_data.loc[:,(repeat_data!=0).any(axis=0)] #Remove all columns with no hits.
    repeat_data=repeat_data.sort_index(axis='rows')

    for item_name,subset_matrix in split_df_by_column_names(repeat_data,names).items():
        print(subset_matrix.shape[1])
        if subset_matrix.shape[1]==0:
            #No column fell into this group, so there is nothing to draw.
            continue
        plot_heatmap(subset_matrix,cmap,output_dir,item_name,"Euk_repeat_Hmm_searches",cluster_rows=False)

    return None

def split_df_by_column_names(df,names):
    '''
    Description:
        Splits a DataFrame into groups of columns according to the prefix of the column names.
    Input:
        df: DataFrame with string column names.
        names: list of group names. A name can combine several prefixes with "__"
            (e.g. "Ank__VCBS"); a column belongs to the group if it starts with any of them.
    Output:
        sub_df_dict: dict {group name: DataFrame with the matching columns}. The extra key "other"
            holds the columns that matched none of the groups. Groups can be empty.
    '''
    n_cols=df.shape[1]
    column_masks={}
    matched_any=np.zeros(n_cols,dtype=bool)
    for name in names:
        group_mask=np.zeros(n_cols,dtype=bool)
        for option in name.split("__"):
            #na=False: column names that are not strings never match a prefix.
            group_mask=group_mask | np.asarray(df.columns.str.startswith(option,na=False),dtype=bool)
        column_masks[name]=group_mask
        matched_any=matched_any | group_mask
    column_masks["other"]=~matched_any

    #Positional boolean masks: unaffected by duplicated column names.
    sub_df_dict={}
    for item_name,group_mask in column_masks.items():
        sub_df_dict[item_name]=df.loc[:,group_mask]
    return sub_df_dict


def split_df_by_row_names(df,names):
    '''
    Description:
        Splits a DataFrame into groups of rows according to the prefix of the row labels.
    Input:
        df: DataFrame with a string row index.
        names: list of group names. A name can combine several prefixes with "__"
            (e.g. "filtered__unfiltered"); a row belongs to the group if its label starts with any of them.
    Output:
        sub_df_dict: dict {group name: DataFrame with the matching rows}. The extra key "other"
            holds the rows that matched none of the groups. Groups can be empty.
    '''
    n_rows=df.shape[0]
    row_masks={}
    matched_any=np.zeros(n_rows,dtype=bool)
    for name in names:
        group_mask=np.zeros(n_rows,dtype=bool)
        for option in name.split("__"):
            #na=False: row labels that are not strings never match a prefix.
            group_mask=group_mask | np.asarray(df.index.str.startswith(option,na=False),dtype=bool)
        row_masks[name]=group_mask
        matched_any=matched_any | group_mask
    row_masks["other"]=~matched_any

    #Positional boolean masks: unaffected by duplicated row labels.
    sub_df_dict={}
    for item_name,group_mask in row_masks.items():
        sub_df_dict[item_name]=df.loc[group_mask,:]
    return sub_df_dict

def split_df_by_column_values(df, breakpoint_values):
    '''
    Description:
        Splits a DataFrame into groups of columns according to the maximum value of every column.
    Input:
        df: DataFrame with numeric values.
        breakpoint_values: increasing list of limits. The bins are [0,b0), [b0,b1), ... and [b_last, inf).
    Output:
        split_dfs: dict {"<start><=X<<end>" or "<start><=X": DataFrame with the columns whose maximum
            falls into that bin}. Empty bins are left out, as are columns whose maximum is below 0.
    '''
    col_max=df.max(axis=0) #Get column maximum
    split_dfs={}
    start=0
    for end in breakpoint_values:
        des_columns=(start<=col_max) & (end>col_max)
        if des_columns.any():
            split_dfs["{0}<=X<{1}".format(start,end)]=df.loc[:,des_columns]
        start=end
    #Everything at or above the last breakpoint goes into one open ended bin.
    des_columns=(start<=col_max)
    if des_columns.any():
        split_dfs["{0}<=X".format(start)]=df.loc[:,des_columns]

    return split_dfs

def split_df_by_row_values(df, breakpoint_values):
    '''
    Description:
        Splits a DataFrame into groups of rows according to the maximum value of every row.
    Input:
        df: DataFrame with numeric values.
        breakpoint_values: increasing list of limits. The bins are [0,b0), [b0,b1), ... and [b_last, inf).
    Output:
        split_dfs: dict {"<start><=X<<end>" or "<start><=X": DataFrame with the rows whose maximum
            falls into that bin}. Empty bins are left out, as are rows whose maximum is below 0.
    '''
    row_max=df.max(axis=1) #Get row maximum
    split_dfs={}
    start=0
    for end in breakpoint_values:
        des_rows=(start<=row_max) & (end>row_max)
        if des_rows.any():
            split_dfs["{0}<=X<{1}".format(start,end)]=df.loc[des_rows,:]
        start=end
    #Everything at or above the last breakpoint goes into one open ended bin. The mask describes
    #rows, so it has to be applied to the row axis here as well.
    des_rows=(start<=row_max)
    if des_rows.any():
        split_dfs["{0}<=X".format(start)]=df.loc[des_rows,:]

    return split_dfs

def any_greater_than(matrix,value):
    '''Returns a copy of the rows of matrix that hold at least one entry >= value (the limit itself counts).'''
    keep_rows=(matrix>=value).any(axis=1)
    return matrix.copy().loc[keep_rows,:]

def load_counts(file_name):
    '''Reads a headerless two column, tab separated file: the first column becomes the index
    ("Genome_id"), the second is named "TotalGene#".'''
    counts=pd.read_csv(file_name,header=None,index_col=0,sep="\t")
    counts.columns=["TotalGene#"]
    counts.index.name="Genome_id"
    return counts

def discretize_matrix(matrix,ticks):
    '''
    Description:
        Replaces every value by the number of the bin it falls into. The bins are
        [ticks[0],ticks[1]) -> 0, [ticks[1],ticks[2]) -> 1, ... and everything >= ticks[-1] gets the
        last bin number, matching the labels produced by make_tick_labels.
    Input:
        matrix: DataFrame with numeric values. It is not modified.
        ticks: increasing list of bin limits, at least one entry.
    Output:
        discretised_matrix: copy of matrix holding the bin numbers. Values below ticks[0] (and
            missing values) are left as they are.
    Raises:
        ValueError: if ticks is empty.
    '''
    if len(ticks)==0:
        raise ValueError("At least one tick is needed to discretise a matrix.")
    discretised_matrix=matrix.copy()
    disc_val=0
    #All masks are computed on the untouched input so earlier bins cannot influence later ones.
    for lower,upper in zip(ticks,ticks[1:]):
        discretised_matrix[(lower<=matrix) & (matrix<upper)]=disc_val
        disc_val+=1
    #The last bin is closed on the left: a value equal to the last tick belongs to it.
    discretised_matrix[matrix>=ticks[-1]]=disc_val
    return discretised_matrix
        
        
def make_tick_labels(ticks):
    '''Builds the labels "t0-t1", "t1-t2", ... for consecutive ticks plus a final ">=t_last".'''
    labels=["{0}-{1}".format(low,high) for low,high in zip(ticks,ticks[1:])]
    labels.append(">={0}".format(ticks[-1]))
    return labels

def plot_discrete_heatmap(matrix,ticks,cmap,output_dir,path_name,core,colorbar_title="% Relative abundance",labels=None):
    '''
    Description:
        Bins the matrix with discretize_matrix, draws it as an unclustered heatmap with one colour
        per bin and saves the figure as eps, svg, pdf and png. The colourbar is drawn in the (unused)
        row dendrogram area.
    Input:
        matrix: DataFrame to plot; its index and columns are used as tick labels.
        ticks: bin limits handed to discretize_matrix; one colour is used per tick.
        cmap: not used - the colours always come from a cubehelix palette whose first colour is
            replaced by white.
        output_dir: directory the figures are written to.
        path_name, core: the files are named "<path_name>_<core>.<format>".
        colorbar_title: label of the colourbar.
        labels: tick labels of the colourbar; None builds them with make_tick_labels.
    Output:
        None. Files are written and the eps file is post-processed with fix_eps.
    '''
    binned=discretize_matrix(matrix,ticks)
    tick_labels=make_tick_labels(ticks) if labels is None else labels
    n_colours=len(ticks)

    #One colour per bin, with the lowest bin drawn in white.
    palette=[tuple(colour) for colour in sns.cubehelix_palette(n_colours,rot=-0.3)]
    palette[0]=(1,1,1)
    discrete_cmap=LinearSegmentedColormap.from_list("CubeHelix_Discrete", colors=palette, N=n_colours)

    grid=sns.clustermap(binned,
                        yticklabels=binned.index,
                        row_cluster=False, col_cluster=False,
                        xticklabels=binned.columns,
                        cmap=discrete_cmap,linewidths=.5)
    grid.cax.set_visible(False)

    for tick_text in grid.ax_heatmap.get_yticklabels():
        tick_text.set_rotation('horizontal')
    for tick_text in grid.ax_heatmap.get_xticklabels():
        tick_text.set_rotation('vertical')

    #Drop the axes of the column dendrogram and of the default colourbar.
    grid.fig.delaxes(grid.ax_col_dendrogram)
    grid.fig.delaxes(grid.cax)

    #Draw the colourbar inside the row dendrogram area (loc 7 = centre right).
    bar_axes=inset_locator.inset_axes(grid.ax_row_dendrogram,width="20%",height="50%",loc=7)
    colourbar=mpl.colorbar.ColorbarBase(bar_axes,cmap=discrete_cmap,orientation="vertical",drawedges=True)
    colourbar.ax.yaxis.set_ticks_position('left')
    #One tick in the middle of every colour band, expressed as a fraction of the bar.
    tick_position=np.linspace(0.5, n_colours-0.5, n_colours)/n_colours
    colourbar.set_label(colorbar_title)
    colourbar.ax.yaxis.set_label_position('left') #Move title to left of colorbar
    colourbar.set_ticks(tick_position)
    colourbar.set_ticklabels(tick_labels)

    base_path=os.path.join(output_dir,'{0}_{1}'.format(path_name,core))
    for img_format in ("eps","svg","pdf","png"):
        file_name="{0}.{1}".format(base_path,img_format)
        print("Saving the file {0}".format(file_name))
        grid.savefig(file_name,format=img_format,bbox_inches="tight")

    fix_eps(base_path+".eps")

    return

def load_new_multiheader_hmm_searches(hmm_multiind_file,genome_taxonomy):
    """Load an HMM hit table with two index columns and two header rows, then flatten it.

    hmm_multiind_file -- tab-separated file (path or file-like object); its
        first two columns become the row index and its first two rows the
        column header.
    genome_taxonomy -- mapping handed to DataFrame.rename for the row labels.

    Returns the table with missing values replaced by 0, columns without any
    non-zero entry removed, the second row level and the first column level
    dropped, and the rows sorted by label.
    """
    hits=pd.read_csv(hmm_multiind_file,sep="\t",header=[0,1],index_col=[0,1],skipinitialspace=True).fillna(0)
    #Only columns holding at least one non-zero value are kept.
    column_has_hit=(hits!=0).any(axis=0)
    hits=hits.loc[:,column_has_hit]
    hits.index=hits.index.droplevel(1)
    hits.columns=hits.columns.droplevel(0)
    #Row labels matching "plut.*" become "coral"; labels matching "SymbC15.*" become "SymbC15".
    row_labels=hits.index.str.replace("plut.*","coral")
    row_labels=row_labels.str.replace("SymbC15.*","SymbC15")
    hits.index=row_labels
    hits=hits.rename(index=genome_taxonomy)
    return hits.sort_index(axis='rows')

def create_multiheader_hmm_discrete_heatmap(hmm_matrix, output_dir,cmap,row_splits,matrix_ticks,core_name,labels):
    """Plot one discrete heatmap for every row group of an HMM hit matrix.

    The matrix is partitioned with split_df_by_row_names using the single
    pattern 'filtered__unfiltered'; each resulting group is passed to
    plot_discrete_heatmap together with the ticks and labels stored under the
    same group name in matrix_ticks and labels.

    row_splits is accepted but not used by this function.
    Always returns None.
    """
    row_groups=defaultdict(dict,split_df_by_row_names(hmm_matrix,['filtered__unfiltered']))
    for group_name in row_groups:
        plot_discrete_heatmap(row_groups[group_name],matrix_ticks[group_name],cmap,
                              output_dir,group_name,core_name,labels[group_name])
    return None

def load_counts(file_name):
    """Read a header-less, tab-separated two-column table of per-genome gene totals.

    The first column becomes the index (named "Genome_id"); the single
    remaining column is named "TotalGene#".
    """
    gene_counts=pd.read_csv(file_name,header=None,index_col=0,sep="\t")
    gene_counts.index.name="Genome_id"
    gene_counts.columns=["TotalGene#"]
    return gene_counts

def normalise_euk_data(euk_repeat_file, gene_totals_file,uniq_tax_id_file):
    """Return, per genome, the percentage of genes with a hit for selected repeat families.

    euk_repeat_file -- passed to load_euk_repeat_data_matrix; the result must
        carry index levels named "Taxonomy" and "Gene_name", which are dropped.
    gene_totals_file -- passed to load_counts; its single column is the
        per-genome denominator.
    uniq_tax_id_file -- passed to load_bin_names to translate genome ids into
        display names; ids without an entry become "No Associated taxonomy".

    Returns a dict with the keys "Ank", "WD40" and "VCBS", each holding the
    column of that name as a Series sorted by its index.
    """
    repeat_data=load_euk_repeat_data_matrix(euk_repeat_file)
    repeat_data.index=repeat_data.index.droplevel("Taxonomy")
    repeat_data.index=repeat_data.index.droplevel('Gene_name')
    gene_totals=load_counts(gene_totals_file)
    repeat_data=repeat_data.loc[:,(repeat_data!=0).any(axis=0)] #Remove all columns with no hits.
    #Collapse hit counts to presence/absence so the sum below counts genes, not hits.
    repeat_data[repeat_data>0]=1
    repeat_data=repeat_data.groupby(level=0).sum()
    #Dividing by (total/100) expresses the gene counts as a percentage of all genes.
    repeat_data=repeat_data.divide(gene_totals.iloc[:,0]/100, axis='index')
    repeat_data=repeat_data.sort_index(axis='rows')
    #Use the file handed in by the caller (the original read the notebook
    #global `uniq_tax_file` here and silently ignored the parameter).
    taxa=load_bin_names(uniq_tax_id_file)
    taxa=defaultdict(lambda : "No Associated taxonomy", taxa)
    repeat_data.index=repeat_data.index.map(lambda x: taxa[x])
    repeat_names=["Ank","WD40","VCBS"]
    sub_dfs=split_df_by_column_names(repeat_data,repeat_names)
    del sub_dfs["other"]
    for name in repeat_names:
        #Keep only the column whose label is exactly the repeat name, sorted by row label.
        sub_dfs[name]=sub_dfs[name].loc[:,name].sort_index()
    return sub_dfs

def create_gene_perc_bar_charts(sub_dfs,cmap,output_dir,core):
    """Draw and save one grey bar chart per entry of sub_dfs.

    sub_dfs -- mapping of chart title -> Series; the Series index supplies the
        bar labels and its values the bar heights.
    cmap -- accepted for interface compatibility; not used.
    output_dir -- directory that receives the image files.
    core -- suffix of the file names, which are "<title>_<core>.<format>".

    Every chart is written as eps, svg, pdf and png, and fix_eps is then run
    on the eps file. Charts whose title contains "ank" (case-insensitive) get
    a horizontal reference line at y=0.25.

    Side effect: sets the global default figure size to (8,5).
    """
    plt.rcParams['figure.figsize']=(8,5)
    for protein_repeat,data_series in sub_dfs.items():

        fig,ax=plt.subplots()
        sns.barplot(x=data_series.index,y=data_series.values,ax=ax,color = "grey")
        plt.setp(ax.get_xticklabels(),rotation=90)
        ax.set_title(protein_repeat)
        ax.set_ylabel("% of all genes hit")
        if "ank" in protein_repeat.lower():
            #`hold=None` used to be passed here; it was the default and the
            #keyword no longer exists in matplotlib 3, so it is omitted.
            ax.axhline(y=0.25, xmin=0, xmax=1,color='black',linewidth=2)
        base_name=os.path.join(output_dir,'{0}_{1}'.format(protein_repeat,core))
        for img_format in ("eps","svg","pdf","png"):
            fig.savefig(base_name+"."+img_format,format=img_format,bbox_inches="tight")
        fix_eps(base_name+".eps")
    return

def histogram_figures(euk_repeat_file, gene_totals_file,uniq_tax_id_file,output_dir):
    """Compute the per-genome gene-hit percentages and save them as bar charts.

    Chains normalise_euk_data and create_gene_perc_bar_charts; the charts are
    written to output_dir with the file-name suffix "perc_gene_hits_barchart".
    """
    percent_hits_by_repeat=normalise_euk_data(euk_repeat_file, gene_totals_file,uniq_tax_id_file)
    create_gene_perc_bar_charts(percent_hits_by_repeat,"Blacks",output_dir,"perc_gene_hits_barchart")
    
def grepl(iterable,pattern):
    """Return a list of booleans: True where re.search finds pattern in the element."""
    matcher=re.compile(pattern)
    return [matcher.search(element) is not None for element in iterable]

def process_binning_improvement_data(binning_data_file):
    """Reshape the binning comparison table into one long table of improvements.

    binning_data_file -- comma-separated file whose first two columns form the
        index (one of the two levels must be named "Steve_taxonomy"; it is
        dropped). The remaining columns are taken alternately as completeness
        (even positions) and contamination (odd positions) and must include
        "Final_Completeness" and "Final_Contamination".

    Returns the completeness rows followed by the contamination rows. Each row
    carries the algorithm name, its value, the final value, a "Measure" label
    and "%Improvement" (final minus value for completeness, value minus final
    for contamination).
    """
    raw=pd.read_csv(binning_data_file,sep=",",index_col=[0,1])
    raw.index=raw.index.droplevel("Steve_taxonomy")

    completeness_long=pd.melt(raw.iloc[:,0::2],id_vars=["Final_Completeness"],var_name="binning_algorithm")
    completeness_long["binning_algorithm"]=completeness_long["binning_algorithm"].str.replace("_completeness","")
    #Completeness improves when the final value is higher than the algorithm's.
    completeness_long["%Improvement"]=completeness_long["Final_Completeness"]-completeness_long["value"]
    completeness_long["Measure"]="Completeness"

    contamination_long=pd.melt(raw.iloc[:,1::2],id_vars=["Final_Contamination"],var_name="binning_algorithm")
    contamination_long["binning_algorithm"]=contamination_long["binning_algorithm"].str.replace("_contamination","")
    #Contamination improves when the final value is lower than the algorithm's.
    contamination_long["%Improvement"]=contamination_long["value"]-contamination_long["Final_Contamination"]
    contamination_long["Measure"]="Contamination"

    return pd.concat([completeness_long,contamination_long])

def plot_completeness_contamination(input_data,output_dir,core):
    """Save a grouped bar chart of "%Improvement" per binning algorithm.

    input_data -- long table with the columns "binning_algorithm",
        "%Improvement" and "Measure" (used as the hue).
    output_dir -- directory that receives the image files.
    core -- suffix of the file names, which are "Binning_statistics_<core>.<format>".

    The figure is written as eps, svg, pdf and png; fix_eps is then run on the
    eps file.
    """
    fig,ax=plt.subplots()
    sns.barplot(data=input_data,x="binning_algorithm",y="%Improvement",hue="Measure",ax=ax)

    base_name=os.path.join(output_dir,'{0}_{1}'.format("Binning_statistics",core))
    for img_format in ("eps","svg","pdf","png"):
        fig.savefig(base_name+"."+img_format,format=img_format,bbox_inches="tight")
    fix_eps(base_name+".eps")
def create_improvement_barplots(binning_data_file,output_dir):
    """Load the binning comparison file and save its improvement bar chart in output_dir.

    The file-name suffix handed to plot_completeness_contamination is "_improvements".
    """
    improvement_table=process_binning_improvement_data(binning_data_file)
    plot_completeness_contamination(improvement_table,output_dir,"_improvements")
        
    

In [10]:
#HMM hit table for the repeat-domain search (per gene and per genome, going by the file name).
euk_repeat_file=os.path.join(*[core,g_drive,"HMM_searches","Symbioses_test","euk_repeat_results","hmm_hits_per_gene_per_genome.tsv"])

#Gene totals per genome; load_counts reads this as the "TotalGene#" column.
gene_totals_file=os.path.join(*[core,g_drive,"HMM_searches","gene_counts.tsv"])

#uniq_tax_file and plots_dir are not assigned in this cell; they must already exist in the kernel.
histogram_figures(euk_repeat_file, gene_totals_file,uniq_tax_file,plots_dir)

In [180]:
#The statements below repeat the body of normalise_euk_data at top level, so the
#intermediates (repeat_data, gene_totals, taxa, sub_dfs) end up as notebook globals.
#euk_repeat_file, gene_totals_file and uniq_tax_file must already be defined.
repeat_data=load_euk_repeat_data_matrix(euk_repeat_file)
repeat_data.index=repeat_data.index.droplevel("Taxonomy")
repeat_data.index=repeat_data.index.droplevel('Gene_name')
gene_totals=load_counts(gene_totals_file)
repeat_data=repeat_data.loc[:,(repeat_data!=0).any(axis=0)] #Remove all columns with no hits.
#Collapse positive hit counts to 1 so the group sum below counts entries with a hit.
repeat_data[repeat_data>0]=1
#The next assignment writes 0 where the value already equals 0.
repeat_data[repeat_data==0]=0
#print repeat_data
repeat_data=repeat_data.groupby(level=0).sum()
#print repeat_data
#print gene_totals.index
#Dividing by (total/100) turns the counts into a percentage of the gene total.
repeat_data=repeat_data.divide(gene_totals.iloc[:,0]/100, axis='index')
repeat_data=repeat_data.sort_index(axis='rows')
#print repeat_data
#names=["TPR",'Ank__VCBS__T2SSE__TAL_effector','W']
taxa=load_bin_names(uniq_tax_file)
#Ids without an entry in the bin-name file get a fixed placeholder label.
taxa=defaultdict(lambda : "No Associated taxonomy", taxa)
repeat_data.index=repeat_data.index.map(lambda x: taxa[x])
sub_dfs=split_df_by_column_names(repeat_data,["Ank","WD40","VCBS"])
#From each group keep only the column whose label is exactly the group name.
sub_dfs["Ank"]=sub_dfs["Ank"].loc[:,"Ank"]
sub_dfs["WD40"]=sub_dfs["WD40"].loc[:,"WD40"]
sub_dfs["VCBS"]=sub_dfs["VCBS"].loc[:,"VCBS"]
del sub_dfs["other"]
for name,series in sub_dfs.items():
    sub_dfs[name]=sub_dfs[name].sort_index()


In [19]:
#Comma-separated bin comparison table read by process_binning_improvement_data.
improvements_file=os.path.join(*[core,g_drive, "BetterBins","GTDB_all_bin_comparison_csv_file.csv"])

#plots_dir is not assigned in this cell; it must already exist in the kernel.
create_improvement_barplots(improvements_file, plots_dir)

In [21]:
#Close the current matplotlib figure left open by the plotting call above.
plt.close()

In [8]:
#print m

#NOTE: `m` is only assigned in a cell further down this notebook (the one that reads
#Genome_module_completeness_matrix_all_orgs.tsv), so that cell has to be run before this one.
plot_scalings(m,plots_dir,genome_taxonomy)

In [12]:
enrich_output_dir=os.path.join(*[core,g_drive,"metabolic_analysis","enriched_hits"])
#Module completeness matrix; the first two columns of the file form the row index.
m=pd.read_csv(os.path.join(enrich_output_dir,"Genome_module_completeness_matrix_all_orgs.tsv"),\
              index_col=[0,1],skipinitialspace=True,sep="\t")

subset_dir=os.path.join(*[output_dir,"subset_data"])
test_data=os.path.join(*[subset_dir,"nitrogen-sulfur-fatty_acid-photosynthesis.tsv"])
n=pd.read_csv(test_data,sep="\t",index_col=[0])

#Simplify the string names
#Fullregex1 strips the phrases "two-component regulatory system" / "transport system".
Fullregex1="(two-component regulatory system|transport system)"
#Fullregex2 strips everything up to a ", " unless the text after it starts with one of the listed compounds.
Fullregex2="(.*,) (?!(ammonia|sulfate|nitrate|thiosulfate))"
new_levels=m.index.get_level_values(1).str.replace(Fullregex1,"").str.replace(Fullregex2,"")

#Pair every ROW's first-level label with its cleaned second-level label.
#The original zipped against m.index.levels[0], which is the sorted list of
#unique first-level values rather than the per-row values, so labels could be
#attached to the wrong description whenever the rows are not already sorted
#and unique on the first level.
new_index=pd.MultiIndex.from_tuples(list(zip(m.index.get_level_values(0),new_levels)))

m.index=new_index


In [105]:
def create_subdfs(complete_matrix, output_dir,pathways,bin_file):
    """Write one tab-separated sub-matrix per pathway into output_dir.

    complete_matrix -- matrix with a two-level row index (renamed here to
        'Module' / 'ModuleDescription'); the argument itself is not modified.
    output_dir -- directory that receives "<pathway name>.tsv" files.
    pathways -- passed, with the notebook global database_dir, to
        pathway_to_modules to obtain the modules of each pathway.
    bin_file -- passed to load_bin_names; the result is used to rename the columns.

    For every pathway the rows whose 'Module' label belongs to the pathway are
    selected, the 'Module' level is dropped and the remainder is saved.
    """
    pathway_modules=pathway_to_modules(pathways, database_dir)
    column_names=defaultdict(lambda: "No associated taxonomy.",load_bin_names(bin_file))

    matrix=complete_matrix.copy().sort_index(axis='columns')
    matrix.index.names=['Module','ModuleDescription']
    matrix=matrix.rename(columns=column_names).sort_index(axis='columns')

    #The set of modules present in the matrix is the same for every pathway.
    available_modules=set(matrix.index.get_level_values(0))
    for path_name,path_modules in pathway_modules.iteritems():
        wanted_modules=list(available_modules & set(path_modules))
        #Boolean row mask, re-indexed so that .loc aligns it with the matrix rows.
        row_mask=pd.Series(matrix.index.get_level_values('Module')).isin(wanted_modules)
        row_mask.index=matrix.index
        pathway_matrix=matrix.loc[row_mask].copy()
        pathway_matrix.index=pathway_matrix.index.droplevel('Module')
        pathway_matrix.to_csv(os.path.join(output_dir,path_name+".tsv"),sep="\t")
        
    

In [9]:
#Display the resolved output directory (built from the platform- and user-dependent `core` and `g_drive`).
output_dir

'C:\\Users\\Baker\\Google Drive\\Honours\\metabolic_analysis'

In [13]:
#Build the per-pathway module lists that the manual curation below adds to and removes from.
MO_pathways=pathway_to_modules(pathways,database_dir)

#Manual curation of the pathway -> module lists, consumed by add_remove_modules.
#  "Remove": per pathway, entries to take out of the list.
#  "Add":    per pathway, entries to put into the list.
operator_module_dict={"Remove":{'amino-acids':[],
 'CationicAntiomicrobialPeptide_CAMP_resistance':[],
 'Vancomycin_Beta-lactamResistance':[],
 'Glycosaminoglycan degradation & Synthesis':[],
 'N-Glycan biosynthesis':[],
 'two-component':["M00315","M00449","M00455","M00459","M00461",\
                  "M00462","M00467","M00468","M00471","M00472",\
                  "M00474","M00475","M00480","M00483","M00485",\
                  "M00489","M00492","M00501","M00502","M00504",\
                  "M00506","M00507","M00510","M00512","M00513",\
                  "M00516","M00517","M00645","M00651","M00716",\
                  "M00770","M00772","M00646","M00648","M00511",\
                  "M00477"],
 'vitamins&cofactors':["M00141","M00140","M00622","M00811"],
 'AminoAcidMetabolism':["M00032","M00135","M00136","M00047",\
                        "M00134","M00133","M00044","M00533",\
                        "M00043","M00042","M00545","M00350",\
                        "M00037","M00741","M00741","M00033",\
                        "M00034","M00035","M00368","M00036",\
                        "M00038","M00045"],

 'ABC transporters':["M00185","M00191","M00762","M00739",\
                     "M00635","M00607","M00606","M00605",\
                     "M00603","M00600","M00599","M00593",\
                     "M00592","M00590","M00585","M00584",\
                     "M00581","M00566","M00440","M00438",\
                     "M00436","M00423","M00325","M00323",\
                     "M00321","M00319","M00302","M00252",\
                     "M00251","M00250","M00246","M00245",\
                     "M00238","M00234","M00233","M00230",\
                     "M00228","M00227","M00225","M00224",\
                     "M00223","M00220","M00219","M00217",\
                     "M00206","M00210","M00209","M00317",\
                     "M00203","M00198","M00200"],
 'nitrogen-sulfur-fatty_acid-photosynthesis':["M00082","M00083","M00085",\
                                              "M00415","M00086","M00087",\
                                              "M00157","M00161","M00162",
                                              "M00163"],
 'carbon':[],
 'phosphotransferase system (PTS)':[],
 'oxidative_phosphorylation':[],
 'Bacterial Secretion Systems':["M00325","M00335","M00336"], #,"M00336" Tat sys
    "Sulfatases":[]
    },"Add":{'amino-acids':[],
 'CationicAntiomicrobialPeptide_CAMP_resistance':[],
 'Vancomycin_Beta-lactamResistance':[],
 'Glycosaminoglycan degradation & Synthesis':[],
 'N-Glycan biosynthesis':[],
 'two-component':[],
 'vitamins&cofactors':["L-Glutamate=>Uropor-phyrinogen III","L-Threonine=>VitaminB12Coenzyme",\
                       "Precorrin 2=>Cob(II)yrinatea,cdiamide via CoPrecorrin",\
                       "Precorrin 2=>Cob(II)yrinatea,cdiamide via Precorrin",\
                       "Uropor-phyrinogen III=>Precorrin 2"],
 #A comma was missing after "Glycine=>Serine", so Python joined it with the next
 #literal into the single entry "Glycine=>SerineGlutamate=>Proline_v2".
 'AminoAcidMetabolism':["Pyruvate=>Alanine","Aspartate=>Asparagine","Oxaloacetate=>Aspartate",\
                            "Glutamate=>Glutamine","2-Oxoglutarate=>GlutamicAcid",\
                           "Threonine=>Glycine","Serine=>Glycine",
                           "Ornithine=>Arginine","Hydroxy-Pyruvate=>Serine","Glycine=>Serine",\
                       "Glutamate=>Proline_v2","Phenylalanine=>Tyrosine","Ornithine=>Proline"],
 'ABC transporters':[],
 'nitrogen-sulfur-fatty_acid-photosynthesis':["DMSP=>3-(Methylthio)-propanoate(dmdA)",
                                             "DMSP=>DMS","DMSO=>DMS","DMS=>DMSO","Nitrite=>Nitrate"],
 'carbon':[],
 'phosphotransferase system (PTS)':[],
 'oxidative_phosphorylation':[],
 'Bacterial Secretion Systems':[],
  "Sulfatases":["arylsulfatase B",
    "N-sulfoglucosamine sulfohydrolase",
    "N-acetylglucosamine-6-sulfatase",
    "N-acetylgalactosamine-6-sulfatase",
                "iduronate 2-sulfatase"]
    }}

def add_remove_modules(pathway_modules,operator_dictionary):
    """Apply manual additions and removals to per-pathway module lists.

    pathway_modules -- mapping of pathway name -> iterable of entries. It is
        modified in place and also returned. Every pathway named in
        operator_dictionary is looked up with pathway_modules[pathway], so a
        plain dict must already contain it (KeyError otherwise).
    operator_dictionary -- mapping with the optional keys "Add" and "Remove",
        each holding a mapping of pathway name -> entries to add / remove.
        Any other key is ignored.

    "Add" is always applied before "Remove", so an entry listed under both
    ends up removed regardless of dict ordering. Each resulting list has no
    duplicates and is sorted, which makes the output reproducible between runs.
    """
    set_operations=(("Add",set.union),("Remove",set.difference))
    for operator,combine in set_operations:
        #.items() works on both Python 2 and Python 3 dicts.
        for pathway,items in operator_dictionary.get(operator,{}).items():
            pathway_modules[pathway]=sorted(combine(set(pathway_modules[pathway]),set(items)))

    return pathway_modules

#Apply the manual additions/removals in operator_module_dict to the pathway module lists.
MO_pathways=add_remove_modules(MO_pathways,operator_module_dict)

In [20]:
#Plot the module matrix `m` for every pathway in the curated MO_pathways; files go to <output_dir>/Plots.
plot_all_old_heatmaps(m, os.path.join(output_dir,"Plots"),MO_pathways,uniq_tax_file,"microbes_modules_proportion",transpose=True)
#plot_all_heatmaps(m, os.path.join(output_dir,"Plots"),pathways,uniq_tax_file,"microbes_modules_proportion",[0,20,50,60,75,85,100],True)

#micro_data=make_microbe_df(m)
#plot_all_heatmaps(micro_data, os.path.join(output_dir,"Plots"),pathways,tax_file,"grouped_microbes_modules_proportion",True)

#plot_euk_enrichment_data()

#plot_dimensional_scalings()
#Close the current figure once the plots have been written.
plt.close()

Saving the file C:\Users\Baker\Google Drive\Honours\metabolic_analysis\Plots\oxidative_phosphorylation_microbes_modules_proportion.eps
Saving the file C:\Users\Baker\Google Drive\Honours\metabolic_analysis\Plots\oxidative_phosphorylation_microbes_modules_proportion.svg
Saving the file C:\Users\Baker\Google Drive\Honours\metabolic_analysis\Plots\oxidative_phosphorylation_microbes_modules_proportion.pdf
Saving the file C:\Users\Baker\Google Drive\Honours\metabolic_analysis\Plots\oxidative_phosphorylation_microbes_modules_proportion.png
Saving the file C:\Users\Baker\Google Drive\Honours\metabolic_analysis\Plots\Bacterial Secretion Systems_microbes_modules_proportion.eps
Saving the file C:\Users\Baker\Google Drive\Honours\metabolic_analysis\Plots\Bacterial Secretion Systems_microbes_modules_proportion.svg
Saving the file C:\Users\Baker\Google Drive\Honours\metabolic_analysis\Plots\Bacterial Secretion Systems_microbes_modules_proportion.pdf
Saving the file C:\Users\Baker\Google Drive\Honou

In [None]:
#Close the current matplotlib figure.
plt.close()

In [59]:
#Placeholder values; neither name is read again in the visible part of the notebook.
enriched_tax_data_ramciotti="dummy"
enriched_tax_data_plutea=""
unenriched_tax_file=os.path.join(*[core,g_drive,"Honours_Final_report","Relevant_data","926_1392_SeaquenceAllCorals_DNAExtractionTest_16Samplicons_forheatmap.txt"])
#The first and the last column of the file together form the row index.
unenriched_tax_data=pd.read_csv(unenriched_tax_file, sep="\t",index_col=[0,-1])

In [8]:
def non_zero_row(row):
    """Return True when at least one value of the row is greater than zero."""
    positive=row>0
    return bool(positive.any())

def prepare_unenriched_otu_hits(unenriched_tax_data,threshold):
    """Turn a raw OTU count table into a percentage table keyed by a short taxonomy label.

    unenriched_tax_data -- count table whose row index has the levels "OTU"
        and "Top Blast Hit"; the columns are samples.
    threshold -- rows are kept only if at least one sample reaches this
        percentage (see remove_less_than).

    Steps: drop rows without any positive count; shorten the taxonomy string;
    make the shortened labels unique with make_uniq_id; scale every column to
    sum to 100 and round to two decimals; apply the threshold; rename the
    columns to "P. lutea-<text after the last '.'>"; sort columns and rows.

    NOTE: the taxonomy clean-up relies on Series.str.replace treating its
    pattern as a regular expression.
    """
    #First, remove all rows with zero hits
    #(.loc replaces the .ix indexer, which newer pandas no longer provides.)
    unenriched_tax_data=unenriched_tax_data.loc[unenriched_tax_data.apply(non_zero_row,axis=1),:]
    #Regex steps, in order: drop rank prefixes that carry no name, drop spaces,
    #drop a trailing ';', drop the "k__...;" field, then collapse everything
    #between the first and the last ';' so only the two outer fields remain.
    simplified_tax=unenriched_tax_data.index.get_level_values("Top Blast Hit").str.replace("[gsfocpdk]__(?:;|$|\n)",'')\
    .str.replace(" ","").str.replace(";$","").str.replace("k__.*?;","").str.replace(";(.*);",";")

    unenriched_tax_data=unenriched_tax_data.reset_index()
    unenriched_tax_data["Top Blast Hit"]=simplified_tax

    #For duplicate ids add an integer to make it unique
    make_uniq_id(unenriched_tax_data,"Top Blast Hit")
    del unenriched_tax_data["OTU"]
    unenriched_tax_data=unenriched_tax_data.set_index(keys=["Top Blast Hit"])
    #Normalise the table
    unenriched_tax_data=unenriched_tax_data.divide(unenriched_tax_data.sum(axis=0)/100, axis='columns')
    unenriched_tax_data=unenriched_tax_data.round(decimals=2)
    unenriched_tax_data=remove_less_than(unenriched_tax_data,threshold)

    #e.g. u'PL.Pre.Lysis.1' -> "P. lutea-1"
    new_titles={existing_title:"P. lutea-{0}".format(existing_title.split(".")[-1])
                for existing_title in unenriched_tax_data.columns}

    unenriched_tax_data=unenriched_tax_data.rename(columns=new_titles)
    unenriched_tax_data=unenriched_tax_data.sort_index(axis=1)
    unenriched_tax_data=unenriched_tax_data.sort_index(axis=0)

    return unenriched_tax_data

def remove_less_than(unenriched_tax_data,threshold):
    """Return only the rows in which at least one value is >= threshold.

    The comparison is done for the whole table at once instead of row by row,
    and .loc replaces the .ix indexer that newer pandas no longer provides.
    """
    reaches_threshold=(unenriched_tax_data>=threshold).any(axis=1)
    return unenriched_tax_data.loc[reaches_threshold,:]

def make_uniq_id(unenriched_tax_data,column):
    """Make the values of `column` unique by appending "_<n>", in place.

    n is the running occurrence number of the value, counted from 1 in row
    order, so the first "a" becomes "a_1", the second "a_2", and so on. Every
    value receives a suffix, including values that occur only once.

    The frame is modified in place; nothing is returned.
    """
    #cumcount numbers the rows of each group 0,1,2,... in their original order.
    occurrence=unenriched_tax_data.groupby(column).cumcount()+1
    unenriched_tax_data[column]=unenriched_tax_data[column]+"_"+occurrence.astype(str)
    return

def prepare_enriched_otus():
    """Placeholder: performs no work and returns None."""
    return None

def _clean_enriched_otu_table(otu_table,contaminants):
    """Drop empty rows, index the table by a short unique taxonomy label and remove contaminants.

    otu_table -- count table with a two-level row index (id level first,
        taxonomy level second); the argument itself is not modified.
    contaminants -- patterns joined with "|" and matched against the
        shortened labels; matching rows are removed. An empty list removes
        nothing (joined, it would give the pattern "", which matches every
        row and would empty the table).

    NOTE: the taxonomy clean-up relies on Series.str.replace treating its
    pattern as a regular expression.
    NOTE(review): contaminants are matched AFTER the label has been shortened
    to its outer fields, so a contaminant rank that was collapsed away cannot
    match -- confirm this is intended.
    """
    #First, remove all rows with zero hits
    #(.loc replaces the .ix indexer, which newer pandas no longer provides.)
    otu_table=otu_table.loc[otu_table.apply(non_zero_row,axis=1),:]

    ID,taxa_col=otu_table.index.names
    #Regex steps, in order: drop rank prefixes that carry no name, drop spaces,
    #drop "Root;" and the domain field, drop a trailing ';', drop the "k__...;"
    #field, then collapse everything between the first and the last ';'.
    simplified_tax=otu_table.index.get_level_values(taxa_col).str.replace("[gsfocpdk]__(?:;|$|\n)",'')\
    .str.replace(" ","").str.replace("Root;","").str.replace("(d__Bacteria;|d__Archaea;)","").str.replace(";$","").str.replace("k__.*?;","").str.replace(";(.*);",";")

    otu_table=otu_table.reset_index()
    otu_table[taxa_col]=simplified_tax

    #For duplicate ids add an integer to make it unique
    make_uniq_id(otu_table,taxa_col)
    del otu_table[ID]
    otu_table=otu_table.set_index(keys=[taxa_col])

    #Remove contaminants
    if len(contaminants)>0:
        otu_table=otu_table.iloc[~otu_table.index.str.contains("|".join(contaminants))]
    return otu_table

def _percent_filter_and_sort(otu_table,threshold):
    """Scale every column to sum to 100, round to 2 decimals, apply the threshold and sort both axes."""
    #Normalise the table
    otu_table=otu_table.divide(otu_table.sum(axis=0)/100, axis='columns')
    otu_table=otu_table.round(decimals=2)
    otu_table=remove_less_than(otu_table,threshold)
    otu_table=otu_table.sort_index(axis=1)
    otu_table=otu_table.sort_index(axis=0)
    return otu_table

def prepare_enriched_otu_hits(unenriched_tax_data,threshold,contaminants):
    """Clean one enriched OTU count table and convert it to percentages.

    unenriched_tax_data -- count table with a two-level row index (id,
        taxonomy); columns are samples.
    threshold -- rows are kept only if at least one sample reaches this percentage.
    contaminants -- list of patterns; rows whose shortened label matches any
        of them are removed before normalisation.

    Returns the percentage table indexed by the shortened, uniquified label,
    sorted by column and by row.
    """
    cleaned=_clean_enriched_otu_table(unenriched_tax_data,contaminants)
    return _percent_filter_and_sort(cleaned,threshold)

def prep_n_merge_enriched_otu_hits(list_of_enriched_data,threshold,contaminants):
    """Clean several enriched OTU count tables, merge them and convert to percentages.

    list_of_enriched_data -- list of count tables, each with a two-level row
        index (id, taxonomy). The list is no longer overwritten with the
        cleaned tables; the caller's objects are left untouched.
    threshold, contaminants -- as for prepare_enriched_otu_hits.

    The cleaned tables are outer-joined on their row label (labels missing
    from a table count as 0) before the joint table is normalised, so the
    percentages refer to the merged columns.
    """
    cleaned_tables=[_clean_enriched_otu_table(single_df,contaminants)
                    for single_df in list_of_enriched_data]

    merged=pd.DataFrame([])
    for single_df in cleaned_tables:
        merged=merged.join(single_df,how="outer").fillna(0)

    return _percent_filter_and_sort(merged,threshold)

def clean_column_names_remove_crap(merged_df,key_file,removal_pattern,threshold):
    """Rename sample columns via a key file, drop unwanted columns and re-apply the threshold.

    merged_df -- table whose columns are samples.
    key_file -- tab-separated file with the columns "Sample_id" and
        "Sample_name". For every row, the first column of merged_df whose
        label contains the Sample_id is renamed to the Sample_name
        (IndexError when no column matches).
    removal_pattern -- columns whose new label matches this pattern are dropped.
    threshold -- passed to remove_less_than after the columns were dropped.
    """
    sample_key=pd.read_csv(key_file,sep="\t")
    new_names={}
    for _,key_row in sample_key.iterrows():
        is_match=merged_df.columns.str.contains(key_row["Sample_id"])
        first_match=merged_df.columns[is_match].values[0]
        new_names[first_match]=key_row["Sample_name"]
    renamed=merged_df.rename(columns=new_names)
    keep_column=~renamed.columns.str.contains(removal_pattern)
    return remove_less_than(renamed.iloc[:,keep_column],threshold)
    

In [9]:
#Load the three count tables; in each, the first and the last column form the row index.
ramciotti_data_file=os.path.join(*[core,g_drive,"Honours_Final_report","Relevant_data","all_ramciotti_combined_count_table.txt"])
ramciotti_data=pd.read_csv(ramciotti_data_file,sep="\t",index_col=[0,-1])

plutea_data_file=os.path.join(*[core,g_drive,"Honours_Final_report","Relevant_data","13206to13217_graftm_combined_otu_table.csv"])
plutea_data=pd.read_csv(plutea_data_file,sep="\t",index_col=[0,-1])

unenriched_tax_file=os.path.join(*[core,g_drive,"Honours_Final_report","Relevant_data","926_1392_SeaquenceAllCorals_DNAExtractionTest_16Samplicons_forheatmap.txt"])
unenriched_tax_data=pd.read_csv(unenriched_tax_file, sep="\t",index_col=[0,-1])

#Each enriched table cleaned on its own (threshold 1, same contaminant list as the merged call below).
cleaned_ramciotti_data=prepare_enriched_otu_hits(ramciotti_data,1,["o__Rickettsiales","k__Eukarya","p__Chlorobi","p__Tenericutes"])
cleaned_plutea_data=prepare_enriched_otu_hits(plutea_data,1,["o__Rickettsiales","k__Eukarya","p__Chlorobi","p__Tenericutes"])

#The raw unenriched table is replaced by its processed form under the same name.
unenriched_tax_data=prepare_unenriched_otu_hits(unenriched_tax_data,1)
#print unenriched_tax_data.shape
#print unenriched_tax_data.index
#print set(cleaned_plutea_data.index) & set(cleaned_ramciotti_data.index)
#print results.index
#Both enriched tables cleaned, merged and normalised together.
results=prep_n_merge_enriched_otu_hits([ramciotti_data,plutea_data],1,["o__Rickettsiales","k__Eukarya","p__Chlorobi","p__Tenericutes"])
#print results.loc['p__Chloroflexi;c__Anaerolineae_1',:]

#cleaned_plutea_data.merge(,how="outer")
coral_key_file=os.path.join(*[core,g_drive,"Honours_Final_report","Relevant_data","coral_key.txt"])
#Columns whose (renamed) label matches this pattern are dropped from the merged table.
removal_pattern="(Porites|Gate)"
merged_metagenome_data=clean_column_names_remove_crap(results,coral_key_file,removal_pattern,1)



In [93]:
#print merged_metagenome_data.shape

#merged_metagenome_data=remove_less_than(merged_metagenome_data,1)
 
#Show (rows, columns) of the merged table after renaming, column removal and thresholding.
print merged_metagenome_data.shape

(61, 19)


In [117]:
#NOTE: the saved output of this cell is a NameError for merged_metagenome_data; that
#name is created by the cell that calls clean_column_names_remove_crap, which must run first.
fname_core="taxonomic_heatmap"
taxa_type="graftm_enriched"
#presumably the boundaries between colour classes, in percent -- confirm against plot_discrete_heatmap
ticks=[0,0.5,1,2,3,4,5,10,15,20,25]
#The "" sits in the position where create_multiheader_hmm_discrete_heatmap passes its cmap.
plot_discrete_heatmap(merged_metagenome_data,ticks,"",plots_dir,taxa_type,fname_core)

NameError: name 'merged_metagenome_data' is not defined

In [99]:
#Heatmap of the processed unenriched table; the same cell appears again at the end of this section.
fname_core="taxonomic_heatmap"
taxa_type="16S_unenriched"
#presumably the boundaries between colour classes, in percent -- confirm against plot_discrete_heatmap
ticks=[0,0.5,1,2,3,4,5,10,15,20,25]
plot_discrete_heatmap(unenriched_tax_data,ticks,"",plots_dir,taxa_type,fname_core)

Saving the file C:\Users\Baker\Google Drive\Honours\metabolic_analysis\Plots\16S_unenriched_taxonomic_heatmap.eps
Saving the file C:\Users\Baker\Google Drive\Honours\metabolic_analysis\Plots\16S_unenriched_taxonomic_heatmap.svg
Saving the file C:\Users\Baker\Google Drive\Honours\metabolic_analysis\Plots\16S_unenriched_taxonomic_heatmap.pdf
Saving the file C:\Users\Baker\Google Drive\Honours\metabolic_analysis\Plots\16S_unenriched_taxonomic_heatmap.png


# Abundance parsing

In [122]:
#Input files for the abundance section; they are read by the load_* helpers defined below.
contig_coverage_file=os.path.join(*[core,g_drive,"parse_coverage","bin_contig_sep_coverages.tsv"])

total_read_counts=os.path.join(*[core,g_drive,"parse_coverage","final_bin_read_counts.tsv"])

key_file=os.path.join(*[core,g_drive,"Honours_Final_report","contig_file_type.txt"])

def load_contig_values(contig_cov_file):
    """Read the tab-separated contig coverage table; its first two columns form the row index."""
    return pd.read_csv(contig_cov_file,sep="\t",index_col=[0,1])

def load_total_counts_file(tot_read_counts):
    """Read the tab-separated read-count table; its first column becomes the row index."""
    return pd.read_csv(tot_read_counts,sep="\t",index_col=[0])

def load_type_key(key_file):
    """Read the tab-separated sample key table; its first column becomes the row index."""
    return pd.read_csv(key_file,index_col=[0],sep="\t")

def calculate_coverage(ctg_file):
    '''Return the length-weighted mean coverage of every bin as a fraction of each sample's total.

    ctg_file -- per-contig table indexed by (bin, contig). Its first column
        holds the contig length and every further column the coverage in one
        sample. Any number of sample columns is accepted (the previous version
        required exactly 11) and the table is no longer modified in place.

    Steps: weight each contig's coverage by its length; sum per bin and divide
    by the bin's summed length; divide every column by its sum; replace the
    bin labels by their "U_#####" part and translate them through the
    notebook global uniq_genome_taxonomy; replace the column labels by their
    "<name>.bam" part and translate them through type_key["Sample.1"]; average
    columns that end up with the same label.
    '''
    contig_lengths=ctg_file[ctg_file.columns[0]]
    #Length-weighted coverage per contig, computed on a new frame.
    weighted_coverage=ctg_file[ctg_file.columns[1:]].multiply(contig_lengths,axis=0)
    bin_weighted_coverage=weighted_coverage.groupby(level=[0]).sum()
    bin_lengths=contig_lengths.groupby(level=[0]).sum()
    coverage_values=bin_weighted_coverage.divide(bin_lengths,axis=0)
    #Express every bin as a fraction of the summed coverage of its sample column.
    coverage_values=coverage_values.div(coverage_values.sum(axis=0),axis=1)
    coverage_values.index=coverage_values.index.str.extract("(U_[0-9]{5})")
    coverage_values=coverage_values.rename(index=uniq_genome_taxonomy)
    coverage_values.columns=coverage_values.columns.str.extract("([^/]+\.bam)")
    coverage_values=coverage_values.rename(columns=type_key["Sample.1"])
    coverage_values=coverage_values.groupby(level=0,axis=1).mean()
    return coverage_values


ctg_file=load_contig_values(contig_coverage_file)

cts_file=load_total_counts_file(total_read_counts)

type_key=load_type_key(key_file)
    
merged_info=pd.DataFrame.join(cts_file,type_key)

#The statements below repeat the body of calculate_coverage at top level.
#The literal 11 has to equal the number of columns after the first one for the
#element-wise multiplication and division to line up.
length_column=ctg_file[[ctg_file.columns[0]]*11]
#This overwrites the coverage columns of ctg_file with coverage*length.
ctg_file[ctg_file.columns[1:]]=ctg_file[ctg_file.columns[1:]]*length_column.values
total_file=ctg_file.groupby(level=[0]).sum()
coverage_values=total_file
#Summed (coverage*length) divided by summed length for each first-level index group.
coverage_values=coverage_values[coverage_values.columns[1:]]/coverage_values[["Length"]*11].values
#Scale every column so that it sums to 1.
coverage_values=coverage_values.div(coverage_values.sum(axis=0),axis=1)
coverage_values.index=coverage_values.index.str.extract("(U_[0-9]{5})")
#uniq_genome_taxonomy is not assigned in this cell; it must already exist in the kernel.
coverage_values=coverage_values.rename(index=uniq_genome_taxonomy)
coverage_values.columns=coverage_values.columns.str.extract("([^/]+\.bam)")
coverage_values=coverage_values.rename(columns=type_key["Sample.1"])
#Average the columns that share a label after renaming.
coverage_values=coverage_values.groupby(level=0,axis=1).mean()



In [123]:
#Display the per-bin coverage fractions computed in the previous cell.
coverage_values

Unnamed: 0_level_0,P.lutea_1,P.lutea_2,P.lutea_3
bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Rhodospirillales_1,0.0038,0.001294,0.010584
Poribacteria_1,0.015171,0.001941,0.020469
Syntrophobacteraceae_1,0.012778,0.001142,0.017976
Gemm-2_1,0.009072,0.000993,0.010033
PAUC26f_1,0.012849,0.001167,0.013751
Endozoicomonadaceae_1,0.009785,0.816135,0.006757
Acidobacteria_1,0.014266,0.001046,0.021222
Gemm-2_2,0.016072,0.001473,0.02395
Poribacteria_2,0.010642,0.003923,0.023129
Rhodothermaceae_3,0.00333,0.000924,0.009902


In [143]:
fname_core="taxonomic_heatmap"
taxa_type="genome_bins_mapped"
#presumably the boundaries between colour classes, in percent -- confirm against plot_discrete_heatmap
ticks=[0,0.5,1,2,3,4,5,10,15,20,25]
#coverage_values holds fractions of each column total; *100 converts them to percent.
plot_discrete_heatmap(coverage_values*100,ticks,"",plots_dir,taxa_type,fname_core)

Saving the file C:\Users\Alex\Google Drive\Honours\metabolic_analysis\Plots\genome_bins_mapped_taxonomic_heatmap.eps
Saving the file C:\Users\Alex\Google Drive\Honours\metabolic_analysis\Plots\genome_bins_mapped_taxonomic_heatmap.svg
Saving the file C:\Users\Alex\Google Drive\Honours\metabolic_analysis\Plots\genome_bins_mapped_taxonomic_heatmap.pdf
Saving the file C:\Users\Alex\Google Drive\Honours\metabolic_analysis\Plots\genome_bins_mapped_taxonomic_heatmap.png


In [10]:
#Same call as the earlier "16S_unenriched" heatmap cell; it writes the same output files again.
fname_core="taxonomic_heatmap"
taxa_type="16S_unenriched"
#presumably the boundaries between colour classes, in percent -- confirm against plot_discrete_heatmap
ticks=[0,0.5,1,2,3,4,5,10,15,20,25]
plot_discrete_heatmap(unenriched_tax_data,ticks,"",plots_dir,taxa_type,fname_core)

Saving the file C:\Users\Baker\Google Drive\Honours\metabolic_analysis\Plots\16S_unenriched_taxonomic_heatmap.eps
Saving the file C:\Users\Baker\Google Drive\Honours\metabolic_analysis\Plots\16S_unenriched_taxonomic_heatmap.svg
Saving the file C:\Users\Baker\Google Drive\Honours\metabolic_analysis\Plots\16S_unenriched_taxonomic_heatmap.pdf
Saving the file C:\Users\Baker\Google Drive\Honours\metabolic_analysis\Plots\16S_unenriched_taxonomic_heatmap.png
