## Define helper functions
### Biopython style `UstacksCommandLine` class

In [6]:
import numpy as np 
import collections
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.pylab as py
import os
import csv
from collections import Counter
import re
import subprocess

In [2]:
from Bio.Application import _Option, AbstractCommandline, _Switch 

class UstacksCommandLine(AbstractCommandline):
    def __init__(self, cmd="ustacks", **kwargs):
        """
        # Any ustacks option and switch
        >>> cline_args = {'t': 'fastq',
        ...               'f': '../../dan_data/OBY01_1.fil.fq_1',
        ...               'o': '../../dan_data/OBY01_1.fi/M_tests/M_1',
        ...               'i': 1,
        ...               'm': 6,
        ...               'M': 1,
        ...               'p': 3,
        ...               'r': True,
        ...               'd': True}
        
        # This creates a command line object
        >>> ustacks_cline = UstacksCommandLine(**cline_args)
        
        # This prints a string representation
        >>> print ustacks_cline
        ustacks -t fastq -f ../../dan_data/OBY01_1.fil.fq_1 -o ../../dan_data/OBY01_1.fi/M_tests/M_1 -i 1 -m 6 -M 1 -p 3 -r -d
        
        # This runs ustacks
        >>> stdout, stderr = ustacks_cline()
        
        # This prints the first line in the allele files of the example reuslts
        >>> with open('../../dan_data/OBY01_1.fi/M_tests/M_1/OBY01_1.fil.alleles.tsv') as alleles:
        ...     print alleles.readlines()[1].rstrip().split()
        ['0', '1', '1', 'C', '78.04', '167']
        """
        
        self.parameters = [
            _Option(["-t", "t"],
                    "input file Type. Supported types: fasta, fastq, gzfasta, or gzfastq",
                    is_required=True,
                    checker_function=lambda value : value in ["fasta", 
                                                              "fastq",
                                                              "gzfasta",
                                                              "gzfastq"],
                    equate=False),
            _Option(["-f", "f"],
                    "input file path",
                    is_required=True,
                    filename=True,
                    equate=False),
            _Option(["-o", "o"],
                    "output path to write results",
                    is_required=False,
                    filename=True,
                    equate=False),
            _Option(["-i", "i"],
                    "SQL ID to insert into the output to identify this sample",
                    is_required=False,
                    equate=False),
            _Option(["-m", "m"],
                    "Minimum depth of coverage required to create a stack (default 3)",
                    is_required=False,
                    checker_function=lambda value : isinstance(value,int),
                    equate=False),
            _Option(["-M", "M"],
                    "Maximum distance (in nucleotides) allowed between stacks (default 2)",
                    is_required=False,
                    checker_function=lambda value : isinstance(value,int),
                    equate=False),
            _Option(["-N", "N"],
                    "Maximum distance allowed to align secondary reads to primary stacks (default: M + 2)",
                    is_required=False,
                    checker_function=lambda value : isinstance(value,int),
                    equate=False),
            _Switch(["-R", "R"],
                    "retain unused reads"), 
            _Switch(["-H", "H"],
                    "disable calling haplotypes from secondary reads"),
            _Option(["-p", "p"],
                    "enable parallel execution with num_threads threads",
                    is_required=False,
                    checker_function=lambda value : isinstance(value,int),
                    equate=False),  
            _Switch(["-h", "h"],
                    "display this help messsage"),
            
            # Stack assembly options:
            _Switch(["-r", "r"],
                    "enable the Removal algorithm, to drop highly-repetitive stacks (and nearby errors) from the algorithm"), 
            _Switch(["-d", "d"],
                    "enable the Deleveraging algorithm, used for resolving over merged tags"),
            _Option(["--max_locus_stacks", "-max_locus_stacks","max_locus_stacks"],
                    "maximum number of stacks at a single de novo locus (default 3)",
                    is_required=False,
                    checker_function=lambda value : isinstance(value,int),
                    equate=False),
            
            # Model options:
            _Option(["--model_type", "-model_type","model_type"],
                    "either 'snp' (default), 'bounded', or 'fixed'",
                    is_required=False,
                    checker_function=lambda value : value in ["snp", 
                                                              "bounded",
                                                              "fixed"],
                    equate=False),
            
            # For the SNP or Bounded SNP model: 
            _Option(["--alpha", "-alpha","alpha"],
                    "chi square significance level required to call a heterozygote or homozygote, either 0.1, 0.05 (default), 0.01, or 0.001",
                    is_required=False,
                    checker_function=lambda value : value in [0.1, 
                                                              0.05,
                                                              0.01,
                                                              0.001],
                    equate=False),
            
            
            # For the Bounded SNP model:
            _Option(["--bound_low", "-bound_low","bound_low"],
                    "lower bound for epsilon, the error rate, between 0 and 1.0 (default 0)",
                    is_required=False,
                    checker_function=lambda value : 0 <= value <= 1,
                    equate=False),
            _Option(["--bound_high", "-bound_high","bound_high"],
                    "upper bound for epsilon, the error rate, between 0 and 1.0 (default 1)",
                    is_required=False,
                    checker_function=lambda value : 0 <= value <= 1,
                    equate=False),
            
            # For the Fixed model:
            _Option(["--bc_err_freq", "-bc_err_freq","bc_err_freq"],
                    "specify the barcode error frequency, between 0 and 1.0",
                    is_required=False,
                    checker_function=lambda value : 0 <= value <= 1,
                    equate=False)
                           ]
        AbstractCommandline.__init__(self, cmd, **kwargs) 

## With this uncommented, the examples in the 
## docstring will be run and the outputs compared
#if __name__ == "__main__":
#    import doctest
#    doctest.testmod()

## Define the incrementation and helper functions

In [3]:
stop_value = 2

stop_value = stop_value+1
    

print range(1, stop_value, 1)
print stop_value

[1, 2]
3


In [12]:
def replace_all(text,dic): ## this function takes the string to be searched and a dictionary of replacements
    for i, j in dic.iteritems(): #iteritems needed when using dic entries - remember this
        text = text.replace(i,j) ## for each dictionary entry, replace the "key" with the "value" in the text
    return text

## Define a cool little helper function for "natural sorting". (no idea how it works though!)
def natural_key(string_): 
    return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_)]


def make_and_run_command_lines(param, param_values, parent_dir_path, sample, file_format, threads, ID):
    
    # fixed values
    default = {'M': 3, 
               'm': 6}
              # MS as stacks default
    
    batch_params = {}
    
    for val in param_values:
        
        # make a directory
        #parent_dir_path + str(sample[:-6])+ '/M_tests/M_x/
        dirname = '%s%s/%s_tests/%s_%i'%(parent_dir_path,str(sample[:-6]),param,param,val)
        if not os.path.exists(dirname):
            try:
                os.makedirs(dirname)
            except:
                os.mkdir(dirname)
            
        # make command line
        args = {}
        args['t'] = file_format
        args['p'] = threads
        args['d'] = True
        args['r'] = True
        args['i'] = str(ID)
        args['f'] = "%s%s"%(parent_dir_path,str(sample))
        args['o'] = dirname
        if param == 'MS':
            args['max_locus_stacks'] = val
            args.update(default)
        else:
            args[param] = val
            for key in default:
                if not key == param:
                    args[key] = default[key]
                    
        cline = UstacksCommandLine(**args)
        print str(cline)
        stderr, stdout = cline()
        batch_params["%s_%i"%(args['i'], val)] = args
        
        
    return batch_params          
                            
        
             


def IncreMental_U(param, start_value, stop_value, increment, parent_dir_path, file_format, threads):
## parent_dir_path - the path to look for raw read .fastq files must have the trailing '/' in it.
    
    
    stop_value = stop_value+1
    
    ##lists and command templates 
    sample_names = [] 
    param_values = range(start_value, stop_value, increment)
    
    ## Define subfunctions ---------------------------------------------------------
            
    ## TAG COUNTER ##         
    def Tag_counter(directory, parameter):
        c = Counter()
        sample_names = []
        tag_data = []
        names = []
    
    
        ## Function to count the length of a file
    
        def file_len(data):
                for i, l in enumerate(data):
                    pass
                return i + 1
    

        ## get file names and paths
        if parameter == 'm':
            for root, dirs, files in os.walk(parent_dir_path):
                for fil in files:
                    if fil.endswith("tags.tsv") and 'catalog' not in fil and root.split('/')[11].startswith('m'):
                        print fil
                        data_file = open(str(root+'/'+fil), 'r')
                        tag_tally = 0
                        for line in data_file.readlines():
                            if 'consensus' in line:
                                tag_tally +=1
                        tag_data.append(str(root.split('/')[-1])+'/'+str(fil)+"\t"+str(tag_tally)) ## works nicely just put this into a list with sample name and its nearly done
                        print data_file
            tag_data = sorted(tag_data, key = natural_key)
            f = open(directory + "m_tests_Tag_numbers.txt", 'w')
            for i in tag_data:
                print i
                f.write(i+"\n")
            f.close()

    
            ## get the sample names
            for i in tag_data:
                names.append(i.split('\t')[0].split('/')[-1])
            names = set(names)
            print names    
            
            ##plot the figures
            fig = plt.figure()
            plt.subplots_adjust(hspace = 0.5) ## adjust subplots
            plot_number =1
            for i in names:
                tag_numbers = []
                for item in tag_data:
                    if i in item:
                        tag_numbers.append(int(item.split('\t')[1]))
                print "TAG COUNTER\n", i, tag_numbers, "stop_value = ", stop_value
                fig.add_subplot(np.round((len(names)/2)+1),2,plot_number)
                plt.scatter(range(start_value, stop_value, increment),tag_numbers) ## Manually input the same parameter range used in incremental IncreMental here for the x axis labels.
                plt.plot(range(start_value, stop_value, increment),tag_numbers)
                plt.title(i+" Change in number of tags with m incrementation", fontsize = 5)
                plt.xlabel("Ustacks m Value", fontsize = 10)
                plt.ylabel("Number of tags", fontsize = 10)
                plt.xticks(fontsize = 7)
                plt.yticks(fontsize = 7)
        
                tag_numbers = []
                plot_number+=1
            plt.savefig(directory + "m_tests_Tags_per_sample.pdf")
            plt.close('all')  
            
        elif parameter == 'M':
            for root, dirs, files in os.walk(parent_dir_path):
                for fil in files:
                    if fil.endswith("tags.tsv") and 'catalog' not in fil and root.split('/')[-1].startswith('M'):
                        print fil
                        data_file = open(str(root+'/'+fil), 'r')
                        tag_tally = 0
                        for line in data_file.readlines():
                            if 'consensus' in line:
                                tag_tally +=1
                        tag_data.append(str(root.split('/')[-1])+'/'+str(fil)+"\t"+str(tag_tally)) ## works nicely just put this into a list with sample name and its nearly done
                        print data_file
            tag_data = sorted(tag_data, key = natural_key)
            f = open(directory + "M_tests_Tag_numbers.txt", 'w')
            for i in tag_data:
                print i
                f.write(i+"\n")
            f.close()

    
            ## get the sample names
            for i in tag_data:
                names.append(i.split('\t')[0].split('/')[-1])
            names = set(names)
            print names    
            ##plot the figures
            fig = plt.figure()
            plt.subplots_adjust(hspace = 0.5) ## adjust subplots
            plot_number =1
            for i in names:
                tag_numbers = []
                for item in tag_data:
                    if i in item:
                        tag_numbers.append(int(item.split('\t')[1]))
                print "TAG COUNTER\n", i, tag_numbers
                fig.add_subplot(np.round((len(names)/2)+1),2,plot_number)
                plt.scatter(range(start_value, stop_value, increment),tag_numbers) ## Manually input the same parameter range used in incremental IncreMental here for the x axis labels.
                plt.plot(range(start_value, stop_value, increment),tag_numbers)
                plt.title(i+" Change in number of tags with M incrementation", fontsize = 5)
                plt.xlabel("Ustacks m Value", fontsize = 10)
                plt.ylabel("Number of tags", fontsize = 10)
                plt.xticks(fontsize = 7)
                plt.yticks(fontsize = 7)
        
                tag_numbers = []
                plot_number+=1
            plt.savefig(directory + "M_tests_Tags_per_sample.pdf")
            plt.close('all')
            
        ## Max-stacks-per-locus ###
        
        elif parameter == "MS":
            for root, dirs, files in os.walk(parent_dir_path):
                for fil in files:
                    if fil.endswith("tags.tsv") and 'catalog' not in fil and root.split('/')[-1].startswith('MS'):
                        print fil
                        data_file = open(str(root+'/'+fil), 'r')
                        tag_tally = 0
                        for line in data_file.readlines():
                            if 'consensus' in line:
                                tag_tally +=1
                        tag_data.append(str(root.split('/')[-1])+'/'+str(fil)+"\t"+str(tag_tally)) ## works nicely just put this into a list with sample name and its nearly done
                        print data_file
            tag_data = sorted(tag_data, key = natural_key)
            f = open(directory + "MS_tests_Tag_numbers.txt", 'w')
            for i in tag_data:
                print i
                f.write(i+"\n")
            f.close()
            
        ## get the sample names
            for i in tag_data:
                names.append(i.split('\t')[0].split('/')[-1])
            names = set(names)
            print names    
            
        ##plot the figures
            
            fig = plt.figure()
            plt.subplots_adjust(hspace = 0.5) ## adjust subplots
            plot_number =1
            for i in names:
                tag_numbers = []
                for item in tag_data:
                    if i in item:
                        tag_numbers.append(int(item.split('\t')[1]))
                print "TAG COUNTER\n", i, tag_numbers
                fig.add_subplot(np.round((len(names)/2)+1),2,plot_number)
                plt.scatter(range(start_value, stop_value, increment),tag_numbers) ## Manually input the same parameter range used in incremental IncreMental here for the x axis labels.
                plt.plot(range(start_value, stop_value, increment),tag_numbers)
                plt.title(i+" Change in number of tags with Max_locus_stacks incrementation", fontsize = 5)
                plt.xlabel("Ustacks M_L_S Value", fontsize = 10)
                plt.ylabel("Number of tags", fontsize = 10)
                plt.xticks(fontsize = 7)
                plt.yticks(fontsize = 7)
        
                tag_numbers = []
                plot_number+=1
            plt.savefig(directory + "MS_tests_Tags_per_sample.pdf")
            plt.close('all')
        
    
    
    ## COVERAGE COUNTER ##
    
    def coverage_counter(file_name):
        csvcol3 = []
        csvfile = open(file_name, 'rb')
        csvread = csv.reader(csvfile, delimiter ="\t") ## read csv in
    
        for line in csvread:
            # outputs in new version of stacks start with a comment 
            if not line[0].startswith('#'):
                csvcol3.append(int(line[2])) ## add the 3 column of each line to a list
        csvcol3 = [str(i) for i in csvcol3] ## convert this list of integers to strings
    
        coverage_count = collections.Counter()
        for tagID in csvcol3:
            coverage_count[tagID] += 1 ## count the number of times each tag ID occurs (i.e. the coverage)
    
        coverage_values = []
        f = open(str(file_name[:-9]+" Coverage data.txt"), 'a') ## make a new txt file for coverage data
    
        for i,j in coverage_count.iteritems():
            if len(str(j)) > 0:
                coverage_values.append(j) ## append the coverage to a list
                f.write(str(j) + '\n') ## write the coverage data to the txt file
    
        plt.hist(coverage_values, bins = max(coverage_values),range = [0, 100])
        plt.hist(coverage_values, bins = max(coverage_values),range = [0, 100])
        plt.title(file_name[2:8]+file_name[27:33])
        plt.xlabel("Coverage")
        plt.ylabel("Frequency")
        plt.text(60, 1200, "Mean tag coverage ="+ str(np.round(np.mean(coverage_values),2)))
        plt.savefig(file_name.rpartition('/')[0]+"/Coverage.pdf")
        plt.close()
    
        f.close()
        print coverage_values[:10]
        
        ## COVERAGE COUNTER LOOPER ## 
        
    def coverage_counter_looper(directory, parameter): ## make sure this is the right parent directory - i.e. contains the sample name folder created by increMental
        tsvs = []
        subdirs = []
        cov_files = []
        
        ### BIG if statement - if the parameter is M or m
    
        if parameter == 'M':
        
        ## Make a list of the files, including their paths from the current directory
        
            for root, dirs, files in os.walk(str(directory)):
                for fil in files:
                    if fil.endswith(".tags.tsv") and 'catalog' not in fil and 'M_' in root:
                        tsvs.append(str(str(root)+'/'+str(fil)))
                        subdirs.append(str(str(root)+'/'+str(dirs)))
                        print root, fil
    
            ## Execute coverage counter for all these files
    
            for tsv in tsvs:
                coverage_counter(tsv)           
    
            ## make the multiplot
            ## first get the coverage counter output files
    
            for root, dirs, files in os.walk(str(directory)):
                for fil in files:
                    if fil.endswith("data.txt") and 'catalog' not in fil and 'M_' in root:  ## have changed this so hopefully it doesn't pick up increMental_C outputs
                        cov_files.append(str(str(root)+'/'+str(fil)))
            cov_files = sorted(cov_files, key = natural_key)
            plot_number = 1

            fig = plt.figure()
            plt.subplots_adjust(hspace = 0.8)
            data = []
            for cov_file in cov_files:    
                data_file = open(cov_file, 'r')
                for i in data_file.readlines():
                    if i > 0:
                        data.append(int(i))    
    
                fig.add_subplot(np.round((len(cov_files)/3)+1),3,plot_number)
                plt.hist(data, bins = 100,range = [0, 150])
                plt.title(cov_file.split('/')[-1].partition('_')[0]+" "+ cov_file.split('/')[-2], fontsize = 5)
                py.yticks(fontsize = 5)
                py.xticks(fontsize = 5)
                plt.xlabel("Coverage", fontsize = 5)
                plt.ylabel("Frequency", fontsize = 5)
                plt.text(60, 1200, "Mean tag coverage ="+ str(np.round(float(np.mean(data)),2)), fontsize = 5)
    
                plot_number += 1

            plt.savefig(directory + 'M_tests_coverage_multiplot.pdf')    
            plt.close('all')
    
    
            print('number of coverage plots = '+ str(plot_number -1))
    
            ### Make the change-in average coverage plot
    
            sample_coverage = []
            sample_names = []
            cov_data = []
            cov_values = []



            ## get file names and paths

            for root, dirs, files in os.walk(directory):
                for fil in files:
                    if fil.endswith("data.txt") and 'M_' in root:
                        data_file = open(str(root+'/'+fil), 'r')
                        data = [int(i.lstrip('0')) for i in data_file.readlines()]
                        sample_coverage.append(fil.partition('_')[0] + "__" + root.split('/')[3]+"\t"+ str(np.round(np.mean(data),2)))

            ## make a list of uniq sample names

            for i in sample_coverage:
                sample_names.append(i.split('\t')[0].partition('_')[0])
            sample_names = set(sample_names)
            print "sample names =" 
            print sample_names

            ## Use this list to separate the data according to the sample it comes from
            ## And then plot the graphs in one file and save in parent directory
    
            fig = plt.figure() ## make fig
            plt.subplots_adjust(hspace = 0.5) ## adjust subplots
            plot_number =1
            for name in sample_names: ## for each sample
                for line in sample_coverage:
                    if name in line:
                        cov_data.append(line)
                cov_data = sorted(cov_data, key = natural_key) ## sort "naturally"
                for i in cov_data:
                    cov_values.append(float(i.split()[1]))
                print name
                print cov_values
    
            ## And now plot the graphs as subplots in a main fig. Put in parent dir.   
    
                fig.add_subplot(np.round((len(sample_names)/2)+1),2,plot_number)
                plt.scatter(range(start_value, stop_value, increment),cov_values) ## Manually input range from IncreMental here!
                plt.plot(range(start_value, stop_value, increment),cov_values)
                plt.title(name+" coverage per tag with M incrementation", fontsize = 10)
                plt.xlabel("Ustacks Parameter Value", fontsize = 10)
                plt.ylabel("Coverage per tag", fontsize = 10)
                plt.xticks(fontsize = 7)
                plt.yticks(fontsize = 7)
    
                cov_data = []
                cov_values = []
                plot_number+=1
            plt.savefig(directory + "M_tests_mean_coverage_multiplot.pdf")
            plt.close('all')
        
        elif parameter == 'm':
        
        ## Make a list of the files, including their paths from the current directory
        
            for root, dirs, files in os.walk(str(directory)):
                for fil in files:
                    if fil.endswith(".tags.tsv") and 'catalog' not in fil and 'm_' in root:
                        tsvs.append(str(str(root)+'/'+str(fil)))
                        print root, fil
    
            ## Execute coverage counter for all these files
    
            for tsv in tsvs:
                coverage_counter(tsv)           
    
            ## make the multiplot
            ## first get the coverage counter output files
    
            for root, dirs, files in os.walk(str(directory)):
                for fil in files:
                    if fil.endswith("data.txt") and 'catalog' not in fil and 'm_' in root:  ## have changed this so hopefully it doesn't pick up increMental_C outputs
                        cov_files.append(str(str(root)+'/'+str(fil)))
            cov_files = sorted(cov_files, key = natural_key)          
            plot_number = 1
            
            fig = plt.figure()
            plt.subplots_adjust(hspace = 0.8)

            for cov_file in cov_files:    
                print cov_file
                data_file = open(cov_file, 'r')
                data = [int(i) for i in data_file.readlines()]

                fig.add_subplot(np.round((len(cov_files)/3)+1),3,plot_number)
                plt.hist(data, bins = 100,range = [0, 150])
                plt.title(cov_file.split('/')[-1].partition('_')[0]+" "+ cov_file.split('/')[-3], fontsize = 5)
                py.yticks(fontsize = 5)
                py.xticks(fontsize = 5)
                plt.xlabel("Coverage", fontsize = 5)
                plt.ylabel("Frequency", fontsize = 5)
                plt.text(60, 1200, "Mean tag coverage ="+ str(np.round(float(np.mean(data)),2)), fontsize = 5)
    
                plot_number += 1

            plt.savefig(directory + 'm_tests_coverage_multiplot.pdf')    
            plt.close('all')
    
    
            print('number of coverage plots = '+ str(plot_number -1))
    
            ### Make the change-in average coverage plot
    
            sample_coverage = []
            sample_names = []
            cov_data = []
            cov_values = []


            ## get file names and paths

            for root, dirs, files in os.walk(directory):
                for fil in files:
                    if fil.endswith("data.txt") and 'm_' in root:
                        data_file = open(str(root+'/'+fil), 'r')
                        data = [int(i) for i in data_file.readlines()]
                        sample_coverage.append(fil.partition('_')[0] + "__" + root.split('/')[3]+"\t"+ str(np.round(np.mean(data),2)))

            ## make a list of uniq sample names

            for i in sample_coverage:
                sample_names.append(i.split('\t')[0].partition('_')[0])
            sample_names = set(sample_names)
            print "sample names =" 
            print sample_names

            ## Use this list to separate the data according to the sample it comes from
            ## And then plot the graphs in one file and save in parent directory
    
            fig = plt.figure() ## make fig
            plt.subplots_adjust(hspace = 0.5) ## adjust subplots
            plot_number =1
            for name in sample_names: ## for each sample
                for line in sample_coverage:
                    if name in line:
                        cov_data.append(line)
                cov_data = sorted(cov_data, key = natural_key) ## sort "naturally"
                for i in cov_data:
                    cov_values.append(float(i.split()[1]))
                print name
                print cov_values
    
            ## And now plot the graphs as subplots in a main fig. Put in parent dir.   
    
                fig.add_subplot(np.round((len(sample_names)/2)+1),2,plot_number)
                plt.scatter(range(start_value, stop_value, increment),cov_values) ## remember to change back to start_value, stop_value, increment
                plt.plot(range(start_value, stop_value, increment),cov_values)
                plt.title(name+" coverage per tag with m incrementation", fontsize = 10)
                plt.xlabel("Ustacks Parameter Value", fontsize = 10)
                plt.ylabel("Coverage per tag", fontsize = 10)
                plt.xticks(fontsize = 7)
                plt.yticks(fontsize = 7)
    
                cov_data = []
                cov_values = []
                plot_number+=1
            plt.savefig(directory + "m_tests_mean_coverage_multiplot.pdf")
            plt.close('all')
            
        elif parameter == "max_locus_stacks":
            ## Make a list of the files, including their paths from the current directory
        
            for root, dirs, files in os.walk(str(directory)):
                for fil in files:
                    if fil.endswith(".tags.tsv") and 'catalog' not in fil and 'MS_' in root:
                        tsvs.append(str(str(root)+'/'+str(fil)))
                        print root, fil
    
            ## Execute coverage counter for all these files
    
            for tsv in tsvs:
                coverage_counter(tsv)           
    
            ## make the multiplot
            ## first get the coverage counter output files
    
            for root, dirs, files in os.walk(str(directory)):
                for fil in files:
                    if fil.endswith("data.txt") and 'catalog' not in fil and 'MS_' in root:  ## have changed this so hopefully it doesn't pick up increMental_C outputs
                        cov_files.append(str(str(root)+'/'+str(fil)))
            cov_files = sorted(cov_files, key = natural_key)          
            plot_number = 1
            
            fig = plt.figure()
            plt.subplots_adjust(hspace = 0.8)

            for cov_file in cov_files:    
                print cov_file
                data_file = open(cov_file, 'r')
                data = [int(i) for i in data_file.readlines()]

                fig.add_subplot(np.round((len(cov_files)/3)+1),3,plot_number)
                plt.hist(data, bins = 100,range = [0, 150])
                plt.title(cov_file.split('/')[-1].partition('_')[0]+" "+ cov_file.split('/')[-3], fontsize = 5)
                py.yticks(fontsize = 5)
                py.xticks(fontsize = 5)
                plt.xlabel("Coverage", fontsize = 5)
                plt.ylabel("Frequency", fontsize = 5)
                plt.text(60, 1200, "Mean tag coverage ="+ str(np.round(float(np.mean(data)),2)), fontsize = 5)
    
                plot_number += 1

            plt.savefig(directory + 'MS_tests_coverage_multiplot.pdf')    
            plt.close('all')
    
    
            print('number of coverage plots = '+ str(plot_number -1))
        
        ### Make the change-in average coverage plot
    
            sample_coverage = []
            sample_names = []
            cov_data = []
            cov_values = []


            ## get file names and paths

            for root, dirs, files in os.walk(directory):
                for fil in files:
                    if fil.endswith("data.txt") and 'MS_' in root:
                        data_file = open(str(root+'/'+fil), 'r')
                        data = [int(i) for i in data_file.readlines()]
                        sample_coverage.append(fil.partition('_')[0] + "__" + root.split('/')[3]+"\t"+ str(np.round(np.mean(data),2)))

            ## make a list of uniq sample names

            for i in sample_coverage:
                sample_names.append(i.split('\t')[0].partition('_')[0])
            sample_names = set(sample_names)
            print "sample names =" 
            print sample_names

            ## Use this list to separate the data according to the sample it comes from
            ## And then plot the graphs in one file and save in parent directory
    
            fig = plt.figure() ## make fig
            plt.subplots_adjust(hspace = 0.5) ## adjust subplots
            plot_number =1
            for name in sample_names: ## for each sample
                for line in sample_coverage:
                    if name in line:
                        cov_data.append(line)
                cov_data = sorted(cov_data, key = natural_key) ## sort "naturally"
                for i in cov_data:
                    cov_values.append(float(i.split()[1]))
                print name
                print cov_values
    
            ## And now plot the graphs as subplots in a main fig. Put in parent dir.   
    
                fig.add_subplot(np.round((len(sample_names)/2)+1),2,plot_number)
                plt.scatter(range(start_value, stop_value, increment),cov_values) ## remember to change back to start_value, stop_value, increment
                plt.plot(range(start_value, stop_value, increment),cov_values)
                plt.title(name+" coverage per tag with Max_Locus_stacks incrementation", fontsize = 10)
                plt.xlabel("Ustacks M_L_S Parameter Value", fontsize = 10)
                plt.ylabel("Coverage per tag", fontsize = 10)
                plt.xticks(fontsize = 7)
                plt.yticks(fontsize = 7)
    
                cov_data = []
                cov_values = []
                plot_number+=1
            plt.savefig(directory + "MS_tests_mean_coverage_multiplot.pdf")
            plt.close('all')
   
    
    ## RUNNING PIPELINE ---------------------------------------------------------------
    
        
    ID = 1 ## Assign a different ID to each individual with for a given param value
    
    
    ## Get samples
    for i in os.listdir(parent_dir_path):
        if i.endswith(".fq_1") or i.endswith(".fq") or i.endswith("1.fastq"): 
            sample_names.append(i)
    sample_names = sorted(sample_names) ## Important line - as Incremental_C sorts as well and assigns samples to lists on the basis of their IDs they must be sorted the same way here
    
    print "samples present", sample_names
    print ("Parameter values =" + str(param_values))    

    
    analysis_settings = {}
   
    for sample in sample_names: ## Run pipeline for each sample ...
        
        analysis_settings[str(ID)] = make_and_run_command_lines(param, param_values, parent_dir_path,
                                                                sample, file_format, threads, ID)
        ID += 1

    Tag_counter(parent_dir_path, param)
    coverage_counter_looper(parent_dir_path, param)
    
    
    
    return analysis_settings

### Run the pipline here:

In [27]:
Tag_counter("/media/dan/34D5D1CE642D7E36/2013076_Hanfling_Bernd/Stacks/Stacks_analyses_TRIMMED/Thesis_Incremental_runs/", "M")

NameError: name 'Tag_counter' is not defined

In [14]:
## MS analyses

working_dir = "/media/dan/34D5D1CE642D7E36/2013076_Hanfling_Bernd/Stacks/Stacks_analyses_TRIMMED/Thesis_Incremental_runs/"

analysis_settings = {}

for root, dirs, files in os.walk(working_dir):
    for dir in dirs:
        if "TESTDIR" in dir:  ## Loop over the test directories
            
            testdir = '%s%s/' %(root, dir)
            print testdir
            analysis_settings[dir] = IncreMental_U("max_locus_stacks", 1, 5, 1, testdir, 'fastq', 7)





/media/dan/34D5D1CE642D7E36/2013076_Hanfling_Bernd/Stacks/Stacks_analyses_TRIMMED/Thesis_Incremental_runs/CRU_CYP_TESTDIR_1/
samples present ['HOLT9_RD-P1-180_1.fil.fq_1', 'RM31_RD-P1-171_1.fil.fq_1', 'RM8_RD-P1-175_1.fil.fq_1']
Parameter values =[1, 2, 3, 4, 5]
ustacks -t fastq -f /media/dan/34D5D1CE642D7E36/2013076_Hanfling_Bernd/Stacks/Stacks_analyses_TRIMMED/Thesis_Incremental_runs/CRU_CYP_TESTDIR_1/HOLT9_RD-P1-180_1.fil.fq_1 -o /media/dan/34D5D1CE642D7E36/2013076_Hanfling_Bernd/Stacks/Stacks_analyses_TRIMMED/Thesis_Incremental_runs/CRU_CYP_TESTDIR_1/HOLT9_RD-P1-180_1.fi/max_locus_stacks_tests/max_locus_stacks_1 -i 1 -m 6 -M 3 -p 7 -r -d --max_locus_stacks 1
ustacks -t fastq -f /media/dan/34D5D1CE642D7E36/2013076_Hanfling_Bernd/Stacks/Stacks_analyses_TRIMMED/Thesis_Incremental_runs/CRU_CYP_TESTDIR_1/HOLT9_RD-P1-180_1.fil.fq_1 -o /media/dan/34D5D1CE642D7E36/2013076_Hanfling_Bernd/Stacks/Stacks_analyses_TRIMMED/Thesis_Incremental_runs/CRU_CYP_TESTDIR_1/HOLT9_RD-P1-180_1.fi/max_locus_

In [40]:
## Batch analysis
working_dir = "/media/dan/34D5D1CE642D7E36/2013076_Hanfling_Bernd/Stacks/Stacks_analyses_TRIMMED/Thesis_Incremental_runs/"

test_params = ["M", "m", "MS"]

analysis_settings = {}

for root, dirs, files in os.walk(working_dir):
    for dir in dirs:
        if "TESTDIR" in dir:  ## Loop over the test directories
            testdir = '%s%s/' %(root, dir)
            
            for parameter in test_params: ## loop over the paramters to test . . . 
                
                analysis_settings[dir] = IncreMental_U(parameter, 1, 8, 1, testdir, 'fastq', 7)

            
            
            


samples present ['HOLT9_RD-P1-180_1.fil.fq_1', 'RM31_RD-P1-171_1.fil.fq_1', 'RM8_RD-P1-175_1.fil.fq_1']
Parameter values =[1, 2, 3, 4, 5, 6, 7, 8]
ustacks -t fastq -f /media/dan/34D5D1CE642D7E36/2013076_Hanfling_Bernd/Stacks/Stacks_analyses_TRIMMED/Thesis_Incremental_runs/CRU_CYP_TESTDIR_1/HOLT9_RD-P1-180_1.fil.fq_1 -o /media/dan/34D5D1CE642D7E36/2013076_Hanfling_Bernd/Stacks/Stacks_analyses_TRIMMED/Thesis_Incremental_runs/CRU_CYP_TESTDIR_1/HOLT9_RD-P1-180_1.fi/M_tests/M_1 -i 1 -m 6 -M 1 -p 7 -r -d
ustacks -t fastq -f /media/dan/34D5D1CE642D7E36/2013076_Hanfling_Bernd/Stacks/Stacks_analyses_TRIMMED/Thesis_Incremental_runs/CRU_CYP_TESTDIR_1/HOLT9_RD-P1-180_1.fil.fq_1 -o /media/dan/34D5D1CE642D7E36/2013076_Hanfling_Bernd/Stacks/Stacks_analyses_TRIMMED/Thesis_Incremental_runs/CRU_CYP_TESTDIR_1/HOLT9_RD-P1-180_1.fi/M_tests/M_2 -i 1 -m 6 -M 2 -p 7 -r -d
ustacks -t fastq -f /media/dan/34D5D1CE642D7E36/2013076_Hanfling_Bernd/Stacks/Stacks_analyses_TRIMMED/Thesis_Incremental_runs/CRU_CYP_TESTD

### On the basis of the above results, decide on the Ustacks parameters to use here, and then run cstacks using these for use in Incremental_C

In [16]:
def Final_Ustacks_command(M_value, m_value, parent_dir_path):
       
## Define execute function ##
        
    def execute(bash_file):
        f = open(str(bash_file), 'r')
        script = f.read()
        subprocess.call(script, shell=True)
        print script
    
## Get files ##
    
    sample_names = []
    
    for i in os.listdir(parent_dir_path):
        if i.endswith(".fq_1") or i.endswith("1.fastq"): 
            sample_names.append(i)
    
    print ("Parameter values: M ="+str(M_value)+" m ="+str(m_value)+"\n")
    
## Write file ##

    f = open(str(parent_dir_path +"Final_Ustacks_commands.sh"), 'w') ## .. make a bash script file in the current ipython directory...
    f.write("#!/bin/bash\n\nmkdir "+ parent_dir_path + "Final_Ustacks_outputs/ \n") 
    
    Sample_ID = 1
    for sample in sample_names: ## so for each sample ...
        final_command_list = []
        f.write('ustacks -t fastq -M '+str(M_value)+' -m '+str(m_value)+' -p 8 -d -r -i '+str(Sample_ID)+' -f '+ str(parent_dir_path) + str(sample)+' -o '+ str(parent_dir_path) + 'Final_Ustacks_outputs/ \n')    
        Sample_ID += 1
        print (sample.split("_")[0]+" ID = "+str(Sample_ID))
    f.close()
    
## Execute the scripts
    execute(str(parent_dir_path +"Final_Ustacks_commands.sh"))

In [None]:
Final_Ustacks_command(8,8,'/media/dan/34D5D1CE642D7E36/2013076_Hanfling_Bernd/Stacks/Stacks_analyses_V3/Incremental_tests/Batch_2/all_cru/')

In [23]:
## batch final ustacks

for root, dirs, files in os.walk("/media/dan/34D5D1CE642D7E36/2013076_Hanfling_Bernd/Stacks/Stacks_analyses_TRIMMED/Thesis_Incremental_runs/"):
    for dir in dirs:
        if "TESTDIR" in dir and not "CRU_GIB_TESTDIR_3" in dir and not "CRU_CYP_TESTDIR_1" in dir:
            parent_dir = root+"/"+dir
            Final_Ustacks_command(4, 2, str(parent_dir+"/"))

Parameter values: M =4 m =2

EP01 ID = 2
OU01 ID = 3
RM32 ID = 4
#!/bin/bash

mkdir /media/dan/34D5D1CE642D7E36/2013076_Hanfling_Bernd/Stacks/Stacks_analyses_TRIMMED/Thesis_Incremental_runs//CRU_CYP_TESTDIR_2/Final_Ustacks_outputs/ 
ustacks -t fastq -M 4 -m 2 -p 8 -d -r -i 1 -f /media/dan/34D5D1CE642D7E36/2013076_Hanfling_Bernd/Stacks/Stacks_analyses_TRIMMED/Thesis_Incremental_runs//CRU_CYP_TESTDIR_2/EP01_1.fil.fq_1 -o /media/dan/34D5D1CE642D7E36/2013076_Hanfling_Bernd/Stacks/Stacks_analyses_TRIMMED/Thesis_Incremental_runs//CRU_CYP_TESTDIR_2/Final_Ustacks_outputs/ 
ustacks -t fastq -M 4 -m 2 -p 8 -d -r -i 2 -f /media/dan/34D5D1CE642D7E36/2013076_Hanfling_Bernd/Stacks/Stacks_analyses_TRIMMED/Thesis_Incremental_runs//CRU_CYP_TESTDIR_2/OU01_1.fil.fq_1 -o /media/dan/34D5D1CE642D7E36/2013076_Hanfling_Bernd/Stacks/Stacks_analyses_TRIMMED/Thesis_Incremental_runs//CRU_CYP_TESTDIR_2/Final_Ustacks_outputs/ 
ustacks -t fastq -M 4 -m 2 -p 8 -d -r -i 3 -f /media/dan/34D5D1CE642D7E36/2013076_Hanflin