In [110]:
import gzip
import os


def _iter_text_lines(path, compressed=False):
    """Yield the lines of ``path`` as native strings, closing the file afterwards.

    The file is read in binary mode (through ``gzip`` when ``compressed`` is
    true). Python 2 already yields ``str`` in that mode; on Python 3 each
    ``bytes`` line is decoded as UTF-8 so the substring tests used by the
    callers behave the same on both interpreters.
    """
    opener = gzip.open if compressed else open
    handle = opener(path, "rb")
    try:
        ## iterate lazily rather than readlines(): tags files can be large
        for raw_line in handle:
            if isinstance(raw_line, str):
                yield raw_line
            else:
                yield raw_line.decode("utf-8", "replace")
    finally:
        handle.close()


def _parse_ustacks_logs(logfile_path, logfile_suffix):
    """Collect per-sample read counts from every logfile ending in ``logfile_suffix``.

    Returns a tuple ``(data_dict, logfiles_processed)`` where ``data_dict``
    maps sample name -> dict with any of the keys ``total_reads``,
    ``initialised_reads`` and ``total_reads_used`` that were found.
    """
    data_dict = {}
    logfiles_processed = 0

    for logfile in sorted(os.listdir(logfile_path)):

        if not logfile.endswith(logfile_suffix):
            continue

        logfiles_processed += 1

        ## reset for each file so counts can never be attached to the sample
        ## of a previously read logfile
        sample_name = None

        for line in _iter_text_lines(os.path.join(logfile_path, logfile)):

            if line.startswith("Parsing"):
                ## sample name = basename of the parsed file, up to the first "."
                sample_name = line.strip().rpartition("/")[2].split(".")[0]
                data_dict[sample_name] = {}

            elif sample_name is None:
                ## nothing to attach counts to until a "Parsing" line is seen
                continue

            elif line.startswith("Loaded"):
                data_dict[sample_name]["total_reads"] = int(line.split()[1])

            elif "Inserted" in line:
                data_dict[sample_name]["initialised_reads"] = int(line.split()[1])

            elif "Number of utilized reads" in line:
                data_dict[sample_name]["total_reads_used"] = int(line.strip().rpartition(" ")[2])

    return data_dict, logfiles_processed


def _count_lumberjack_reads(tags_path):
    """Count primary/secondary reads that follow a flagged consensus line.

    A consensus line whose whitespace-split field 9 equals "1" switches
    counting on; any other consensus line switches it off.
    NOTE(review): field 9 is presumably the lumberjack-stack flag of the
    ustacks tags format -- confirm against the Stacks version in use.
    """
    lumberjack = False
    lumberjack_reads = 0

    for line in _iter_text_lines(tags_path, compressed=tags_path.endswith("gz")):

        if "consensus" in line:
            lumberjack = line.split()[9] == "1"

        elif lumberjack and ("primary" in line or "secondary" in line):
            lumberjack_reads += 1

    return lumberjack_reads


def Ustacks_summariser(logfile_path, logfile_suffix, ustacks_path):

    """
    Summarise ustacks log files and tags files into a tab-delimited table.

    Writes <logfile_path>/Ustacks_summary.out with one row per sample:

        Sample_name       (taken from the log "Parsing" line / tags file name)
        Total_reads       (reads loaded from the fastq file)
        Reads_used        (utilized reads minus lumberjack reads)
        Lumberjack_reads  (primary/secondary reads in flagged stacks)

    Rows are written in sorted sample order. A value that could not be
    found for a sample (e.g. a tags file without a matching log) is
    written as "NA".

    Arguments:

        logfile_path    directory containing the ustacks log files
        logfile_suffix  only files ending with this suffix are read as logs
        ustacks_path    directory containing the *.tags.tsv(.gz) files

    USAGE:

        Ustacks_summariser.py </full/path/to/dir/containing/logfiles> <logfile suffix> </full/path/to/dir/containing/ustacks.tags.tsv.gz files>

    """

    ## Get logfiles

    print("Processing logfiles . . .")

    data_dict, logfiles_processed = _parse_ustacks_logs(logfile_path, logfile_suffix)

    print("Processed %s logfiles" % logfiles_processed)

    ## Get ustacks files

    print("Processing ustacks files . . . ")

    ustacks_files_processed = 0

    for ustacks_file in sorted(os.listdir(ustacks_path)):

        if "tags.tsv" not in ustacks_file:
            continue

        sample_name = ustacks_file.split(".")[0]
        lumberjack_reads = _count_lumberjack_reads(os.path.join(ustacks_path, ustacks_file))

        ## setdefault: a tags file without a matching log must not raise KeyError
        data_dict.setdefault(sample_name, {})["lumberjack_reads"] = lumberjack_reads
        ustacks_files_processed += 1

    print("Processed %s Ustacks files" % ustacks_files_processed)

    ## Write the summary

    outpath = os.path.join(logfile_path, "Ustacks_summary.out")

    with open(outpath, 'w') as outfile:

        outfile.write("Sample_name\tTotal_reads\tReads_used\tLumberjack_reads\n")

        for sample in sorted(data_dict):

            sample_stats = data_dict[sample]
            reads_used = sample_stats.get("total_reads_used")
            lumberjack_reads = sample_stats.get("lumberjack_reads")

            ## usable reads are computed per sample, from that sample's own counts
            if reads_used is None or lumberjack_reads is None:
                usable_reads = "NA"
            else:
                usable_reads = reads_used - lumberjack_reads

            if lumberjack_reads is None:
                lumberjack_reads = "NA"

            outfile.write("%s\t%s\t%s\t%s\n" % (sample,
                                                sample_stats.get("total_reads", "NA"),
                                                usable_reads,
                                                lumberjack_reads))

    print("Finished, output is here: %s" % outpath)
    


In [111]:
## Example run: logs and ustacks outputs live in the same directory here.
log_path = "/home/djeffrie/Data/Merc_crosses/"    # directory searched for logfiles
ustackspath = "/home/djeffrie/Data/Merc_crosses"  # directory searched for *tags.tsv* files
suffix = "err"                                    # only files ending in this are read as logs
Ustacks_summariser(
    logfile_path=log_path,
    logfile_suffix=suffix,
    ustacks_path=ustackspath,
)


Processing logfiles . . .
Processed 1 logfiles
Processing ustacks files . . . 
Processed 1 Ustacks files
Finished, output is here: /home/djeffrie/Data/Merc_crosses//Ustacks_summary.out


In [None]:
### CLINE
##
## Command-line entry point, for when this notebook is run as the script
## named in the Ustacks_summariser usage string. Expects exactly three
## arguments: <logfile dir> <logfile suffix> <ustacks dir>.

import sys

if __name__ == "__main__":

    ## sys.argv[0] is the script name, so three user arguments give len == 4
    if len(sys.argv) < 4:
        print("\nERROR, not enough arguments")
        sys.exit(Ustacks_summariser.__doc__)

    elif len(sys.argv) > 4:
        print("\nERROR, too many arguments")
        sys.exit(Ustacks_summariser.__doc__)

    else:
        ## all three arguments are forwarded; the function has no defaults
        Ustacks_summariser(sys.argv[1], sys.argv[2], sys.argv[3])
    
