### This script finds Y- or W-specific tags, i.e. tags that are present only in the heterogametic sex

#### Inputs:
    1. catalog.tags file path
    2. Path to the directory containing the Ustacks outputs
    3. pop_map file path (This file should include sex information for each sample).
    4. Per-sex presence cutoff (a proportion, e.g. 0.75)

In [1]:
import sys
import os
import gzip
import linecache
import pprint as pp
import math


In [25]:
# Usage / argument-count check, followed by hard-coded inputs for troubleshooting

Usage_message = "\n##USAGE (args in this order):\nSL_snp_finder.py <path/to/catalog.tags.tsv> <path/to/Ustacks_outputs> <path/to/pop_map.txt> <per_sex_presence_cutoff>\n\n##All paths should be absolute, not relative\n\n"

## sys.argv[0] is the script itself, so four user arguments means len(sys.argv) == 5
n_args = len(sys.argv)

if n_args == 1: ## called with nothing at all: just show the usage text
    sys.exit(Usage_message)

if n_args < 5: ## called with some, but not all, of the arguments
    sys.exit("\n##Error, not enough arguments\n"+Usage_message)

### Hard-coded inputs for troubleshooting (these are used here rather than the command line values)

U_outs_path = "/home/djeffrie/Data/Pperezi/Stacks_outs/"
catalog_tags_file = "/home/djeffrie/Data/Pperezi/Stacks_outs/batch_1.catalog.tags.tsv.gz"
popmappath = "/home/djeffrie/Data/Pperezi/Stacks_outs/pop_map_kept.txt"
sex_presence_thresh = 0.5

## The log file is written next to the catalog file
log_path = "%s/%s" % (catalog_tags_file.rpartition("/")[0], "Presence_absence.log")
log = open(log_path, 'w')
log.write(
    "Input parameters:\n\nCatalog file: %s\nUstacks ouptuts: %s\nPop_map_file: %s\nMinimum percentage of M of F present: %1.2f\n"
    % (catalog_tags_file, U_outs_path, popmappath, sex_presence_thresh)
)


In [47]:
## Take the four inputs from the command line, in the order given in the usage message
cline_args = sys.argv

catalog_tags_file = cline_args[1]
U_outs_path = cline_args[2]
popmappath = cline_args[3]
sex_presence_thresh = float(cline_args[4]) ## the cutoff arrives as text and must be numeric

## Open the log file (in the same directory as the catalog file) and record the parameters
log_directory = catalog_tags_file.rpartition("/")[0]
log_path = "%s/%s" % (log_directory, "Presence_absence.log")
log = open(log_path, 'w')

run_parameters = (catalog_tags_file, U_outs_path, popmappath, sex_presence_thresh)
log.write("Input parameters:\n\nCatalog file: %s\nUstacks ouptuts: %s\nPop_map_file: %s\nMinimum percentage of M of F present: %1.2f\n\n" % run_parameters)


ValueError: could not convert string to float: /home/djeffrie/.ipython/profile_default

## ipynb input params
## Hard-coded inputs for running the notebook interactively. They set the same
## four names that are otherwise filled in from the command line arguments.
catalog_tags_file = "/home/djeffrie/Data/Caspers_data/Stacks_final_outs/batch_1.catalog.tags.tsv.gz"  ## catalog tags file; the log and fasta outputs are written to its directory
U_outs_path = "/home/djeffrie/Data/Caspers_data/Stacks_final_outs/"  ## directory that is walked to find the per-sample tags files
popmappath = "/home/djeffrie/Data/Caspers_data/populations/r_075_allsamples/popmap.txt"  ## pop_map: sample name in column 1, group label in column 2
sex_presence_thresh = 0.75  ## proportion used to derive the minimum number of samples per group


In [25]:

## open log file and write params
## (the log is deliberately left open: it is written to again, and closed, further down)
log = open("%s/%s" % (catalog_tags_file.rpartition("/")[0], "Presence_absence.log"), 'w')
log.write("Input parameters:\n\nCatalog file: %s\nUstacks ouptuts: %s\nPop_map_file: %s\nMinimum percentage of M of F present: %1.2f\n\n" % (catalog_tags_file, U_outs_path, popmappath, sex_presence_thresh))


## First get the Stacks sample ID and the group label ("parth" or "sex") for each sample
## Include only samples in the pop_map file
##
## Results of this cell:
##   sample_dict        sample name -> [Stacks sample ID, group label]
##   sex_dict           "Parths" / "Sexuals" -> list of Stacks sample IDs in that group
##   parth_count        number of tags files found for samples labelled "parth"
##   sex_count          number of tags files found for samples labelled "sex"
##
## NOTE: single-argument print(...) is used below; it prints exactly the same
## thing as the Python 2 print statement used elsewhere in this notebook.

with open(popmappath, 'r') as popmap_handle: ## make sure the pop_map handle is closed
    sex_file = popmap_handle.readlines()

kept_sample_names = []
popmap_labels = {} ## sample name -> label in pop_map column 2 (None if the column is missing)

for line in sex_file:
    fields = line.split()
    if not fields: ## blank line in the pop_map, nothing to record
        continue
    name = fields[0].split(".")[0] ## drop anything after the first "." in the sample name
    print(name)
    kept_sample_names.append(name)
    ## If a sample is listed more than once, the last line wins
    popmap_labels[name] = fields[1] if len(fields) > 1 else None

sample_dict = {}
sex_dict = {"Parths": [], "Sexuals": []}

parth_count = 0
sex_count = 0

for root, dirs, files in os.walk(U_outs_path):
    for fil in files:

        ## Only per-sample tags files are of interest, not the catalog
        if "tags.tsv" not in fil or "catalog" in fil:
            continue

        sample_name = fil.split(".")[0]
        print(sample_name)

        if sample_name not in popmap_labels:
            log.write("Sample %s in stacks outputs but not in pop_map file\n" % (sample_name))
            continue

        ## The Stacks sample ID is the 2nd field of the 2nd line of the tags file.
        ## Gzipped and plain files are read the same way, and always closed afterwards.
        opener = gzip.open if fil.endswith("gz") else open
        with opener("%s/%s" % (root, fil), 'r') as tagsfile:
            tagsfile.readline() ## pass over first line
            ID = tagsfile.readline().split()[1]

        sample_dict[sample_name] = [ID]
        sex_sys = popmap_labels[sample_name]

        if sex_sys == "parth":
            sample_dict[sample_name].append(sex_sys)
            sex_dict["Parths"].append(ID)
            parth_count += 1
        elif sex_sys == "sex":
            sample_dict[sample_name].append(sex_sys)
            sex_dict["Sexuals"].append(ID)
            sex_count += 1
        else:
            ## Previously these samples were dropped from both groups without any trace
            log.write("Sample %s has pop_map label '%s' (expected 'parth' or 'sex'), so it was not counted\n" % (sample_name, sex_sys))


log.write("\n\n############################\n\nLOG:\n\n# 'BAD' tag status means that one or more samples had two or more tags at the same \ncatalog locus, thus, this locus was not used in analyses\n\n")
log.write("Tag_status\tTag_ID\tN_parths\tN_sexuals\tParth_linked\tSexual_linked\n") ## headers for log info

print(sex_dict)

C200_P
C202_P
C204_S
C206_S
C209_P
C210_S
C212_S
C213_S
C218A_S
C218B_S
C220_S
C222_S
C223_P
C224_P
C225_S
C226_P
C227_P
C228_S
C232_P
C233_S
C235_P
C236_S
C237_S
C239_S
C240_S
C242_S
C243_S
C246_P
C247_P
C248_S
C249_S
C250_S
C254_S
C256_P
C257_S
C259_S
C262_P
C266_P
C268_P
C271_S
C272_S
C276_S
C277_P
C279_P
C281_P
C282_P
C284_P
C288_S
C291_S
C292_S
C293_P
C294_S
C295_S
C297_S
C299_P
C400_S
C401_P
C403_S
C406_S
C408_S
C410_S
C412_S
C415_S
C416_S
C422_S
C424_P
C425_S
C428_P
C429_P
C432_S
C434_S
C437_S
C441_P
C455_P
C463_P
C480_P
C491_P
C495_P
C500_S
C514_S
C515_S
C517_P
C518_P
C401_P
C288_S
C259_S
C228_S
C415_S
C463_P
C480_P
C518_P
C248_S
C210_S
C256_P
C424_P
C437_S
C277_P
{'Parths': ['3', '12', '9', '7', '11', '14', '2'], 'Sexuals': ['6', '4', '1', '5', '13', '8', '10']}


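#### Now look through each tag in the catalog file:
    find tags which are present in the required proportion (sex_presence_thresh) of males and in no females (Y linked)
    find tags which are present in the required proportion (sex_presence_thresh) of females and in no males (W linked)
    (in the code below the two groups are the samples labelled "parth" and "sex" in the pop_map file)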

In [26]:
## Scan every tag (locus) in the catalog and keep those that are carried by enough
## samples of one group (parthenogens or sexuals) and by no sample of the other group.
##
## Outputs: one line per tag in the log file, a fasta file per group (only written
## if that group has at least one putative linked tag) and a summary in the log and on STDOUT.
##
## NOTE: single-argument print(...) is used below; it prints exactly the same
## thing as the Python 2 print statement used elsewhere in this notebook.

## Read the catalog (gzipped or plain) and close the handle straight away
catalog_opener = gzip.open if catalog_tags_file.endswith("gz") else open
with catalog_opener(catalog_tags_file, 'r') as cat_handle:
    cat_file = cat_handle.readlines()

## Minimum number of samples of a group that must carry a tag
n_parths_required = math.ceil(parth_count*sex_presence_thresh) ## math.ceil rounds up to nearest whole individual
n_sexuals_required = math.ceil(sex_count*sex_presence_thresh)

print(n_parths_required)
print(n_sexuals_required)

putative_Parth_linked_tags = []
putative_Sex_linked_tags = []

## Sets make the per-sample membership tests in the loop below constant time
parth_IDs = set(sex_dict["Parths"])
sexual_IDs = set(sex_dict["Sexuals"])

for tag in cat_file[1:]: ## the first line of the catalog is passed over

    fields = tag.split() ## split once, reuse for every column needed

    if not fields or tag.startswith("#"): ## blank or comment line, not a tag
        continue

    ## First get all the info I need

    N_parths_at_locus = 0
    N_sexuals_at_locus = 0
    duplicates_present = False ## Checking for replicate samples (Often a tag can be present in the same individual twice, need to remove these)
    samples_at_locus = set() ## for checking for replicate samples

    tag_ID = fields[2]
    samples_field = fields[7] ## the field in the catalog file that contains the samples present at that locus

    Parth_linked_str = "NO" ## defaults for log file
    Sex_linked_str = "NO"
    tag_status = "BAD"

    for sample in samples_field.split(","):

        sample_ID = sample.split("_")[0] ## entries look like <sampleID>_<something>; only the sample ID is used

        if sample_ID in samples_at_locus: ## same sample seen twice at this tag
            duplicates_present = True
        elif sample_ID in parth_IDs:
            N_parths_at_locus += 1
            samples_at_locus.add(sample_ID)
        elif sample_ID in sexual_IDs:
            N_sexuals_at_locus += 1
            samples_at_locus.add(sample_ID)
        ## samples in neither group (not kept in the pop_map) are ignored

    if not duplicates_present: ## If there are no sample duplicates at the tag

        tag_status = "OK"

        ## The required numbers are minimums, so reaching them exactly is enough (>=).
        ## With a strict ">" a cutoff of 1.0 could never be met, and e.g. 0.75 of 7
        ## samples demanded all 7. The "> 0" guard keeps tags carried by nobody from
        ## passing when a required number is 0.

        ## Look for parthenogen linked tags (absent from all sexuals)

        if N_sexuals_at_locus == 0 and N_parths_at_locus > 0 and N_parths_at_locus >= n_parths_required:
            seq = fields[8]
            putative_Parth_linked_tags.append((tag_ID, N_sexuals_at_locus, N_parths_at_locus, seq))
            Parth_linked_str = "YES"

        ## Look for sexual linked tags (absent from all parthenogens)

        elif N_parths_at_locus == 0 and N_sexuals_at_locus > 0 and N_sexuals_at_locus >= n_sexuals_required:
            seq = fields[8]
            putative_Sex_linked_tags.append((tag_ID, N_sexuals_at_locus, N_parths_at_locus, seq))
            Sex_linked_str = "YES"

    log.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (tag_status, tag_ID, N_parths_at_locus, N_sexuals_at_locus, Parth_linked_str, Sex_linked_str))


## Write the putative linked tags as fasta, next to the catalog file

if len(putative_Parth_linked_tags) > 0:
    with open("%s/%s" % (catalog_tags_file.rpartition("/")[0], "Parth_linked_tags.fa"), 'w') as Parth_fa:
        for line in putative_Parth_linked_tags:
            Parth_fa.write(">Parth_linked|LocusID_%s|N_Sexuals_%s|N_Parthenogens_%s\n%s\n" % (line[0], line[1], line[2], line[3]))

if len(putative_Sex_linked_tags) > 0:
    with open("%s/%s" % (catalog_tags_file.rpartition("/")[0], "Sex_linked_tags.fa"), 'w') as Sexuals_fa:
        for line in putative_Sex_linked_tags:
            Sexuals_fa.write(">Sexual_linked|LocusID_%s|N_Sexuals_%s|N_Parthenogens_%s\n%s\n" % (line[0], line[1], line[2], line[3]))

## Write log file summary

log.write("\nSUMMARY:\nNumber of sexuals: %s\n" % (sex_count))
log.write("Number of parthenogens: %s\n" % (parth_count))
log.write("Number of Putative Parthenogenesis linked tags: %s\n" % (len(putative_Parth_linked_tags)))
log.write("Number of Putative Sexuality linked tags: %s\n" % (len(putative_Sex_linked_tags)))

## Print summary to STDOUT

print("\nSUMMARY:\nNumber of sexuals: %s\n" % (sex_count))
print("Number of parthenogens: %s" % (parth_count))
print("Number of Putative Parthenogenesis linked tags: %s" % (len(putative_Parth_linked_tags)))
print("Number of Putative Sexuality linked tags: %s" % (len(putative_Sex_linked_tags)))

log.close()
print("\n ### DONE! ###\n")
    

6.0
6.0

SUMMARY:
Number of sexuals: 7

Number of parthenogens: 7
Number of Putative Parthenogenesis linked tags: 0
Number of Putative Sexuality linked tags: 0

 ### DONE! ###

