In [1]:
from __future__ import division
import vcf
import matplotlib
#matplotlib.use('Agg') ## this allows the drawing of plots in matplotlib on the cluster, which doesn't use the X-server backend. This has something to do with display (but I don't know what)
import matplotlib.pyplot as plt
import numpy as np
import os.path
import sys
import time
import gzip



### This script identifies sex linked markers from a VCF file using the criteria of female-male allele freq 

####Workflow:
    1. Filters loci that are present in the user-specified number of samples
    2. Calculates the allele frequencies for males and females separately
    3. Subtracts male from female frequencies and filter loci that show signs of X or Z linkage
    4. Outputs all male and female frequencies and female-male outputs to a single file called "yourinput.vcf.all_frequencies.tsv" (where yourinput = the name and path of your vcf file). Loci identified as X or Z linked are labelled as such in this file.
    5. Outputs all putative X or Z linked markers to separate fasta files if any are identified.
    6. Outputs a histogram of the distribution of female-male frequencies called "yourinput.vcf.fem-male_freqs.pdf"
    7. All suplus information is recorded to a log file, with a summary at the end of this file. 


In [12]:
femREF_freq = 0.95
male_heterozygosity = 0.61
lower_thresh = 0.4
upper_thresh = 0.6

if all([femREF_freq >= 0.95, male_heterozygosity >= lower_thresh, male_heterozygosity <= upper_thresh]):
    print "YES"
else:
    print "NO"

NO


In [127]:
def SL_snp_finder2(myvcfpath, popmappath, catalog_tags_file, homogamtic_homozygosity_threshold, heterogametic_heterozygosity_threshold, sample_presence_cutoff, coverage_threshold, maf_threshold):
    
   
    """
    This script identifies sex linked markers from a VCF file using the criteria of female-male allele freq 

    Arguments:
    
    myvcfpath                            - path to vcf file (note this will be altered to make header compatible with Pyvcf. 
                                           New vcf will have same name with ".altered" appended to the end)
    
    popmappath                           - path to population map file containing sex information. Same format as Stacks pop map file.
    
    catalog_tags_file                    - The catalog tags file used to create the vcf
    
    homogamtic_homozygosity_threshold    - The lower threshold for the proportion of homozygotes in the homogametic sex at a locus
    
    homogamtic_homozygosity_threshold    - The lower threshold for the proportion of heterozygotes in the heterogametic sex at a locus
        
    sample_presence_cutoff               - a locus must be called in at least this proportion of all samples (not within populations) to be considered
    
    coverage_threshold                   - a locus must have at least this threshold in a sample to be considered for that sample. Note that loci below this 
                                           threshold will be removed from a sample, and this can push the locus below the sample presence cut-off, which will
                                           then remove the locus.
    
    maf_threshold                        - minor allele frequency cutoff for a locus across all samples. 
    
    
    Workflow: ### NOTE*** different to SL_snp_finder()
    
    1. Filters loci that are present in the user-specified number of samples
    2. Calculates the allele frequencies for male-s and females separately
    3. Looks for loci that are heterozygous in all or most of the homogametic sex and homozygous in all or most
       of the heterogametic sex
    4. Outputs all male and female frequencies and female-male outputs to a single file called 
       "yourinput.vcf.all_frequencies.tsv" (where yourinput = the name and path of your vcf file). Loci 
       identified as X or Z linked are labelled as such in this file.
    5. Outputs all putative X or Z linked markers to separate fasta files if any are identified.
    6. Outputs a histogram of the distribution of female-male frequencies called "yourinput.vcf.fem-male_freqs.pdf"
    7. All suplus information is recorded to a log file, with a summary at the end of this file.
    
    
    """
    
    import vcf
    import matplotlib
    #matplotlib.use('Agg') ## this allows the drawing of plots in matplotlib on the cluster, which doesn't use the X-server backend. This has something to do with display (but I don't know what)
    import matplotlib.pyplot as plt
    import numpy as np
    import os.path
    import sys
    import time
    import gzip
    
    
    ## set the window around the freq threshold. The window automatically tightens and relaxes around 0.5 or -0.5 




    # First thing to do is alter the metadata in the vcf outputted by stacks 1.30. I am not sure if it is stacks or pyvcf that is wrong, but stacks encodes the data in the allele depth field as an interger, while pyvcf expects a float. Changing the metadata line in the vcf header to contain "Number=." instead of "Number=1" fixes the issue.

    myvcf = open(myvcfpath, 'r').readlines()
    
    alteredvcfpath = "%s%s" %(myvcfpath, ".altered")
    alteredvcf = open(alteredvcfpath, 'w')

    for line in myvcf:
        if "Allele Depth" not in line:
            alteredvcf.write(line)
        elif "Allele Depth" in line:
            line = '##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allele Depth">\n'
            alteredvcf.write(line)
    alteredvcf.close()


    # ### Now calculate allele frequencies for males and females at each SNP
    # ####Requires:
    #     1. pyvcf module installed (can use pip, remember to add to python path. This is on the cluster!
    #     2. altered vcf file from above
    #     3. pop_map.txt file. Same format as used for stacks. Sample names must be the same. And males and females must be denoted by M or F (case sensitive) respectively. Must be the same file as used in populations to creat the VCF. If there are additional samples in this file the allele frequencies will be wrong!

    vcf_reader = vcf.Reader(open(alteredvcfpath, 'r')) ## load in altered vcf file


    ## Open file for all frequency and locus info for all kept genotypes
    all_frequencies = open("%s%s" %(myvcfpath, ".all_frequencies.tsv"), 'w')
    all_frequencies.write("#Locus_ID\tPOS\tN_MaleChroms\tN_FemChroms\tMalefreq_REF\tMalefreq_ALT\tFemalefreq_REF\tFemalefreq_ALT\tFemREF-MaleREF\tSex_linked\n")

    ## Open files to output X or Z linked loci to and add headers

    Putative_Xlinked_makers = []

    Putative_Zlinked_makers = []

    ## Make a log file for all auxillary info
    freq_ratios_log = open("%s%s" %(myvcfpath, ".freq_ratios.log"), 'w')
    freq_ratios_log.write("Script run on %s\n " % (time.strftime("%c")))

    ## Make list to keep frequency information in for histogram
    freq_ratio_data = []

    ## get male/female info for samples for working out frequencies below38      3460    50      26      0.940   0.060   0.846   0.154   -0.094  NotSexLinked

    pop_map = open(popmappath, 'r').readlines()

    sample_dict = {}
    sample_dict["females"] = []
    sample_dict["males"] = []
    fem_samples = 0
    male_samples = 0
    sample_counter = 0
    low_maf_counter = 0
    kept_loci = 0
    sample_missing_dict = {}
    sample_cov_dict = {}
    sample_cov_kept_dict = {}

    for sample in pop_map:
        name = sample.split()[0]
        sex = sample.strip().split()[1]
        sample_missing_dict[name] = 0

        #sample_counter += 1
        if sex == "F" or sex == "f" or sex == "Female" or sex == "female" or sex == "Fem" or sex == "fem":
            sample_dict["females"].append(name)
            fem_samples += 1
        elif sex == "M" or sex == "m" or sex == "Male" or sex == "male" or sex == "Mal" or sex == "mal":
            sample_dict["males"].append(name)
            male_samples += 1

    #print sample_dict        
    locus_dict = {}


    ## Find number of loci in input file
    locus_counter = 0


    for record in vcf_reader:
        locus_counter += 1
    print "Number of loci = %s" % (locus_counter)

    ## Write some general stats and input options to the log file

    freq_ratios_log.write("\n## User specified options:\n")
    freq_ratios_log.write("Input vcf: %s\n" % (myvcfpath))
    freq_ratios_log.write("pop_map file used: %s\n" % (popmappath))
    freq_ratios_log.write("Catalog file used: %s\n" % (catalog_tags_file))
    #freq_ratios_log.write("Specified female-male threshold: %s\n" % (X_or_Z_freq_threshold))
    #freq_ratios_log.write("Female-male thresh range: (+/-) %s-%s\n" % (lower_thresh, upper_thresh))
    freq_ratios_log.write("Min percentage samples present: %s\n" % (sample_presence_cutoff))
    freq_ratios_log.write("Min coverage per genotype: %s\n" % (coverage_threshold))
    freq_ratios_log.write("Min maf per locus: %s\n" % (maf_threshold))
    freq_ratios_log.write("Number of female samples = %s\n" % (fem_samples))
    freq_ratios_log.write("Number of male samples = %s\n" % (male_samples))
    freq_ratios_log.write("Number of loci = %s\n" % (locus_counter))


    ## Some quick counters

    numb_putative_Xlinked = 0
    numb_putative_Zlinked = 0
    low_data_loci = 0

    vcf_reader = vcf.Reader(open(alteredvcfpath, 'r'))

    info_rec = vcf_reader.next()

    for sample in info_rec:
        name = sample.sample
        print name
        sample_cov_dict[name] = []
        sample_cov_kept_dict[name] = []


    for record in vcf_reader:
        femREF_count = 0    ## set the counters for the reference and alternative allele (encoded as 0 in the vcf)
        femALT_count = 0
        malREF_count = 0
        malALT_count = 0
        het_males_count = 0 ## NEW
        het_females_count = 0 ## NEW
        hom_males_count = 0 ## NEW
        hom_females_count = 0 ## NEW    
        fem_none_count = 0
        male_none_count = 0
        low_cov_samples = 0
        n_genotypes = 0
        male_genotypes = 0
        fem_genotypes = 0
        number_of_samples = len(record.samples) ## this will change at each locus no???


        for sample in record.samples:

            name = sample.sample  
            sample_cov_dict[name].append(sample['DP'])


        if record.aaf[0] < maf_threshold: ## if locus has minor allele freq lower than specified threshold then skip it
            low_maf_counter += 1
            freq_ratios_log.write("\n#LOCUS_ID: %s, Locus_POS: %s\n\n" %(record.ID, record.POS)) 
            freq_ratios_log.write("Minor allele frequence of locus is lower than specified cutoff (%s)\n" % (record.aaf[0]))


        elif record.aaf >= maf_threshold:

            for sample in record.samples: 

                name = sample.sample

                ### For each sample, if the coverage is too low, remove the genotype for that individual.


                if sample['DP'] < coverage_threshold:
                    genotype = None
                    low_cov_samples += 1
                    #freq_ratios_log.write("Sample %s thrown out due to low coverage (%s)\n" % (name, sample['DP']))

                elif sample['DP'] >= coverage_threshold:
                    genotype = sample['GT']
                    sample_cov_kept_dict[name].append(sample['DP'])
                ## Now calculate the female and male frequencies separately

                if name in sample_dict["females"]: 
                    #print "Locus=", record.ID, "Female=", sample.sample, "depth=", sample['DP'], "Orig_GT=", sample['GT'], "assignedGT=", genotype                    
                    #print "FEMALE Locus=", record.ID, "n_genotypes", n_genotypes
                    if genotype == None: ## if no genotype exists
                        sample_missing_dict[name] += 1
                        
                    elif genotype == "0/0":
                        femREF_count += 2
                        n_genotypes +=1
                        fem_genotypes +=1
                        hom_females_count += 1 ## new
                    elif genotype == "0/1":
                        het_females_count += 1 ## new
                        femREF_count += 1
                        femALT_count += 1
                        n_genotypes +=1
                        fem_genotypes +=1
                    elif genotype == "1/0":
                        het_females_count += 1 ## new
                        femREF_count += 1
                        femALT_count += 1
                        n_genotypes +=1
                        fem_genotypes +=1
                    elif genotype == "1/1":
                        femALT_count += 2
                        n_genotypes +=1
                        fem_genotypes +=1
                        hom_females_count += 1 ## new
                    #print "\tN_REF=", femREF_count, "N_ALT=", femALT_count, "Nnone=", fem_none_count
                elif name in sample_dict["males"]:
                    #print "Locus=", record.ID, "Male=", sample.sample, "depth=", sample['DP'], "Orig_GT=", sample['GT'], "assignedGT=", genotype                    
                    #print "MALE Locus=", record.ID, "n_genotypes", n_genotypes
                    if genotype == None: ## if no genotype exists
                        sample_missing_dict[name] += 1
                        
                    elif genotype == "0/0":
                        malREF_count += 2
                        n_genotypes +=1
                        male_genotypes +=1
                        hom_males_count += 1 ## new
                    elif genotype == "0/1":
                        het_males_count += 1 ## new
                        malREF_count += 1
                        malALT_count += 1
                        n_genotypes +=1
                        male_genotypes +=1
                    elif genotype == "1/0":
                        het_males_count += 1
                        malREF_count += 1
                        malALT_count += 1
                        n_genotypes +=1
                        male_genotypes +=1 ## new
                    elif genotype == "1/1":
                        malALT_count += 2
                        n_genotypes +=1
                        male_genotypes +=1
                        hom_males_count += 1 ## new
                #else:
                    #print "\n##SAMPLE NAME NOT IN POP_MAP.TXT: Sample = %s" % (name)
                    #print "\tN_REF=", malREF_count, "N_ALT=", malALT_count, "Nnone=", male_none_count

            ## Filter loci that have too many missing samples, including samples thrown out due to low coverage!
            samples_at_locus = n_genotypes
            chromosomes_at_locus = n_genotypes*2
            percent_samples_present = n_genotypes/number_of_samples


            if percent_samples_present >= sample_presence_cutoff:
                
                ## Calculate frequencies per sex

                femREF_freq = femREF_count/(fem_genotypes*2)
                femALT_freq = femALT_count/(fem_genotypes*2)

                maleREF_freq = malREF_count/(male_genotypes*2)
                maleALT_freq = malALT_count/(male_genotypes*2)
                
                male_homozygosity = hom_males_count/male_genotypes
                female_homozygosity = hom_females_count/fem_genotypes
                
                male_heterozygosity = het_males_count/male_genotypes
                female_heterozygosity = het_females_count/fem_genotypes
                
                
                ## check freqs add up
                
                if not all([(femREF_freq) + (femALT_freq) == 1, (maleREF_freq) + (maleALT_freq) == 1, male_homozygosity + male_heterozygosity == 1, female_homozygosity + female_heterozygosity == 1]):
                    freq_ratios_log.write("\n******ERROR, summed frequencies do not add up to 1******\n")
                else:
                    freq_ratios_log.write("Summed frequencies are OK!")
                    kept_loci += 1


                    ## Output female stats
                    freq_ratios_log.write("\n#LOCUS_ID: %s_%s\n\n" %(record.ID, record.POS))   
                    freq_ratios_log.write("Number of female genotypes for this locus = %s\n" %(fem_genotypes))
                    freq_ratios_log.write("Female reference count = %s\n" % (femREF_count))
                    freq_ratios_log.write("Female alternative count = %s\n" % (femALT_count))
                    freq_ratios_log.write("Female reference frequency = %.3f\n" % (femREF_freq))
                    freq_ratios_log.write("Female alternative frequency = %.3f\n" % (femALT_freq))
                    freq_ratios_log.write("Female homozygosity = %.3f\n" % (female_homozygosity))
                    freq_ratios_log.write("Female heterozygosity = %.3f\n" % (female_heterozygosity))



                    ## Output male stats
                    freq_ratios_log.write("Number of male genotypes for this locus = %s\n" %(male_genotypes))
                    freq_ratios_log.write("Male reference count = %s\n" % (malREF_count))
                    freq_ratios_log.write("Male alternative count = %s\n" % (malALT_count))
                    freq_ratios_log.write("Male reference frequency = %.3f\n" % (maleREF_freq))
                    freq_ratios_log.write("Male alternative frequency = %.3f\n" % (maleALT_freq))
                    freq_ratios_log.write("Male homozygosity = %.3f\n" % (male_homozygosity))
                    freq_ratios_log.write("Male heterozygosity = %.3f\n" % (male_heterozygosity))


                    locus_dict[record.ID] = {}
                    locus_dict[record.ID]["female_freqs"] = ["%.3f" % (femREF_freq), "%.3f" % (femALT_freq)]
                    locus_dict[record.ID]["male_freqs"] = ["%.3f" % (maleREF_freq), "%.3f" % (maleALT_freq)]


                    ####### ==============================================================================================
                    ### So now I have the allele frequencies and heterozygosities for males and females - this is the new bit. 
                    ### Not using freq ratio any more . . . . 


                    freq_ratio = femREF_freq - maleREF_freq
                    #print freq_ratio
                    freq_ratio_data.append(freq_ratio)


                    ## Write files for X or Z linked loci

                    if all([female_homozygosity >= homogamtic_homozygosity_threshold, male_heterozygosity >= heterogametic_heterozygosity_threshold]):
                        linked_status = "Xlinked"
                        freq_ratios_log.write("Locus %s DOES FIT X linked criteria <------------------------\n" % (record.ID))
                        Putative_Xlinked_makers.append("%s" % (record.ID))
                        numb_putative_Xlinked += 1

                    elif all([male_homozygosity >= homogamtic_homozygosity_threshold, female_heterozygosity >= heterogametic_heterozygosity_threshold]):  ## for Z linked
                        linked_status = "Zlinked"
                        freq_ratios_log.write("Locus %s DOES FIT Z linked criteria <------------------------\n" % (record.ID))
                        Putative_Zlinked_makers.append("%s" % (record.ID))
                        numb_putative_Zlinked += 1
                    else:
                        freq_ratios_log.write("Locus %s does not fit X or Z linked criteria\n" % (record.ID))
                        linked_status ="NotSexLinked"

                    ## Write the main info file for male and female frequencies, ratios etc.

                    all_frequencies.write("%s\t%s\t%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%s\n" % (record.ID, record.POS, male_genotypes*2, fem_genotypes*2, maleREF_freq, maleALT_freq, femREF_freq, femALT_freq, freq_ratio, linked_status))


            elif percent_samples_present < sample_presence_cutoff:  ## If not enough samples at a locus then log it but don't use for female-male calculations
                freq_ratios_log.write("\n#LOCUS_ID: %s, Locus_POS: %s\n\n" %(record.ID, record.POS)) 
                freq_ratios_log.write("Number of samples at locus is lower than sample presence cutoff (%s)\n" % (samples_at_locus))
                low_data_loci += 1

    ## Look at coverage...



    ## Print some quick summary stats - also written at the end of the log file
    print "Number of samples =", number_of_samples
    print "Number of loci with too few samples = %s" % (low_data_loci)
    print "Number of loci with low MAF = %s" % (low_maf_counter)
    print "Number of loci with enough data = %s" % (kept_loci)
    print "Number of putative X linked snps = %s" % (numb_putative_Xlinked)
    print "Number of putative X linked tags = %s" % (len(set(Putative_Xlinked_makers)))
    print "Number of putative Z linked markers = %s" % (numb_putative_Zlinked)
    print "Number of putative Z linked tags = %s" % (len(set(Putative_Zlinked_makers)))

    freq_ratios_log.write("\nSUMMARY....\n\n")
    freq_ratios_log.write("Number of loci with too few samples = %s\n" % (low_data_loci))
    freq_ratios_log.write("Number of loci with enough data = %s\n" % (kept_loci))
    freq_ratios_log.write("Number of loci with low MAF = %s\n" % (low_maf_counter))
    freq_ratios_log.write("Number of putative X linked snps = %s\n" % (numb_putative_Xlinked))
    freq_ratios_log.write("Number of putative X linked tags = %s\n" % (len(set(Putative_Xlinked_makers))))
    freq_ratios_log.write("Number of putative Z linked snps = %s\n" % (numb_putative_Zlinked))
    freq_ratios_log.write("Number of putative Z linked tags = %s\n" % (len(set(Putative_Zlinked_makers))))



    ## plot histogram of missing data per sample

    plt.bar(range(len(sample_missing_dict)), sample_missing_dict.values(), align='center', color = '0.5')
    plt.xticks(range(len(sample_missing_dict)), sample_missing_dict.keys(), rotation = 90, fontsize=8)
    plt.ylabel("Number of loci with missing data", fontsize=8)
    plt.savefig("%s%s" %(myvcfpath, ".missing_data_by_sample.pdf"), format = 'pdf')
    plt.close()
    #plt.show()

    ## Plot coverage per sample

    n_groups = len(sample_cov_dict)

    Xlabs = []
    all_means = []
    all_std = []

    kept_means = []
    kept_std = []

    for sample in sorted(sample_cov_dict.keys()):
        Xlabs.append(sample)
        all_means.append(np.mean(sample_cov_dict[sample]))
        all_std.append(np.std(sample_cov_dict[sample]))
    for sample in sample_cov_kept_dict.keys():
        kept_means.append(np.mean(sample_cov_kept_dict[sample]))
        kept_std.append(np.std(sample_cov_kept_dict[sample]))

    fig, ax = plt.subplots()

    index = np.arange(n_groups)
    bar_width = 0.35

    opacity = 0.4
    error_config = {'ecolor': '0.3'}

    rects1 = plt.bar(index, all_means, bar_width,
                     alpha=opacity,
                     color='0.5',
                     #yerr=all_std,
                     error_kw=error_config,
                     label='All')

    rects2 = plt.bar(index + bar_width, kept_means, bar_width,
                     alpha=opacity,
                     color='0',
                     #yerr=kept_std,
                     error_kw=error_config,
                     label='Kept')

    plt.xlabel('Sample')
    plt.ylabel('Mean coverage')
    plt.title('Average coverage per sample')
    plt.xticks(index + bar_width, (Xlabs), rotation = 90, size = 8)
    plt.legend()

    plt.tight_layout()
    plt.savefig("%s%s" %(myvcfpath, ".coverage_by_sample.pdf"), format = 'pdf')




    ## Write fasta files of putative X or Z linked loci if there are any

    Putative_Xlinked_makers = set(Putative_Xlinked_makers)
    Putative_Zlinked_makers = set(Putative_Zlinked_makers)

    if catalog_tags_file.endswith("gz"):
        catalog = gzip.open(catalog_tags_file, 'r').readlines()
    else:
        catalog = open(catalog_tags_file, 'r').readlines()

    if numb_putative_Xlinked > 0:
        Putative_Xlinked_makers_file = open("%s%s" %(myvcfpath, ".Putative_Xlinked_makers.fa"), 'w')

        for locus in Putative_Xlinked_makers:
            for tag in catalog:
                if locus == tag.split()[2]:
                    Putative_Xlinked_makers_file.write(">X_linkedLocusID_%s\n" % (locus))
                    Putative_Xlinked_makers_file.write("%s\n" % (tag.split()[8]))
        Putative_Xlinked_makers_file.close()

    if numb_putative_Zlinked > 0:
        Putative_Zlinked_makers_file = open("%s%s" %(myvcfpath, ".Putative_Zlinked_makers.fa"), 'w')

        for locus in Putative_Zlinked_makers:
            for tag in catalog:
                if locus == tag.split()[2]:
                    Putative_Zlinked_makers_file.write(">Z_linked|LocusID_%s\n" % (locus))
                    Putative_Zlinked_makers_file.write("%s\n" % (tag.split()[8]))
        Putative_Zlinked_makers_file.close()


    ## close all unclosed files 
    freq_ratios_log.close()
    all_frequencies.close()




    print "\n***DONE!***\n"





In [128]:
vcfpath = "/home/djeffrie/Data/RADseq/Lib_Rjapor/Rjap/Stacks_outs/Populations_combined_pops/batch_1.vcf"

popmap_path = "/home/djeffrie/Data/RADseq/Lib_Rjapor/Rjap/Stacks_outs/pop_codes.txt"
catalog_tags = "/home/djeffrie/Data/RADseq/Lib_Rjapor/Rjap/Stacks_outs/batch_1.catalog.tags.tsv.gz"
het_thresh = 0.7
hom_thresh = 0.8
sample_presence_cut_off = 0.75
coverage_thresh = 6
maf_thresh = 0.05

In [129]:
SL_snp_finder2(vcfpath, popmap_path, catalog_tags,hom_thresh, het_thresh, sample_presence_cut_off, coverage_thresh, maf_thresh) ## NEW FUNCTION

Number of loci = 59892
Rjap_h_F_1
Rjap_h_F_2
Rjap_h_F_3
Rjap_h_F_4
Rjap_h_F_5
Rjap_t_F_1
Rjap_t_F_2
Rjap_t_F_3
Rjap_t_F_4
Rjap_t_F_5
Rjap_h_M_1
Rjap_h_M_2
Rjap_h_M_3
Rjap_h_M_4
Rjap_h_M_5
Rjap_h_M_6
Rjap_t_M_1
Rjap_t_M_2
Rjap_t_M_3
Rjap_t_M_4
Rjap_t_M_5
Rjap_t_M_6


KeyboardInterrupt: 

### Arvalis tests

In [64]:
vcfpath = "/home/djeffrie/Data/Rarvalis_SL_script_tests/stacks/batch_1.vcf"

popmap_path = "/home/djeffrie/Data/Rarvalis_SL_script_tests/stacks/sex_info.txt"
catalog_tags = "/home/djeffrie/Data/Rarvalis_SL_script_tests/stacks/batch_1.catalog.tags.tsv.gz"
het_thresh = 0.7
hom_thresh = 0.9
sample_presence_cut_off = 0.75
coverage_thresh = 6
maf_thresh = 0.05

In [65]:
SL_snp_finder(vcfpath, popmap_path, catalog_tags, hom_thresh, het_thresh, sample_presence_cut_off, coverage_thresh, maf_thresh)

Number of loci = 200608
Rarv_008
Rarv_037
Rarv_040
Rarv_041
Rarv_047
Rarv_053
Rarv_055
Rarv_058
Rarv_063
Rarv_070
Rarv_076
Rarv_081
Rarv_082
Rarv_083
Rarv_085
Rarv_086
Rarv_087
Rarv_088
Rarv_089
Rarv_091
Rarv_092
Rarv_094
Rarv_095
Rarv_097
Rarv_100
Rarv_101
Rarv_102
Rarv_105
Rarv_106
Rarv_108
Rarv_014
Rarv_016
Rarv_023
Rarv_033
Rarv_034
Rarv_036
Rarv_039
Rarv_049
Rarv_057
Rarv_066
Rarv_067
Rarv_084
Rarv_098
Rarv_107
Rarv_110
Rarv_115
Rarv_118
Rarv_122
Rarv_125
Rarv_128
Rarv_134
Number of samples = 51
Number of loci with too few samples = 89027
Number of loci with low MAF = 76806
Number of loci with enough data = 34774
Number of putative X linked snps = 72
Number of putative X linked tags = 69
Number of putative Z linked markers = 0
Number of putative Z linked tags = 0

***DONE!***



### Dalmatina tests

In [107]:
vcfpath = "/home/djeffrie/Data/Rdal_SL_script_tests/vcf/batch_1.vcf"

popmap_path = "/home/djeffrie/Data/Rdal_SL_script_tests/vcf/pop_codes.txt"
catalog_tags = "/home/djeffrie/Data/Rarvalis_SL_script_tests/stacks/batch_1.catalog.tags.tsv.gz"
het_thresh = 0.7
hom_thresh = 0.9
sample_presence_cut_off = 0.75
coverage_thresh = 6
maf_thresh = 0.05

In [73]:
SL_snp_finder(vcfpath, popmap_path, catalog_tags, hom_thresh, het_thresh, sample_presence_cut_off, coverage_thresh, maf_thresh)

Number of loci = 9272
RdalM01
RdalM03
RdalM04
RdalM05
RdalM07
RdalM08
RdalM13
RdalM15
RdalM16
RdalM17
RdalM19
RdalM20
RdalM22
RdalM23
RdalM24
RdalM25
RdalM27
RdalM30
RdalM32
RdalM02
RdalM06
RdalM09
RdalM10
RdalM11
RdalM12
RdalM14
RdalM18
RdalM21
RdalM26
RdalM31
RdalM28
RdalM29
Number of samples = 32
Number of loci with too few samples = 1779
Number of loci with low MAF = 2901
Number of loci with enough data = 4591
Number of putative X linked snps = 150
Number of putative X linked tags = 131
Number of putative Z linked markers = 0
Number of putative Z linked tags = 0

***DONE!***



In [109]:
SL_snp_finder2(vcfpath, popmap_path, catalog_tags, hom_thresh, het_thresh, sample_presence_cut_off, coverage_thresh, maf_thresh)

Number of loci = 9272
RdalM01
RdalM03
RdalM04
RdalM05
RdalM07
RdalM08
RdalM13
RdalM15
RdalM16
RdalM17
RdalM19
RdalM20
RdalM22
RdalM23
RdalM24
RdalM25
RdalM27
RdalM30
RdalM32
RdalM02
RdalM06
RdalM09
RdalM10
RdalM11
RdalM12
RdalM14
RdalM18
RdalM21
RdalM26
RdalM31
RdalM28
RdalM29
Number of samples = 32
Number of loci with too few samples = 1779
Number of loci with low MAF = 2901
Number of loci with enough data = 4591
Number of putative X linked snps = 150
Number of putative X linked tags = 131
Number of putative Z linked markers = 0
Number of putative Z linked tags = 0

***DONE!***



### Rjap  h pop only ------------------------------------------------------------------------------------------------

In [118]:
vcfpath = "/home/djeffrie/Data/RADseq/Lib_Rjapor/Rjap/Stacks_outs/Populations_h_pop_only/Populations_h_pop_only/batch_1.vcf"

popmap_path = "/home/djeffrie/Data/RADseq/Lib_Rjapor/Rjap/Stacks_outs/Populations_h_pop_only/Populations_h_pop_only/pop_codes_h_pop_only.txt"
catalog_tags = "/home/djeffrie/Data/RADseq/Lib_Rjapor/Rjap/Stacks_outs/batch_1.catalog.tags.tsv.gz"
het_thresh = 0.8
hom_thresh = 1
sample_presence_cut_off = 0.75
coverage_thresh = 6
maf_thresh = 0.05

In [119]:
SL_snp_finder(vcfpath, popmap_path, catalog_tags, hom_thresh, het_thresh, sample_presence_cut_off, coverage_thresh, maf_thresh)

Number of loci = 39479
Rjap_h_F_1
Rjap_h_F_2
Rjap_h_F_3
Rjap_h_F_4
Rjap_h_F_5
Rjap_h_M_1
Rjap_h_M_2
Rjap_h_M_3
Rjap_h_M_4
Rjap_h_M_5
Rjap_h_M_6
Number of samples = 11
Number of loci with too few samples = 1657
Number of loci with low MAF = 0
Number of loci with enough data = 37821
Number of putative X linked snps = 61
Number of putative X linked tags = 48
Number of putative Z linked markers = 4
Number of putative Z linked tags = 3

***DONE!***



### Rjap  t pop only ------------------------------------------------------------------------------------------------

In [130]:
vcfpath = "/home/djeffrie/Data/RADseq/Lib_Rjapor/Rjap/Stacks_outs/Populations_t_pop_only/Populations_t_pop_only/batch_1.vcf"

popmap_path = "/home/djeffrie/Data/RADseq/Lib_Rjapor/Rjap/Stacks_outs/Populations_t_pop_only/pop_codes_t_pop_only.txt"
catalog_tags = "/home/djeffrie/Data/RADseq/Lib_Rjapor/Rjap/Stacks_outs/batch_1.catalog.tags.tsv.gz"
het_thresh = 0.8
hom_thresh = 1
sample_presence_cut_off = 0.75
coverage_thresh = 6
maf_thresh = 0.05

In [131]:
SL_snp_finder2(vcfpath, popmap_path, catalog_tags, hom_thresh, het_thresh, sample_presence_cut_off, coverage_thresh, maf_thresh)

Number of loci = 46222
Rjap_t_F_1
Rjap_t_F_2
Rjap_t_F_3
Rjap_t_F_4
Rjap_t_F_5
Rjap_t_M_1
Rjap_t_M_2
Rjap_t_M_3
Rjap_t_M_4
Rjap_t_M_5
Rjap_t_M_6
Number of samples = 11
Number of loci with too few samples = 2419
Number of loci with low MAF = 0
Number of loci with enough data = 43802
Number of putative X linked snps = 731
Number of putative X linked tags = 662
Number of putative Z linked markers = 76
Number of putative Z linked tags = 69

***DONE!***



    This is potentially very interesting . . . but I need to check this thoroughly. First check to see if the male that falls out in these tests is always the same one because I do not find any loci if I require them to be hetrozygous in ALL males, I have to let one be hom.

### Rorn combinedG

In [87]:
vcfpath = "/home/djeffrie/Data/RADseq/Lib_Rjapor/Rorn/Stacks_outs/Populations_pops_combined/batch_1.vcf"

popmap_path = "/home/djeffrie/Data/RADseq/Lib_Rjapor/Rorn/Stacks_outs/pop_codes_combined.txt"
catalog_tags = "/home/djeffrie/Data/Rarvalis_SL_script_tests/stacks/batch_1.catalog.tags.tsv.gz"
het_thresh = 0.8
hom_thresh = 0.9
sample_presence_cut_off = 0.75
coverage_thresh = 6
maf_thresh = 0.05

In [88]:
SL_snp_finder(vcfpath, popmap_path, catalog_tags, hom_thresh, het_thresh, sample_presence_cut_off, coverage_thresh, maf_thresh)

Number of loci = 66337
Ror_h_F_1
Ror_h_F_2
Ror_h_F_3
Ror_h_F_4
Ror_h_F_5
Ror_h_F_6
Ror_o_F_1
Ror_o_F_2
Ror_o_F_3
Ror_o_F_4
Ror_o_F_5
Ror_o_F_6
Ror_h_M_1
Ror_h_M_2
Ror_h_M_3
Ror_h_M_4
Ror_h_M_5
Ror_h_M_6
Ror_o_M_1
Ror_o_M_2
Ror_o_M_3
Ror_o_M_4
Ror_o_M_5
Ror_o_M_6
Number of samples = 24
Number of loci with too few samples = 4137
Number of loci with low MAF = 0
Number of loci with enough data = 62199
Number of putative X linked snps = 6
Number of putative X linked tags = 6
Number of putative Z linked markers = 8
Number of putative Z linked tags = 7

***DONE!***



### Rorn h pop only

In [110]:
vcfpath = "/home/djeffrie/Data/RADseq/Lib_Rjapor/Rorn/Stacks_outs/Populations_h_pop_only/batch_1.vcf"

popmap_path = "/home/djeffrie/Data/RADseq/Lib_Rjapor/Rorn/Stacks_outs/pop_codes_h_pop_only.txt"
catalog_tags = "/home/djeffrie/Data/Rarvalis_SL_script_tests/stacks/batch_1.catalog.tags.tsv.gz"
het_thresh = 1
hom_thresh = 1
sample_presence_cut_off = 0.75
coverage_thresh = 6
maf_thresh = 0.05

In [111]:
SL_snp_finder2(vcfpath, popmap_path, catalog_tags, hom_thresh, het_thresh, sample_presence_cut_off, coverage_thresh, maf_thresh)

Number of loci = 36871
Ror_h_F_1
Ror_h_F_2
Ror_h_F_3
Ror_h_F_4
Ror_h_F_5
Ror_h_F_6
Ror_h_M_1
Ror_h_M_2
Ror_h_M_3
Ror_h_M_4
Ror_h_M_5
Ror_h_M_6
Number of samples = 12
Number of loci with too few samples = 962
Number of loci with low MAF = 0
Number of loci with enough data = 35908
Number of putative X linked snps = 70
Number of putative X linked tags = 59
Number of putative Z linked markers = 80
Number of putative Z linked tags = 66

***DONE!***



### Rorn o Pop only

In [112]:
vcfpath = "/home/djeffrie/Data/RADseq/Lib_Rjapor/Rorn/Stacks_outs/Populations_o_pop_only/batch_1.vcf"

popmap_path = "/home/djeffrie/Data/RADseq/Lib_Rjapor/Rorn/Stacks_outs/pop_codes_o_pop_only.txt"
catalog_tags = "/home/djeffrie/Data/Rarvalis_SL_script_tests/stacks/batch_1.catalog.tags.tsv.gz"
het_thresh = 1
hom_thresh = 1
sample_presence_cut_off = 0.75
coverage_thresh = 6
maf_thresh = 0.05

In [113]:
SL_snp_finder(vcfpath, popmap_path, catalog_tags, hom_thresh, het_thresh, sample_presence_cut_off, coverage_thresh, maf_thresh)

Number of loci = 57369
Ror_o_F_1
Ror_o_F_2
Ror_o_F_3
Ror_o_F_4
Ror_o_F_5
Ror_o_F_6
Ror_o_M_1
Ror_o_M_2
Ror_o_M_3
Ror_o_M_4
Ror_o_M_5
Ror_o_M_6
Number of samples = 12
Number of loci with too few samples = 1551
Number of loci with low MAF = 0
Number of loci with enough data = 55817
Number of putative X linked snps = 13
Number of putative X linked tags = 12
Number of putative Z linked markers = 33
Number of putative Z linked tags = 27

***DONE!***

