In [607]:
import os
import pprint as pp
import sys
from collections import Counter
from pydoc import help

import vcf



def VCF_MSTmap_converter(vcfpath, sex_info, sample_threshold):

    """
    USAGE:  VCF_to_MSTmap.py  <vcfpath>  <sex_info file>  <sample_presence_threshold>

        From VCF format, filter for linkage informative loci, duplicate each locus
        in the complementary phase and output to MSTmap input format.

        Two MSTmap input files are made, one containing loci that are heterozygous
        in the mother and homozygous in the father, and a second containing loci
        heterozygous in the father and homozygous in the mother.

        In both files, the (uninformative) allele from the homozygous parent is
        removed from all genotypes to conform to the doubled haploid ('DH') data
        format in MSTmap.
        eg.

        if Maternal genotype = AB and Paternal genotype = B
            if offspring genotype == AB
                new offspring genotype = A

            else if offspring genotype == A (diploid homozygous A)
                new offspring genotype = A (haploid A)

        # In the latter example, the codes for diploid homozygous (AA) and haploid (A)
        # are the same, but using population_type = DH in MSTmap will ensure that
        # the genotype is correctly interpreted as haploid.

        Lastly, each locus is replicated with the complementary phase in the MSTmap
        input files produced. This allows for the construction of a linkage map
        without a priori knowledge of marker phase. Marker names for complementary
        phase loci are prefixed with "compli_" and their genotype states are lower
        case instead of upper case.

        MSTmap input files are written to the same directory as the VCF file and are
        called "MSTmap_input_male_het.txt" and "MSTmap_input_female_het.txt".
        A copy of the VCF with a rewritten "Allele Depth" header line is written
        next to it as "<vcfpath>.altered".

    ARGUMENTS:

        vcfpath           Path to the input VCF file.
        sex_info          Path to a whitespace separated file with one sample per
                          line: <sample name> <sex code>, where the sex code is
                          F (mother), M (father) or O (offspring), in either case.
                          Only offspring listed in this file are written out.
        sample_threshold  Fraction (e.g. 0.75). A locus is discarded when the number
                          of called samples in the VCF record is lower than
                          (number of samples in the sex_info file) * sample_threshold.

    RAISES:

        ValueError if the sex_info file does not name both a mother and a father.
    """

    ## open vcf and alter it so PyVCF can parse it properly ======================

    altered_vcf_path = _write_altered_vcf(vcfpath)

    ## Get the sex info for samples ==============================================

    mother, father, offspring, sample_counter = _read_sex_info(sex_info)
    print("Number of offspring = %s" % len(offspring))

    offspring_set = set(offspring)  ## O(1) membership test inside the record loop
    min_called = sample_counter * sample_threshold

    ## Get and convert sample data ===============================================

    total_loci = 0
    missing_data_discards = 0
    Mat_Pat_het_discards = 0
    Mat_pat_hom_discards = 0
    Mat_pat_missing_discards = 0

    ## One finished MSTmap line per locus that passes the filters
    female_het_loci_kept = []
    female_het_loci_kept_compli = []
    male_het_loci_kept = []
    male_het_loci_kept_compli = []

    with open(altered_vcf_path, 'r') as vcf_handle:

        for record in vcf.Reader(vcf_handle):

            total_loci += 1

            ## NOTE: num_called counts every called sample in the VCF record, not
            ## only the samples named in the sex_info file.
            if record.num_called < min_called:  ## discard if too much missing data
                missing_data_discards += 1
                continue

            Loc_Id = "%s_%s" % (record.ID, record.POS)

            ## Reset for every record so that a parent that is absent from the
            ## VCF is treated as missing instead of re-using the previous locus.
            Maternal_genotype = "U"
            Paternal_genotype = "U"
            offspring_genotypes = {}

            for call in record.samples:

                ## Genotypes outside the table (e.g. multi-allelic or uncalled)
                ## are treated as missing data.
                code = _GT_CODES.get(call['GT'], "U")
                name = call.sample

                if name == mother:
                    Maternal_genotype = code
                elif name == father:
                    Paternal_genotype = code
                elif name in offspring_set:
                    offspring_genotypes[name] = code

            ## Filter for informative loci. The parentheses-free "x and y or z"
            ## form is avoided on purpose: "and" binds tighter than "or".

            ## Female mapping data (mother heterozygous, father homozygous)
            if Maternal_genotype == "AB" and Paternal_genotype in ("A", "B"):
                line, compli = _haploid_locus_lines(
                    Loc_Id, offspring, offspring_genotypes, Paternal_genotype)
                female_het_loci_kept.append(line)
                female_het_loci_kept_compli.append(compli)

            ## Male mapping data (father heterozygous, mother homozygous)
            elif Paternal_genotype == "AB" and Maternal_genotype in ("A", "B"):
                line, compli = _haploid_locus_lines(
                    Loc_Id, offspring, offspring_genotypes, Maternal_genotype)
                male_het_loci_kept.append(line)
                male_het_loci_kept_compli.append(compli)

            ## Discard loci that are heterozygous in both
            elif Paternal_genotype == "AB" and Maternal_genotype == "AB":
                Mat_Pat_het_discards += 1

            ## Discard loci missing in either parent
            elif Paternal_genotype == "U" or Maternal_genotype == "U":
                Mat_pat_missing_discards += 1

            ## Only remaining combination: both parents homozygous
            else:
                Mat_pat_hom_discards += 1

                                    ## WRITING FILE ##

    ## dirname (rather than splitting on "/") keeps a bare file name in the
    ## current directory instead of resolving to the filesystem root.
    out_dir = os.path.dirname(vcfpath)

    _write_mstmap_file(os.path.join(out_dir, "MSTmap_input_female_het.txt"),
                       "Female_map", offspring,
                       female_het_loci_kept, female_het_loci_kept_compli)

    _write_mstmap_file(os.path.join(out_dir, "MSTmap_input_male_het.txt"),
                       "Male_map", offspring,
                       male_het_loci_kept, male_het_loci_kept_compli)

    print("Total number of loci in VCF = %s" % total_loci)
    print("Number of loci thrown out due to low sample presence = %s" % missing_data_discards)
    print("Number of loci thrown out becuase heterozygous in mother and father = %s" % Mat_Pat_het_discards)
    print("Number of loci thrown out because homozygous in mother and father = %s" % Mat_pat_hom_discards)
    print("Number of loci thrown out because missing in mother or father = %s" % Mat_pat_missing_discards)
    print("loci in MSTmap male map (excl. complimentary phase) = %s" % len(male_het_loci_kept))
    print("loci in MSTmap female map (excl. complimentary phase) = %s" % len(female_het_loci_kept))


## codings for MSTmap file =======================================================

## VCF genotype string -> MSTmap code. Phased spellings map to the same codes.
_GT_CODES = {"0/0": "A", "1/1": "B", "0/1": "AB", "1/0": "AB",
             "0|0": "A", "1|1": "B", "0|1": "AB", "1|0": "AB",
             None: "U"}

## Haploid code -> code used for the complementary phase copy of the locus
_COMPLI_CODES = {"A": "b", "B": "a", "U": "U"}

## MSTmap header; filled with (population name, number of loci, number of individuals)
_MSTMAP_HEADER = (
    "population_type DH\n"
    "population_name %s\n"
    "distance_function kosambi\n"
    "cut_off_p_value 0.000005\n"
    "no_map_dist 30\n"
    "no_map_size 1\n"
    "missing_threshold 0.25\n"
    "estimation_before_clustering no\n"
    "detect_bad_data yes\n"
    "objective_function ML\n"
    "number_of_loci %s\n"
    "number_of_individual %s\n\n"
)


def _write_altered_vcf(vcfpath):
    """Copy the VCF to "<vcfpath>.altered", replacing every line that contains
    "Allele Depth" with a fixed AD FORMAT header line, and return the new path."""

    alteredvcfpath = "%s%s" % (vcfpath, ".altered")

    with open(vcfpath, 'r') as original, open(alteredvcfpath, 'w') as altered:
        for line in original:
            if "Allele Depth" in line:
                line = '##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allele Depth">\n'
            altered.write(line)

    return alteredvcfpath


def _read_sex_info(sex_info):
    """Parse the sex_info file and return (mother, father, sorted offspring
    names, number of sample lines read). Raises ValueError if a parent is missing."""

    mother = None
    father = None
    offspring = []
    sample_counter = 0

    with open(sex_info, 'r') as handle:
        for line in handle:

            fields = line.split()
            if len(fields) < 2:  ## blank or incomplete line, nothing to read
                continue

            sample_counter += 1
            sample_name = fields[0]
            sex = fields[1].upper()

            if sex == "F":
                mother = sample_name
                print("Mother = %s" % sample_name)

            elif sex == "M":
                father = sample_name
                print("Father = %s" % sample_name)

            elif sex == "O":
                offspring.append(sample_name)

    if mother is None or father is None:
        raise ValueError("sex_info file %r must contain one mother (F) and one father (M)"
                         % (sex_info,))

    return mother, father, sorted(offspring), sample_counter


def _haploid_locus_lines(loc_id, offspring, offspring_genotypes, homozygous_parent):
    """Build the MSTmap line for one locus and the line for its complementary
    phase copy, after removing the allele of the homozygous parent ("A" or "B")."""

    ## A heterozygous offspring got the homozygous parent's allele from that
    ## parent, so the informative parent contributed the other allele.
    het_call = "B" if homozygous_parent == "A" else "A"

    calls = []
    for sample in offspring:  ## in the order of the sorted offspring name list
        genotype = offspring_genotypes.get(sample, "U")  ## not in VCF -> missing
        if genotype == "AB":
            genotype = het_call
        calls.append(genotype)

    line = "\n%s%s" % (loc_id, "".join("\t%s" % call for call in calls))
    compli_line = "\ncompli_%s%s" % (
        loc_id, "".join("\t%s" % _COMPLI_CODES[call] for call in calls))

    return line, compli_line


def _write_mstmap_file(path, population_name, offspring, loci, compli_loci):
    """Write one MSTmap input file: header, sample name row, then the loci
    followed by their complementary phase copies."""

    with open(path, 'w') as handle:
        ## number_of_loci counts both phases of every locus
        handle.write(_MSTMAP_HEADER % (population_name,
                                       len(loci) + len(compli_loci),
                                       len(offspring)))
        handle.write("locus_name\t%s" % ("\t".join(offspring)))
        handle.writelines(loci)
        handle.writelines(compli_loci)

In [None]:
## Command line entry point. Guarded so that importing this file does not
## trigger argument parsing or a conversion run.
if __name__ == "__main__":

    if len(sys.argv) == 1:
        ## No arguments: show the usage text held in the converter's docstring
        print(VCF_MSTmap_converter.__doc__)

    elif len(sys.argv) < 4:  ## If not enough args are supplied print error message
        sys.exit("\n##Error, not enough arguments, run script with no arguments to see help message\n")

    elif len(sys.argv) > 4:  ## Extra args used to be ignored silently
        sys.exit("\n##Error, too many arguments, run script with no arguments to see help message\n")

    else:
        specified_vcfpath = sys.argv[1]
        sex_info_file = sys.argv[2]

        try:
            sample_presence_threshold = float(sys.argv[3])
        except ValueError:
            sys.exit("\n##Error, sample_presence_threshold must be a number (e.g. 0.75)\n")

        VCF_MSTmap_converter(specified_vcfpath, sex_info_file, sample_presence_threshold)


### For testing =======================================================
##
## The call below used to run unconditionally after the command line handling,
## which re-ran the converter on a hard-coded file on every invocation. It is
## kept here only as an example; run it by hand when needed.
##
## specified_vcfpath = "/home/djeffrie/Data/Pnig_RAD/Stacks_outs/populations_better/batch_1_test.vcf"
## sex_info_file = "/home/djeffrie/Data/Pnig_RAD/Stacks_outs/pop_codes_linkage.txt"
## sample_presence_threshold = 0.75
##
## VCF_MSTmap_converter(specified_vcfpath, sex_info_file, sample_presence_threshold)

