## Making the dadi input file from Stacks VCF output

In [118]:
### Note that this function was written for VCF files containing one outgroup and one in-group!

import vcf
import re

## Copies of the reference and alternate allele contributed by each genotype
## call that is counted; any other call (e.g. missing data) contributes nothing.
_GT_ALLELE_COUNTS = {
    "0/0": (2, 0),
    "0/1": (1, 1),
    "1/0": (1, 1),
    "1/1": (0, 2),
}


def _population_of(sample_name):
    """Return the population label at the start of a sample name.

    The label is the text before the first underscore, with digits and
    hyphens stripped from both ends (e.g. "BF12_a" -> "BF").
    """
    return sample_name.split("_")[0].strip("0123456789-")


def vcf_to_dadi(VCF_file, outgrp_string, out_file):
    """Write a dadi SNP-format input file from a VCF file.

    Population labels are derived from the sample names in the VCF header
    (see ``_population_of``) and printed to stdout. One tab-separated line is
    written per VCF record: the in-group and outgroup alleles (each wrapped
    in "-"), Allele1 with its count in every population, Allele2 with its
    count in every population, then the record ID and position.

    Allele1 is the allele that is most common across all non-outgroup samples
    pooled together (the VCF REF allele on a tie); every population column,
    including the outgroup, is counted against that same Allele1/Allele2
    assignment.

    Args:
        VCF_file: path of the VCF file to read.
        outgrp_string: prefix identifying outgroup samples; a sample belongs
            to the outgroup when its name starts with this string. The
            population column whose label equals this string receives the
            outgroup counts.
        out_file: path of the dadi input file to write (overwritten).
    """
    with open(VCF_file, 'r') as VCF_handle:

        my_vcf = vcf.Reader(VCF_handle)  ## read in data with vcf module

        ## Population names come from the header's sample list, so no record
        ## has to be consumed (and the file re-opened) just to learn them.
        names = sorted(set(_population_of(name) for name in my_vcf.samples))
        print("Populations (n=" + str(len(names)) + "):")
        print(', '.join(names))

        header_line = ("In_REF\tOut_REF\tAllele1\t" + '\t'.join(names) +
                       "\tAllele2\t" + '\t'.join(names) + "\tTag_ID\tSite")

        with open(out_file, 'w') as dadi_input:

            dadi_input.write(header_line + "\n")  ## write header line

            ## For each SNP in the file ...
            for snp in my_vcf:

                out_ref_count = 0
                out_alt_count = 0
                in_ref_total = 0   ## pooled over every non-outgroup sample
                in_alt_total = 0
                pop_counts = {}    ## population label -> (ref copies, alt copies)

                for sample in snp.samples:
                    ref_n, alt_n = _GT_ALLELE_COUNTS.get(sample['GT'], (0, 0))

                    if sample.sample.startswith(outgrp_string):
                        out_ref_count += ref_n
                        out_alt_count += alt_n
                    else:
                        ## Credit the call to the sample's own population only.
                        population = _population_of(sample.sample)
                        prev_ref, prev_alt = pop_counts.get(population, (0, 0))
                        pop_counts[population] = (prev_ref + ref_n,
                                                  prev_alt + alt_n)
                        in_ref_total += ref_n
                        in_alt_total += alt_n

                ## Outgroup allele = most common allele among outgroup samples.
                ## NOTE(review): a tie (including no outgroup calls at all)
                ## resolves to ALT, as in the original code -- confirm intended.
                if out_ref_count <= out_alt_count:
                    OUT_REF_ALLELE = snp.ALT[0]
                else:
                    OUT_REF_ALLELE = snp.REF

                ## In-group allele = most common allele in the pooled in-group;
                ## REF wins ties.
                ref_is_allele1 = in_alt_total <= in_ref_total
                if ref_is_allele1:
                    IN_REF_ALLELE = snp.REF
                    IN_ALT_ALLELE = snp.ALT[0]
                else:
                    IN_REF_ALLELE = snp.ALT[0]
                    IN_ALT_ALLELE = snp.REF

                in_REF_counts = []
                in_ALT_counts = []

                for population in names:
                    if population == outgrp_string:
                        ref_n, alt_n = out_ref_count, out_alt_count
                    else:
                        ## A population with no counted calls at this site
                        ## is reported as 0/0.
                        ref_n, alt_n = pop_counts.get(population, (0, 0))

                    ## Orient every column by the same Allele1/Allele2 choice.
                    if ref_is_allele1:
                        allele1_n, allele2_n = ref_n, alt_n
                    else:
                        allele1_n, allele2_n = alt_n, ref_n

                    in_REF_counts.append(str(allele1_n))
                    in_ALT_counts.append(str(allele2_n))

                snp_line = '\t'.join([
                    "-" + str(IN_REF_ALLELE) + "-",
                    "-" + str(OUT_REF_ALLELE) + "-",
                    str(IN_REF_ALLELE),
                    '\t'.join(in_REF_counts),
                    str(IN_ALT_ALLELE),
                    '\t'.join(in_ALT_counts),
                    str(snp.ID),
                    str(snp.POS),
                ])
                dadi_input.write(snp_line + "\n")

In [120]:
## Convert the test VCF into a dadi input file, treating samples whose names
## start with "RM" as the outgroup.
vcf_to_dadi(
    VCF_file="/media/dan/34D5D1CE642D7E36/2013076_Hanfling_Bernd/dadi/TESTER.vcf",
    outgrp_string="RM",
    out_file="/media/dan/34D5D1CE642D7E36/2013076_Hanfling_Bernd/dadi/TEST.vcf",
)

Populations (n=35):
BF, BOR, CA-LK, CAKE, COP, DEND, EP, FM, GBP, GF, H2K, H2KH, HK, HOLT, LAS, MOAT, MY, OBY, OU, PED, POLEN, PRO, RM, SD, SK, STEC, STYV, SWED, SWED-H, SWEDG, TROM, TU, V, VIKKHY, WEN
