In [1]:
import sys
from collections import Counter

import pandas as pd
from pysam import VariantFile

In [2]:
# Paths for the first annotation run: `in_vcf` is the VCF that gets read,
# `out_vcf` is the annotated copy that gets written next to it.
# NOTE(review): absolute, machine-specific paths -- adjust before running elsewhere.
in_vcf = "/media/urbe/MyBDrive1/Antoine/27-10-21_VariantCalling_MA/endpoint/merged_allsites.vcf"
out_vcf = "/media/urbe/MyBDrive1/Antoine/27-10-21_VariantCalling_MA/endpoint/merged_allsites.GETS.vcf"

In [3]:
%%bash

# Print the VCF column-header line (the one starting with #CHROM, which lists the
# sample names). Matching on the prefix instead of a fixed line number keeps this
# working when the number of "##" meta lines changes.
grep -m 1 '^#CHROM' /media/urbe/MyBDrive1/Antoine/27-10-21_VariantCalling_MA/endpoint/merged_allsites.vcf

#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	D2A1	D2B3	D2C1	D2C3	D3A1	D3A3	D4A3	D4B4	D5B3	D5C1	D5C3	H2A3	H2B4	H2C3	H3A4	H3C4	H4A4	H4C2	H5A2	H5A3	H5A4	H5C2	ancestor


# Micro-evolution experiment

## Create a new VCF annotated with the SH, TP and GET tags

### 100 generations endpoint only

In [4]:
%%time

# Annotate every record of `in_vcf` and write it to `out_vcf` with four new tags:
#   FORMAT/TP  : variant type of the sample at this site (two-letter code, see below)
#   FORMAT/GET : how the sample differs from the `ancestor` sample (see below)
#   FORMAT/SH  : number of OTHER samples whose detected allele tuple is identical
#   INFO/SH    : for each record allele (REF first), number of samples carrying it
#
# An allele is "detected" in a sample when its AD-based frequency is >= 0.1 or when
# it is part of the sample's called genotype. If AD/DP cannot be used, a homozygous
# called genotype is used as a fallback.
#
# The `with` block guarantees both files are closed even if the loop is interrupted.
with VariantFile(in_vcf) as vcf_in, \
        VariantFile(out_vcf, 'w', header=vcf_in.header) as vcf_out:

    # For writing out in a new VCF file
    vcf_out.header.add_meta(key="FORMAT", items=[("ID", "GET"), ("Number", "1"), ("Type", "String"), ("Description", "Type of variation compared to the ancestor sample.")])
    vcf_out.header.add_meta(key="FORMAT", items=[("ID", "SH"), ("Number", "1"), ("Type", "Integer"), ("Description", "Number of samples sharing the same variant as the considered sample.")])
    vcf_out.header.add_meta(key="FORMAT", items=[("ID", "TP"), ("Number", "1"), ("Type", "String"), ("Description", "Type of variant.")])
    vcf_out.header.add_meta(key="INFO", items=[("ID", "SH"), ("Number", "R"), ("Type", "Integer"), ("Description", "Number of samples sharing the same allele.")])

    # Necessary to allow setting new tags in the variant records and record samples
    vcf_in.header.add_meta(key="FORMAT", items=[("ID", "GET"), ("Number", "1"), ("Type", "String"), ("Description", "Type of variation compared to the ancestor sample.")])
    vcf_in.header.add_meta(key="FORMAT", items=[("ID", "SH"), ("Number", "1"), ("Type", "Integer"), ("Description", "Number of samples sharing the same variant as the considered sample.")])
    vcf_in.header.add_meta(key="FORMAT", items=[("ID", "TP"), ("Number", "1"), ("Type", "String"), ("Description", "Type of variant.")])
    vcf_in.header.add_meta(key="INFO", items=[("ID", "SH"), ("Number", "R"), ("Type", "Integer"), ("Description", "Number of samples sharing the same allele (ref or alt).")])

    # DEBUG : check if header was modified indeed
    print(vcf_out.header)

    ancestor = "ancestor"

    # Rank of each TP code, used by the GET decision table:
    # 0 = unusable (uncalled or <NON_REF>), 1 = one allele, 2 = two alleles, 3 = three or more.
    typeDict = {"UC":0, "AU":0, "BU":0, "MU":0, "RF":1, "AS":1, "AI": 1, "BI":2, "BS":2, "MI":3, "MS":3}

    # Sample names never change between records, so read them from the header once.
    sample_names = list(vcf_out.header.samples)

    # For each record in the input VCF
    for n, rec in enumerate(vcf_in):
        if n % 1000000 == 0:
            print("Elapsed records: {}".format(n))

        cAlleles = {k: 0 for k in rec.alleles}  # allele -> number of samples carrying it (INFO/SH)
        allAlleles = {}  # sample name -> tuple of alleles detected in that sample

        # ---- Pass 1: detect alleles and set TP for every sample ----
        for k, sm in rec.samples.items():
            cCurrentAlleles = []

            try:  # Any failure here (no AD, missing depth values, zero depth) -> fallback below
                called = sm.alleles
                for al, dp in zip(rec.alleles, sm["AD"]):
                    try:
                        freq = dp / sm["DP"]
                    except Exception:  # DP missing or zero: use the summed allelic depths instead
                        freq = dp / sum(sm["AD"])

                    # Keep the allele if frequent enough or if it is in the called genotype
                    if (freq >= 0.1 or al in called) and al not in cCurrentAlleles:
                        cCurrentAlleles.append(al)
            except Exception:
                try:  # Fallback: trust the genotype call, but only when it is homozygous
                    called_set = set(sm.alleles)
                    if len(called_set) == 1:
                        al = called_set.pop()
                        if al not in cCurrentAlleles:
                            cCurrentAlleles.append(al)
                except Exception:
                    print(rec)
                    print("Unexpected error:", sys.exc_info()[0])

            # Count this sample once for every allele it carries. Counting here, after
            # detection, is what makes INFO/SH non-zero on the AD path and avoids
            # double counting when the AD path fails half-way and the fallback runs.
            for al in cCurrentAlleles:
                cAlleles[al] = cAlleles.get(al, 0) + 1

            found = tuple(cCurrentAlleles)  # immutable: safe to compare and to use as a dict key
            allAlleles[k] = found

            # TP code = <A|B|M> (one / two / three-or-more alleles) + <S|I|U> (SNV / indel /
            # unknown because of <NON_REF>), or "RF" (reference only) / "UC" (uncalled).
            if len(found) == 0 or None in sm.alleles:
                sm["TP"] = "UC"
            elif len(found) == 1:
                only = found[0]
                if only == rec.ref:
                    sm["TP"] = "RF"
                elif len(only) == 1 and only != "*":
                    sm["TP"] = "AS"
                elif only == "<NON_REF>":
                    sm["TP"] = "AU"
                else:
                    sm["TP"] = "AI"
            else:
                prefix = "B" if len(found) == 2 else "M"
                if all(len(x) == 1 for x in found) and "*" not in found:
                    sm["TP"] = prefix + "S"
                elif "<NON_REF>" in found:
                    sm["TP"] = prefix + "U"
                else:
                    sm["TP"] = prefix + "I"

        # ---- Pass 2: compare every non-ancestor sample with the ancestor (GET) ----
        # A / O are the TP ranks of the ancestor / offspring (see typeDict):
        #   U      either side unusable
        #   0 / A  same single allele (reference / non-reference)
        #   S1     two different single alleles
        #   N      offspring has two alleles, one shared with a single-allele ancestor
        #   L      offspring has a single allele that the multi-allele ancestor also has
        #   M / C  both have two alleles: one shared / identical
        #   I / D  offspring has more / fewer alleles than the ancestor, with overlap
        #   S2-S4  suspect combinations (no or too little overlap)
        AncAl = allAlleles[ancestor]
        anc_set = set(AncAl)
        A = typeDict[rec.samples[ancestor]["TP"]]
        for k, sm in rec.samples.items():
            if sm.name == ancestor:
                continue

            OffAl = allAlleles[k]
            Off = sm["TP"]
            O = typeDict[Off]

            off_set = set(OffAl)
            NumShared = len(anc_set & off_set)
            # True only when both samples carry exactly the same alleles. Testing just
            # "ancestor alleles are all in the offspring" would label an offspring with
            # extra alleles as conserved ("C").
            AllShared = anc_set == off_set
            NoneShared = NumShared == 0

            if A == 0 or O == 0:
                sm["GET"] = "U"
            elif A == 1 and O == 1:
                if AllShared:
                    sm["GET"] = "0" if Off == "RF" else "A"
                else:
                    sm["GET"] = "S1"
            elif A == 1 and O == 2:
                sm["GET"] = "S4" if NoneShared else "N"
            elif A == 1 and O == 3:
                sm["GET"] = "S4"
            elif A == 2 and O == 1:
                sm["GET"] = "S3" if NoneShared else "L"
            elif A == 2 and O == 2:
                if NoneShared:
                    sm["GET"] = "S2"
                elif NumShared == 1:
                    sm["GET"] = "M"
                else:
                    sm["GET"] = "C"
            elif A == 2 and O == 3:
                # Fewer than two shared alleles would need more than two events: suspect
                sm["GET"] = "I" if NumShared >= 2 else "S2"
            elif A == 3 and O == 1:
                sm["GET"] = "S3" if NoneShared else "L"
            elif A == 3 and O == 2:
                sm["GET"] = "S3" if NoneShared else "D"
            elif A == 3 and O == 3:
                if NoneShared:
                    sm["GET"] = "S3"
                elif AllShared:
                    sm["GET"] = "C"
                elif len(AncAl) > len(OffAl):
                    sm["GET"] = "D"
                else:
                    sm["GET"] = "I"
            else:  # Should not come here: every (A, O) pair is handled above
                print(n, allAlleles)
                print(k, rec)
                print("----")

        # ---- Pass 3: sharedness ----
        # Samples of the header that are absent from the record count as "no alleles"
        for sample in sample_names:
            allAlleles.setdefault(sample, ())

        # FORMAT/SH: number of other samples with the same allele tuple (minus itself)
        tuple_counts = Counter(allAlleles.values())
        for k, sm in rec.samples.items():
            sm["SH"] = tuple_counts[allAlleles[k]] - 1

        # INFO/SH: one count per record allele, in record order (Number=R)
        rec.info["SH"] = [cAlleles[al] for al in rec.alleles]

        # Write out the record; report (but survive) a record that cannot be written
        try:
            vcf_out.write(rec)
        except Exception as err:
            print("Here: ", n)
            print(rec, "\n")
            print("Unexpected error:", err)

##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele not already represented at this location by REF and ALT">
##FILTER=<ID=LowQual,Description="Low quality">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another; will always be heterozygous and is not intended to describe called alleles">
##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (b

Elapsed records: 73000000
Elapsed records: 74000000
Elapsed records: 75000000
Elapsed records: 76000000
Elapsed records: 77000000
Elapsed records: 78000000
Elapsed records: 79000000
Elapsed records: 80000000
Elapsed records: 81000000
Elapsed records: 82000000
Elapsed records: 83000000
Elapsed records: 84000000
Elapsed records: 85000000
Elapsed records: 86000000
Elapsed records: 87000000
Elapsed records: 88000000
Elapsed records: 89000000
Elapsed records: 90000000
Elapsed records: 91000000
Elapsed records: 92000000
Elapsed records: 93000000
Elapsed records: 94000000
Elapsed records: 95000000
Elapsed records: 96000000
Elapsed records: 97000000
Elapsed records: 98000000
Elapsed records: 99000000
Elapsed records: 100000000
CPU times: user 10h 5min 26s, sys: 3min 23s, total: 10h 8min 50s
Wall time: 10h 10min 5s


# Micro-evolution experiment

## Create a new VCF annotated with the SH, TP and GET tags

### All samples together (50 and 100 gens)

In [5]:
# Paths for the second annotation run: `in_vcf` is the VCF that gets read,
# `out_vcf` is the annotated copy that gets written next to it.
# NOTE(review): absolute, machine-specific paths -- adjust before running elsewhere.
in_vcf = "/media/urbe/MyBDrive1/Antoine/27-10-21_VariantCalling_MA/genotype_allsamples/merged_allsites.vcf"
out_vcf = "/media/urbe/MyBDrive1/Antoine/27-10-21_VariantCalling_MA/genotype_allsamples/merged_allsites.GETS.vcf"

In [6]:
%%time

# Annotate every record of `in_vcf` and write it to `out_vcf` with four new tags:
#   FORMAT/TP  : variant type of the sample at this site (two-letter code, see below)
#   FORMAT/GET : how the sample differs from the `ancestor` sample (see below)
#   FORMAT/SH  : number of OTHER samples whose detected allele tuple is identical
#   INFO/SH    : for each record allele (REF first), number of samples carrying it
#
# An allele is "detected" in a sample when its AD-based frequency is >= 0.1 or when
# it is part of the sample's called genotype. If AD/DP cannot be used, a homozygous
# called genotype is used as a fallback.
#
# The `with` block guarantees both files are closed even if the loop is interrupted.
with VariantFile(in_vcf) as vcf_in, \
        VariantFile(out_vcf, 'w', header=vcf_in.header) as vcf_out:

    # For writing out in a new VCF file
    vcf_out.header.add_meta(key="FORMAT", items=[("ID", "GET"), ("Number", "1"), ("Type", "String"), ("Description", "Type of variation compared to the ancestor sample.")])
    vcf_out.header.add_meta(key="FORMAT", items=[("ID", "SH"), ("Number", "1"), ("Type", "Integer"), ("Description", "Number of samples sharing the same variant as the considered sample.")])
    vcf_out.header.add_meta(key="FORMAT", items=[("ID", "TP"), ("Number", "1"), ("Type", "String"), ("Description", "Type of variant.")])
    vcf_out.header.add_meta(key="INFO", items=[("ID", "SH"), ("Number", "R"), ("Type", "Integer"), ("Description", "Number of samples sharing the same allele.")])

    # Necessary to allow setting new tags in the variant records and record samples
    vcf_in.header.add_meta(key="FORMAT", items=[("ID", "GET"), ("Number", "1"), ("Type", "String"), ("Description", "Type of variation compared to the ancestor sample.")])
    vcf_in.header.add_meta(key="FORMAT", items=[("ID", "SH"), ("Number", "1"), ("Type", "Integer"), ("Description", "Number of samples sharing the same variant as the considered sample.")])
    vcf_in.header.add_meta(key="FORMAT", items=[("ID", "TP"), ("Number", "1"), ("Type", "String"), ("Description", "Type of variant.")])
    vcf_in.header.add_meta(key="INFO", items=[("ID", "SH"), ("Number", "R"), ("Type", "Integer"), ("Description", "Number of samples sharing the same allele (ref or alt).")])

    # DEBUG : check if header was modified indeed
    print(vcf_out.header)

    ancestor = "ancestor"

    # Rank of each TP code, used by the GET decision table:
    # 0 = unusable (uncalled or <NON_REF>), 1 = one allele, 2 = two alleles, 3 = three or more.
    typeDict = {"UC":0, "AU":0, "BU":0, "MU":0, "RF":1, "AS":1, "AI": 1, "BI":2, "BS":2, "MI":3, "MS":3}

    # Sample names never change between records, so read them from the header once.
    sample_names = list(vcf_out.header.samples)

    # For each record in the input VCF
    for n, rec in enumerate(vcf_in):
        if n % 1000000 == 0:
            print("Elapsed records: {}".format(n))

        cAlleles = {k: 0 for k in rec.alleles}  # allele -> number of samples carrying it (INFO/SH)
        allAlleles = {}  # sample name -> tuple of alleles detected in that sample

        # ---- Pass 1: detect alleles and set TP for every sample ----
        for k, sm in rec.samples.items():
            cCurrentAlleles = []

            try:  # Any failure here (no AD, missing depth values, zero depth) -> fallback below
                called = sm.alleles
                for al, dp in zip(rec.alleles, sm["AD"]):
                    try:
                        freq = dp / sm["DP"]
                    except Exception:  # DP missing or zero: use the summed allelic depths instead
                        freq = dp / sum(sm["AD"])

                    # Keep the allele if frequent enough or if it is in the called genotype
                    if (freq >= 0.1 or al in called) and al not in cCurrentAlleles:
                        cCurrentAlleles.append(al)
            except Exception:
                try:  # Fallback: trust the genotype call, but only when it is homozygous
                    called_set = set(sm.alleles)
                    if len(called_set) == 1:
                        al = called_set.pop()
                        if al not in cCurrentAlleles:
                            cCurrentAlleles.append(al)
                except Exception:
                    print(rec)
                    print("Unexpected error:", sys.exc_info()[0])

            # Count this sample once for every allele it carries. Counting here, after
            # detection, is what makes INFO/SH non-zero on the AD path and avoids
            # double counting when the AD path fails half-way and the fallback runs.
            for al in cCurrentAlleles:
                cAlleles[al] = cAlleles.get(al, 0) + 1

            found = tuple(cCurrentAlleles)  # immutable: safe to compare and to use as a dict key
            allAlleles[k] = found

            # TP code = <A|B|M> (one / two / three-or-more alleles) + <S|I|U> (SNV / indel /
            # unknown because of <NON_REF>), or "RF" (reference only) / "UC" (uncalled).
            if len(found) == 0 or None in sm.alleles:
                sm["TP"] = "UC"
            elif len(found) == 1:
                only = found[0]
                if only == rec.ref:
                    sm["TP"] = "RF"
                elif len(only) == 1 and only != "*":
                    sm["TP"] = "AS"
                elif only == "<NON_REF>":
                    sm["TP"] = "AU"
                else:
                    sm["TP"] = "AI"
            else:
                prefix = "B" if len(found) == 2 else "M"
                if all(len(x) == 1 for x in found) and "*" not in found:
                    sm["TP"] = prefix + "S"
                elif "<NON_REF>" in found:
                    sm["TP"] = prefix + "U"
                else:
                    sm["TP"] = prefix + "I"

        # ---- Pass 2: compare every non-ancestor sample with the ancestor (GET) ----
        # A / O are the TP ranks of the ancestor / offspring (see typeDict):
        #   U      either side unusable
        #   0 / A  same single allele (reference / non-reference)
        #   S1     two different single alleles
        #   N      offspring has two alleles, one shared with a single-allele ancestor
        #   L      offspring has a single allele that the multi-allele ancestor also has
        #   M / C  both have two alleles: one shared / identical
        #   I / D  offspring has more / fewer alleles than the ancestor, with overlap
        #   S2-S4  suspect combinations (no or too little overlap)
        AncAl = allAlleles[ancestor]
        anc_set = set(AncAl)
        A = typeDict[rec.samples[ancestor]["TP"]]
        for k, sm in rec.samples.items():
            if sm.name == ancestor:
                continue

            OffAl = allAlleles[k]
            Off = sm["TP"]
            O = typeDict[Off]

            off_set = set(OffAl)
            NumShared = len(anc_set & off_set)
            # True only when both samples carry exactly the same alleles. Testing just
            # "ancestor alleles are all in the offspring" would label an offspring with
            # extra alleles as conserved ("C").
            AllShared = anc_set == off_set
            NoneShared = NumShared == 0

            if A == 0 or O == 0:
                sm["GET"] = "U"
            elif A == 1 and O == 1:
                if AllShared:
                    sm["GET"] = "0" if Off == "RF" else "A"
                else:
                    sm["GET"] = "S1"
            elif A == 1 and O == 2:
                sm["GET"] = "S4" if NoneShared else "N"
            elif A == 1 and O == 3:
                sm["GET"] = "S4"
            elif A == 2 and O == 1:
                sm["GET"] = "S3" if NoneShared else "L"
            elif A == 2 and O == 2:
                if NoneShared:
                    sm["GET"] = "S2"
                elif NumShared == 1:
                    sm["GET"] = "M"
                else:
                    sm["GET"] = "C"
            elif A == 2 and O == 3:
                # Fewer than two shared alleles would need more than two events: suspect
                sm["GET"] = "I" if NumShared >= 2 else "S2"
            elif A == 3 and O == 1:
                sm["GET"] = "S3" if NoneShared else "L"
            elif A == 3 and O == 2:
                sm["GET"] = "S3" if NoneShared else "D"
            elif A == 3 and O == 3:
                if NoneShared:
                    sm["GET"] = "S3"
                elif AllShared:
                    sm["GET"] = "C"
                elif len(AncAl) > len(OffAl):
                    sm["GET"] = "D"
                else:
                    sm["GET"] = "I"
            else:  # Should not come here: every (A, O) pair is handled above
                print(n, allAlleles)
                print(k, rec)
                print("----")

        # ---- Pass 3: sharedness ----
        # Samples of the header that are absent from the record count as "no alleles"
        for sample in sample_names:
            allAlleles.setdefault(sample, ())

        # FORMAT/SH: number of other samples with the same allele tuple (minus itself)
        tuple_counts = Counter(allAlleles.values())
        for k, sm in rec.samples.items():
            sm["SH"] = tuple_counts[allAlleles[k]] - 1

        # INFO/SH: one count per record allele, in record order (Number=R)
        rec.info["SH"] = [cAlleles[al] for al in rec.alleles]

        # Write out the record; report (but survive) a record that cannot be written
        try:
            vcf_out.write(rec)
        except Exception as err:
            print("Here: ", n)
            print(rec, "\n")
            print("Unexpected error:", err)

##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele not already represented at this location by REF and ALT">
##FILTER=<ID=LowQual,Description="Low quality">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another; will always be heterozygous and is not intended to describe called alleles">
##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (b

Elapsed records: 67000000
Elapsed records: 68000000
Elapsed records: 69000000
Elapsed records: 70000000
Elapsed records: 71000000
Elapsed records: 72000000
Elapsed records: 73000000
Elapsed records: 74000000
Elapsed records: 75000000
Elapsed records: 76000000
Elapsed records: 77000000
Elapsed records: 78000000
Elapsed records: 79000000
Elapsed records: 80000000
Elapsed records: 81000000
Elapsed records: 82000000
Elapsed records: 83000000
Elapsed records: 84000000
Elapsed records: 85000000
Elapsed records: 86000000
Elapsed records: 87000000
Elapsed records: 88000000
Elapsed records: 89000000
Elapsed records: 90000000
Elapsed records: 91000000
Elapsed records: 92000000
Elapsed records: 93000000
Elapsed records: 94000000
Elapsed records: 95000000
Elapsed records: 96000000
Elapsed records: 97000000
Elapsed records: 98000000
Elapsed records: 99000000
Elapsed records: 100000000
CPU times: user 18h 46min 14s, sys: 5min 1s, total: 18h 51min 16s
Wall time: 18h 52min 25s


# Genome Repair experiment

## Create a new VCF annotated with the SH, TP and GET tags

In [7]:
# Paths for the genome-repair annotation run: `in_vcf` is the VCF that gets read,
# `out_vcf` is the annotated copy that gets written next to it.
# NOTE(review): absolute, machine-specific paths -- adjust before running elsewhere.
in_vcf = "/media/urbe/MyADrive1/Antoine/19-11-21_VariantCalling_ARC/jointgenotyping/merged_allsites.vcf"
out_vcf = "/media/urbe/MyADrive1/Antoine/19-11-21_VariantCalling_ARC/jointgenotyping/merged_allsites.GETS.vcf"

In [8]:
%%time

vcf_in = VariantFile(in_vcf)  # auto-detect input format
vcf_out = VariantFile(out_vcf, 'w', header=vcf_in.header)

# For writing out in a new VCF file
vcf_out.header.add_meta(key="FORMAT", items=[("ID", "GET"), ("Number", "1"), ("Type", "String"), ("Description", "Type of variation compared to the ancestor sample.")])
vcf_out.header.add_meta(key="FORMAT", items=[("ID", "SH"), ("Number", "1"), ("Type", "Integer"), ("Description", "Number of samples sharing the same variant as the considered sample.")])
vcf_out.header.add_meta(key="FORMAT", items=[("ID", "TP"), ("Number", "1"), ("Type", "String"), ("Description", "Type of variant.")])
vcf_out.header.add_meta(key="INFO", items=[("ID", "SH"), ("Number", "R"), ("Type", "Integer"), ("Description", "Number of samples sharing the same allele.")])

# Necessary to allow setting new tags in the variant records and record samples
vcf_in.header.add_meta(key="FORMAT", items=[("ID", "GET"), ("Number", "1"), ("Type", "String"), ("Description", "Type of variation compared to the ancestor sample.")])
vcf_in.header.add_meta(key="FORMAT", items=[("ID", "SH"), ("Number", "1"), ("Type", "Integer"), ("Description", "Number of samples sharing the same variant as the considered sample.")])
vcf_in.header.add_meta(key="FORMAT", items=[("ID", "TP"), ("Number", "1"), ("Type", "String"), ("Description", "Type of variant.")])
vcf_in.header.add_meta(key="INFO", items=[("ID", "SH"), ("Number", "R"), ("Type", "Integer"), ("Description", "Number of samples sharing the same allele (ref or alt).")])

# DEBUG : check if header was modified indeed
print(vcf_out.header)

ancestor = "ancestor"

# For record in sample
for n, rec in enumerate(vcf_in) :
    if n % 1000000 == 0 :
        print("Elapsed records: {}".format(n))
    
    # DEBUG
    """
    if rec.pos == 11196 :
        print(rec)
    """
    
    cAlleles = {k:0 for k in rec.alleles} # For INFO SH
    allAlleles = {} # Will store tuples for each sample for later counting
    
    # For sample in record
    for k, sm in rec.samples.items() :
        cCurrentAlleles = []
            
        # DEBUG
        """
        if rec.pos == 11196 :
            print(k, cCurrentAlleles)
        """
           
        try : # In case no AD -> go to except
            for al, dp in zip(rec.alleles, sm["AD"]) :
                # Check each allele if 0.1 < AF < 0.9 or if called by GATK --> add to the cAlleles and cCurrentAlleles
                freq = 0.0
                try :
                    freq = dp/sm["DP"]
                except :
                    freq = dp/sum(sm["AD"])

                if freq >= 0.1 or al in sm.alleles :
                    if al not in cAlleles :
                        cAlleles[al] += 1
                    if al not in cCurrentAlleles :
                        cCurrentAlleles.append(al)
        except :
            try : # In case no DP --> except
                if len(set(sm.alleles)) == 1 : # If is homozygous
                    al = set(sm.alleles).pop()
                    if al not in cAlleles :
                        cAlleles[al] = 1
                    else :
                        cAlleles[al] += 1
                    if al not in cCurrentAlleles :
                        cCurrentAlleles.append(al)
            except :
                print(rec)
                print("Unexpected error:", sys.exc_info()[0])
                #break
                pass

        allAlleles[k] = tuple(cCurrentAlleles) # Create a tuple (unchangeable and unsortable --> less mistakes)
        # Tuple contains all alleles found in the sample (with 0.1 < AF < 0.9 OR called by GATK)
        
        # DEBUG
        """
        if rec.pos == 54641 :
            print(k, allAlleles[k])
        """
        
        # Classify the sample (FORMAT/TP) from the alleles retained for it.
        # NOTE(review): "SNV" is decided from allele length alone, so a one-base
        # allele at a site with a longer REF is also labelled S - confirm intent.
        tpAlleles = allAlleles[k]
        if not tpAlleles or None in sm.alleles :
            # TYPE = Uncalled: nothing retained, or the genotype has a missing allele
            sm["TP"] = "UC"
        elif len(tpAlleles) == 1 and tpAlleles[0] == rec.ref :
            # TYPE = Hom Ref: the only retained allele is the reference
            sm["TP"] = "RF"
        else :
            # First letter = number of retained alleles:
            #   A (one, alternative), B (two, bi-allelic), M (more, multi-allelic)
            tpPrefix = {1: "A", 2: "B"}.get(len(tpAlleles), "M")
            # Second letter = kind of alleles:
            #   S (all one character long and no "*"), U (<NON_REF> involved),
            #   I (anything else, i.e. INDEL)
            if "*" not in tpAlleles and all(len(x) == 1 for x in tpAlleles) :
                tpSuffix = "S"
            elif "<NON_REF>" in tpAlleles :
                tpSuffix = "U"
            else :
                tpSuffix = "I"
            sm["TP"] = tpPrefix + tpSuffix
    
    # DEBUG
    """
    try :
        Anc = rec.samples[ancestor]["TP"]
    except :
        print("NO TP")
        print(allAlleles)
        print(n, rec)

    try :
        AncAl = allAlleles[ancestor]
    except :
        print("NO KEYS")
        print(allAlleles)
        print(n, rec)
    """
    
    # DEBUG
    """
    if rec.pos == 1603103 :
        print(allAlleles)
        print(n, rec)
    """
    
    # Add tag GET to each sample: compare every offspring with the ancestor sample
    # using the TP class and the retained alleles computed for this record.
    # typeDict collapses TP into an allele-count class:
    #   0 = unusable (uncalled or <NON_REF> involved), 1 = one allele,
    #   2 = two alleles, 3 = more than two alleles.
    typeDict = {"UC":0, "AU":0, "BU":0, "MU":0, "RF":1, "AS":1, "AI": 1, "BI":2, "BS":2, "MI":3, "MS":3}
    AncAl = allAlleles[ancestor]
    Anc = rec.samples[ancestor]["TP"]
    A = typeDict[Anc]
    for k, sm in rec.samples.items() :
        if sm.name == ancestor :
            continue # The ancestor is the reference point and gets no GET tag

        OffAl = allAlleles[k]
        Off = sm["TP"]
        O = typeDict[Off]

        intersect = set(AncAl) & set(OffAl)
        NumShared = len(intersect)
        # True only when ancestor and offspring carry exactly the same alleles.
        # The former test (NumShared == len(set(AncAl))) was one-directional: a
        # multi-allelic offspring holding every ancestor allele plus extra ones was
        # tagged "C" instead of "I".
        AllShared = bool(set(AncAl) == set(OffAl))
        NoneShared = bool(NumShared == 0) # If no common alleles

        if A == 0 or O == 0 : # Either side unusable
            sm["GET"] = "U"
        elif A == 1 and O == 1 : # Both carry a single allele
            if not AllShared : # Different allele --> S1
                sm["GET"] = "S1"
            elif Off == "RF" : # Same allele, reference --> 0
                sm["GET"] = "0"
            else : # Same allele, not reference --> A
                sm["GET"] = "A"
        elif A == 1 and O == 2 : # Offspring heterozygous
            # No common allele --> S4 ; one shared (cannot be more) --> N
            sm["GET"] = "S4" if NoneShared else "N"
        elif A == 1 and O == 3 : # Offspring multi-allelic --> always suspect S4
            sm["GET"] = "S4"
        elif A == 2 and O == 1 : # Ancestor heterozygous and offspring homozygous
            # No allele shared --> suspect S3 ; one in common --> L
            sm["GET"] = "S3" if NoneShared else "L"
        elif A == 2 and O == 2 : # Both heterozygous
            if NoneShared : # No common allele --> S2
                sm["GET"] = "S2"
            elif NumShared == 1 : # One in common
                sm["GET"] = "M"
            else : # Two in common --> identical site
                sm["GET"] = "C"
        elif A == 2 and O == 3 : # Ancestor biallelic het and offspring multiallelic
            if NumShared < 2 : # 0 or 1 common allele --> more than 2 events : suspect
                sm["GET"] = "S2"
            else : # 2 common alleles --> Increased heterozygosity
                sm["GET"] = "I"
        elif A == 3 and O == 1 : # Ancestor multi-allelic and offspring homozygous
            # No common allele --> suspect S3 ; one shared --> lost heterozygosity
            sm["GET"] = "S3" if NoneShared else "L"
        elif A == 3 and O == 2 : # Ancestor multi-allelic and offspring bi-allelic
            # No common allele --> suspect S3 ; 1+ shared --> D
            sm["GET"] = "S3" if NoneShared else "D"
        elif A == 3 and O == 3 : # Both multi-allelic
            if NoneShared : # No common allele --> More than 2 events involved : suspect
                sm["GET"] = "S3"
            elif AllShared : # Same allele set on both sides
                sm["GET"] = "C"
            elif len(AncAl) > len(OffAl) : # More alleles in ancestor : Decreased Het
                sm["GET"] = "D"
            else : # Otherwise : Increased Het
                sm["GET"] = "I"
        else : # Should not come here: every class pair is handled above
            # DEBUG
            print(n, allAlleles)
            print(k, rec)
            print("----")
    
    # Add UNCALLED samples to the dictionary for later sharedness: any header sample
    # without an entry gets an empty allele tuple.
    for sample in vcf_out.header.samples :
        if sample not in allAlleles :
            allAlleles[sample] = ()

    # Tally once how many samples carry each distinct allele tuple, instead of
    # comparing every sample against every other sample for each record.
    shCounts = {}
    for shTuple in allAlleles.values() :
        shCounts[shTuple] = shCounts.get(shTuple, 0) + 1

    for k, sm in rec.samples.items() :
        # Number of OTHER samples with the same alleles tuple --> same variants found
        # at the same position (minus 1 because the sample does not count itself)
        sm["SH"] = shCounts[allAlleles[k]] - 1

    # INFO/SH: for each allele of the record, the number of samples in which it was
    # found (tallied in cAlleles). An allele found in no sample is reported as 0
    # rather than raising KeyError.
    sh = [cAlleles.get(al, 0) for al in rec.alleles]
    rec.info["SH"] = sh # Set the SH tag in the current record for later output to VCF
    
    # Write out the record. A record that cannot be written is reported (record
    # counter, record content and the error itself) and skipped so the run goes on.
    try :
        vcf_out.write(rec)
    except Exception :
        # Exception instead of a bare except: KeyboardInterrupt / SystemExit must be
        # able to stop the loop rather than being reported as a failed write.
        print("Here: ", n)
        print(rec, "\n")
        print("Unexpected error:", repr(sys.exc_info()[1]))
        
    # DEBUG
    """
    if rec.pos == 1603103 :
        print(rec)
    """
    
    # DEBUG
    """
    if len(rec.alleles) > 1 : # DEBUGGING
        try :
            out = "\n".join("Sample({}: {} {} {})".format(k, "/".join(str(x) for x in s["GT"]), s["SH"], s["TP"]) for k, s in rec.samples.items())
        except :
            print(rec)
    """

# Every record has been processed: release the input and output VCF handles.
vcf_in.close()
vcf_out.close()

##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele not already represented at this location by REF and ALT">
##FILTER=<ID=LowQual,Description="Low quality">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another; will always be heterozygous and is not intended to describe called alleles">
##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (b

Elapsed records: 66000000
Elapsed records: 67000000
Elapsed records: 68000000
Elapsed records: 69000000
Elapsed records: 70000000
Elapsed records: 71000000
Elapsed records: 72000000
Elapsed records: 73000000
Elapsed records: 74000000
Elapsed records: 75000000
Elapsed records: 76000000
Elapsed records: 77000000
Elapsed records: 78000000
Elapsed records: 79000000
Elapsed records: 80000000
Elapsed records: 81000000
Elapsed records: 82000000
Elapsed records: 83000000
Elapsed records: 84000000
Elapsed records: 85000000
Elapsed records: 86000000
Elapsed records: 87000000
Elapsed records: 88000000
Elapsed records: 89000000
Elapsed records: 90000000
Elapsed records: 91000000
Elapsed records: 92000000
Elapsed records: 93000000
Elapsed records: 94000000
Elapsed records: 95000000
Elapsed records: 96000000
Elapsed records: 97000000
Elapsed records: 98000000
Elapsed records: 99000000
Elapsed records: 100000000
CPU times: user 11h 25min 39s, sys: 3min 15s, total: 11h 28min 54s
Wall time: 11h 29min 2