### CONFIGURATION

In [1]:
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
import numpy as np
import copy
import sys
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Hartwig imports
sys.path.append("/Users/peterpriestley/hmf/repos/scripts/analysisscripts") 
import analyseVCF as aVCF
import venn as vn

In [3]:
#CHROM SLICING
# Exclusive lower / upper bounds on the `chromFrac` column; filterByChromFrac keeps
# rows with minChromFrac < chromFrac < maxChromFrac.
# In the saved output further down, 1:100029126 has chromFrac 1.401319, so the value
# appears to be "chromosome number + fractional position" — TODO confirm in analyseVCF.
minChromFrac = 0#17.443476
maxChromFrac = 26#17.491846
# Alternative narrow slice, kept for reference:
#minChromFrac = 17.689695
#maxChromFrac = 17.73896

In [4]:

# COMBINED VCF CONFIG
# Sample id; the reference / tumor sample names are built from it with an 'R' / 'T' suffix.
VCF_SAMPLE = "CPCT02010277"
# NOTE(review): absolute, user-specific path — the notebook only runs on this machine as written.
VCF_PATH = "/Users/peterpriestley/hmf/analyses/MerckHMFPatientComparison/"
VCF_FILE_NAME = VCF_SAMPLE + "R_"+ VCF_SAMPLE + "T_merged_somatics_snpEff_dbSNP_Cosmicv76_melted.vcf"
# Maps a VCF sample column name to the prefix used for the per-caller DataFrame columns
# (the 'melted' prefix shows up as meltedallele / meltedAF / ... in the outputs below).
SAMPLE_NAMES = {'/sample/output/160526_HMFreg0049_FR10303439_FR10303440_CPCT02010277/somaticVariants/CPCT02010277R_CPCT02010277T/CPCT02010277R_CPCT02010277T_merged_somatics_snpEff_dbSNP_Cosmicv76':'melted'}
# TRUTH SET
# The truth VCF is read from the same directory; its tumor sample is loaded under the
# 'truth' prefix (truthallele / truthindelDiff / truthindelPos are renamed after loading).
VCF_SAMPLE_TRUTH = VCF_SAMPLE
SAMPLE_NAMES_TRUTH = {VCF_SAMPLE + 'T':'truth'}
VCF_PATH_TRUTH = VCF_PATH
VCF_FILE_NAME_TRUTH = VCF_SAMPLE + ".mutect.cutoff.ann.vcf"


### Functions

In [5]:
def filterByChromFrac(df, minFrac=None, maxFrac=None):
    """Return the rows of `df` whose `chromFrac` lies strictly between two bounds.

    Parameters
    ----------
    df : DataFrame with a numeric `chromFrac` column.
    minFrac, maxFrac : optional exclusive bounds. When omitted, the notebook-level
        `minChromFrac` / `maxChromFrac` settings are used, which is the original
        behaviour, so existing one-argument calls are unaffected.

    Returns
    -------
    A filtered DataFrame; the input is not modified.
    """
    lower = minChromFrac if minFrac is None else minFrac
    upper = maxChromFrac if maxFrac is None else maxFrac
    return df[(df.chromFrac > lower) & (df.chromFrac < upper)]

In [6]:
def calculateTruth(df,dfTruth):
    """Left-join the truth set onto `df` by index and flag each row as true/false positive.

    Parameters
    ----------
    df : DataFrame of called variants with a `variantType` column and, per caller,
        `<caller>allele` and `<caller>indelDiff` columns ('' meaning "not called").
    dfTruth : DataFrame sharing the same index, with `truth_allele` and
        `truth_indelDiff` columns. Overlapping column names get a `_Truth` suffix.

    Returns
    -------
    The merged DataFrame with three added boolean columns:
    `hasTP` (some caller matches the truth allele / indel diff),
    `hasFP` (some caller made a call that differs from the truth, including rows
    with no truth entry) and `Truth` (= hasTP and not hasFP).

    Only real caller columns are compared. `truth_allele` also ends in 'allele', and
    comparing the truth set with itself used to mark every row that merely had a
    truth entry as `hasTP`; columns ending in '_allele' are therefore skipped, the
    same rule calcuatePrecisionSensivityMatrix applies.
    """
    df = pd.merge(df,dfTruth,how='left', left_index=True,right_index=True,suffixes=('', '_Truth'))
    df['hasTP'] = False
    df['hasFP'] = False

    isIndel = df['variantType'] == 'INDEL'
    isSnp = df['variantType'] == 'SNP'
    hasTruthIndel = ~pd.isnull(df['truth_indelDiff'])

    for columnName in list(df):
        if not columnName.endswith('allele') or columnName.endswith('_allele'):
            continue
        callerAllele = df[columnName]
        callerIndelDiff = df[columnName[:-6]+'indelDiff']

        indelMatch = (callerIndelDiff == df['truth_indelDiff']) & hasTruthIndel & isIndel
        snpMatch = (callerAllele == df['truth_allele']) & isSnp
        df['hasTP'] = df['hasTP'] | indelMatch | snpMatch

        # NaN != value is True, so a call at a position absent from the truth set is an FP.
        indelMiss = (callerIndelDiff != df['truth_indelDiff']) & isIndel & (callerIndelDiff != '')
        snpMiss = (callerAllele != df['truth_allele']) & isSnp & (callerAllele != '')
        df['hasFP'] = df['hasFP'] | indelMiss | snpMiss

    df['Truth'] = df['hasTP'] & ~df['hasFP']
    return df

In [7]:
def calcuatePrecisionSensivityMatrix(df, truthDF=None):
    """Build a per-caller, per-variant-type precision / sensitivity table.

    Parameters
    ----------
    df : DataFrame with `variantType`, a boolean `Truth` column and one
        `<caller>allele` column per caller ('' meaning "not called").
    truthDF : optional truth-set DataFrame with a `variantType` column, used for the
        truth-set size. Defaults to the notebook-level `dfTruth`, which is what the
        one-argument form has always used.

    Returns
    -------
    DataFrame with columns variantType, caller, truthSet, truePositives,
    falsePositives, falseNegatives, precision, sensitivity, sorted by variantType
    then caller. An empty frame with those columns is returned when no caller has
    any call, instead of failing on the column assignment.
    """
    if truthDF is None:
        truthDF = dfTruth

    resultColumns = ['variantType','caller','truthSet','truePositives','falsePositives',
                     'falseNegatives','precision','sensitivity']
    outputdata = []
    for columnName in list(df):
        # Caller columns only: '<caller>allele', but not e.g. 'truth_allele'.
        if not columnName.endswith('allele') or columnName.endswith('_allele'):
            continue
        myCaller = columnName[:-6]
        called = df[df[columnName] != '']
        for variantType in called.variantType.unique():
            calledOfType = called[called['variantType'] == variantType]
            positives = len(calledOfType)
            if positives == 0:
                continue
            truePositives = len(calledOfType[calledOfType['Truth'] == True])
            truthSet = len(truthDF[truthDF['variantType'] == variantType])
            falseNegatives = truthSet - truePositives
            outputdata.append([variantType, myCaller, truthSet, truePositives,
                               positives - truePositives, falseNegatives,
                               round(truePositives/float(positives),4),
                               # max(..., 1) guards the division when the truth set has none of this type
                               round(truePositives/float(max(truthSet,1)),4)])

    outputDF = pd.DataFrame(outputdata, columns=resultColumns)
    return outputDF.sort_values(['variantType','caller'])

### Load VCFs and Prepare DataFrames

In [8]:
## LOAD TRUTH SET VCF
# Load the truth VCF, restrict it to the configured chromFrac slice, rename the
# 'truth*' caller columns to 'truth_*', keep only the columns needed for matching
# and index the result by (chrom, pos) so it can be joined onto the sample calls.
dfTruth = (
    filterByChromFrac(
        aVCF.loadVaraintsFromVCF(VCF_PATH_TRUTH, VCF_FILE_NAME_TRUTH,
                                 SAMPLE_NAMES_TRUTH, "Mix-in Truth Set", True)
    )
    .rename(columns={'truthallele':'truth_allele',
                     'truthindelDiff':'truth_indelDiff',
                     'truthindelPos':'truth_indelPos'})
    [['chrom','pos','variantType','ref','truth_allele','truth_indelDiff','truth_indelPos']]
    .set_index(['chrom','pos'])
)

reading vcf file: CPCT02010277.mutect.cutoff.ann.vcf
reading VCF File line: 100000
Number variants loaded: 109762


In [9]:
# LOAD SAMPLE VCF + match to truth set
# Same pipeline as the truth set: load, slice by chromFrac, index by (chrom, pos),
# then join the truth set on and derive the hasTP / hasFP / Truth flags.
dfProd = (
    aVCF.loadVaraintsFromVCF(VCF_PATH, VCF_FILE_NAME, SAMPLE_NAMES, VCF_SAMPLE, True)
    .pipe(filterByChromFrac)
    .set_index(['chrom', 'pos'])
    .pipe(calculateTruth, dfTruth)
)

reading vcf file: CPCT02010277R_CPCT02010277T_merged_somatics_snpEff_dbSNP_Cosmicv76_melted.vcf
reading VCF File line: 100000
reading VCF File line: 200000
Number variants loaded: 263910


In [10]:
# Flag variants whose VCF id field contains a dbSNP ('rs') or COSMIC ('COSM') identifier.
# NOTE(review): a missing id would yield NaN rather than False here — confirm ids are always strings.
variantId = dfProd['id'].str
dfProd['dbSNP'] = variantId.contains('rs')
dfProd['COSMIC'] = variantId.contains('COSM')

### SLICE ANALYSIS

### PRECISION + SENSITIVITY

In [11]:
#PRECISION + SENSITIVITY by caller - PROD
# One row per (variantType, caller); the truth-set size comes from the notebook-level dfTruth.
outputDF = calcuatePrecisionSensivityMatrix(dfProd)
outputDF

Unnamed: 0,variantType,caller,truthSet,truePositives,falsePositives,falseNegatives,precision,sensitivity
1,INDEL,melted,0,0,8120,0,0.0,0.0
0,SNP,melted,109762,109759,146031,3,0.4291,1.0


In [19]:
# SNPs whose venn segment includes mutect and that carry no dbSNP id.
# NOTE(review): this cell's execution count (19) is higher than the next cell's (17), so the
# saved outputs were produced out of order — re-run top to bottom to confirm they still hold.
# NOTE(review): this displays the full selection; consider .head() to keep the output small.
dfProd[(dfProd['variantType']== "SNP")&(dfProd['dbSNP']==False)&(dfProd['vennSegment'].str.contains('mutect'))]

Unnamed: 0_level_0,Unnamed: 1_level_0,chromPos,chromFrac,id,ref,vennSegment,numCallers,variantType,variantSubType,filter,meltedallele,meltedAF,meltedDP,meltedQS,meltedSGT,meltedindelDiff,meltedindelPos,patientName,variantType_Truth,ref_Truth,truth_allele,truth_indelDiff,truth_indelPos,hasTP,hasFP,Truth,dbSNP,COSMIC
chrom,pos,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
1,100029126,1:100029126,1.401319,.,A,filterInvarscan-mutect,1,SNP,,PASS,T,0.095238,42,-1.0,0/1,,-1,CPCT02010277,SNP,A,T,,-1.0,True,False,True,False,False
1,100029272,1:100029272,1.401320,.,C,mutect,1,SNP,,PASS,T,0.058824,69,-1.0,0/1,,-1,CPCT02010277,SNP,C,T,,-1.0,True,False,True,False,False
1,100061953,1:100061953,1.401451,.,G,filterInvarscan-mutect,1,SNP,,PASS,T,0.129032,30,-1.0,0/1,,-1,CPCT02010277,SNP,G,T,,-1.0,True,False,True,False,False
1,100132059,1:100132059,1.401732,.,C,mutect,1,SNP,,PASS,A,0.086957,23,-1.0,0/1,,-1,CPCT02010277,,,,,,False,True,False,False,False
1,10015481,1:10015481,1.040182,.,G,mutect,1,SNP,,PASS,A,0.085366,83,-1.0,0/1,,-1,CPCT02010277,SNP,G,A,,-1.0,True,False,True,False,False
1,100178808,1:100178808,1.401920,.,C,strelka-mutect,1,SNP,,PASS,A,0.304348,23,-1.0,0/1,,-1,CPCT02010277,SNP,C,A,,-1.0,True,False,True,False,False
1,10022852,1:10022852,1.040212,.,G,strelka-mutect,1,SNP,,PASS,A,0.087500,166,-1.0,0/1,,-1,CPCT02010277,SNP,G,A,,-1.0,True,False,True,False,False
1,100245809,1:100245809,1.402189,.,C,strelka-varscan-mutect,1,SNP,,PASS,A,0.541667,24,-1.0,0/1,,-1,CPCT02010277,SNP,C,A,,-1.0,True,False,True,False,False
1,100262420,1:100262420,1.402255,.,T,strelka-mutect,1,SNP,,PASS,G,0.086207,59,-1.0,0/1,,-1,CPCT02010277,SNP,T,G,,-1.0,True,False,True,False,False
1,100263486,1:100263486,1.402260,.,A,strelka-mutect,1,SNP,,PASS,G,0.070423,71,-1.0,0/1,,-1,CPCT02010277,SNP,A,G,,-1.0,True,False,True,False,False


In [17]:
# Count mutect-containing SNP calls per venn segment, split by Truth / dbSNP / COSMIC.
# reset_index() turns the (chrom, pos) index back into columns so 'pos' can be counted.
isMutectSnp = (dfProd['variantType'] == "SNP") & dfProd['vennSegment'].str.contains('mutect')
dftemp = dfProd[isMutectSnp].reset_index()
dftemp.pivot_table(values='pos', index=['vennSegment'],
                   columns=['Truth', 'dbSNP', 'COSMIC'], aggfunc='count')

Truth,False,False,False,False,True,True,True,True
dbSNP,False,False,True,True,False,False,True,True
COSMIC,False,True,False,True,False,True,False,True
vennSegment,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
filterInvarscan-freebayes-mutect,5.0,,4.0,,25.0,,,
filterInvarscan-mutect,380.0,,106.0,1.0,839.0,,13.0,
freebayes-mutect,25.0,,73.0,,121.0,,5.0,
mutect,18050.0,5.0,5398.0,15.0,24092.0,11.0,398.0,3.0
strelka-filterInvarscan-freebayes-mutect,3.0,,7.0,,90.0,,,
strelka-filterInvarscan-mutect,71.0,,16.0,,319.0,,3.0,
strelka-freebayes-mutect,27.0,,23.0,,1245.0,1.0,36.0,
strelka-mutect,4217.0,1.0,1346.0,2.0,27461.0,10.0,459.0,5.0
strelka-varscan-mutect,1509.0,1.0,693.0,,38516.0,13.0,676.0,2.0
varscan-freebayes-mutect,2.0,,12.0,,25.0,,,


In [None]:
# Allele frequency CDF by caller (SNPs only)
snpdf = dfProd[(dfProd.variantType == 'SNP')]
for columnName in list(snpdf):
    # Per-caller allele columns only. 'truth_allele' also ends in 'allele' but has no
    # matching 'truth_AF' column, so including it raised KeyError.
    if columnName.endswith('allele') and not columnName.endswith('_allele'):
        afColumn = columnName[:-6] + 'AF'
        ser = snpdf[(snpdf[columnName] != '') & (snpdf[afColumn] > 0.0)][afColumn]
        ser = ser.sort_values()
        # Empirical CDF: sorted AF values on the x axis against evenly spaced quantiles.
        cum_dist = np.linspace(0.,1.,len(ser))
        ser_cdf = pd.Series(cum_dist, index=ser,name=columnName[:-6]+": count="+str(ser.count())+" med="+str(round(ser.median(),2)))
        ser_cdf.plot(drawstyle='steps',legend=True,title=VCF_SAMPLE+" Allelic Frequency by Caller (AllelicFreq > 0.0)",figsize=[10,5])

In [None]:
#INTERNAL PRECISION + SENSITIVITY by caller - OLD
#dfTruth['Truth'] = (dfTruth['numCallers'] >= 2)
#outputDF = calcuatePrecisionSensivityMatrix(dfTruth,dfTruth)
#outputDF

In [None]:
# Quick look at the merged sample + truth frame.
dfProd.head(10)

In [None]:
# Ref -> alt substitution counts for SNPs with a single-base ref.
snpdf = dfProd[(dfProd.variantType == 'SNP')&(dfProd.ref.str.len() ==1)]
# 'pos' lives in dfProd's (chrom, pos) index; pivot_table can only count real columns,
# so the index is reset for the pivot (as the venn-segment pivot above does).
# NOTE(review): with the current SAMPLE_NAMES the only caller column is 'meltedallele';
# 'mutectallele' only exists when a per-caller VCF is loaded — confirm which is intended.
pd.pivot_table(snpdf.reset_index(), values='pos', index=['ref'], columns=['mutectallele'], aggfunc='count')

In [None]:
#dfTruth[(dfTruth['freebayesallele'] != '') & (dfTruth.variantType == 'SNP')]

In [None]:
#pd.options.display.max_rows = 200
#caller = "freebayes"
#dfTruth[(dfTruth[caller+'allele'] != '') & (dfTruth.variantType == 'SNP')][['ref',caller+'indelDiff',caller+'QS',caller+'AF','numCallers']]

In [None]:
#pd.options.display.max_rows = 15000
#caller = "varscan"
#dfProd[(True) &(dfProd[caller+'allele'] != '') & (dfProd.variantType == 'INDEL')][['Truth','ref',caller+'indelDiff',caller+'QS',caller+'AF','numCallers']]

## False Positive Analysis

In [None]:
# Independent copy of the BED object, passed to the unfiltered-VCF loader in the next cell.
# NOTE(review): `bed` is not defined in any visible cell — on a fresh kernel this raises
# NameError; confirm where it is meant to be loaded.
unfilteredBed = copy.deepcopy(bed)

In [None]:
# SETTINGS
myCaller = 'varscan'
UNFILTERED_VCF_PATH = "/Users/peterpriestley/hmf/70-30slice/"

# caller -> (unfiltered VCF file name, name of the tumor sample column in that VCF)
UNFILTERED_CALLER_CONFIG = {
    'varscan': ("varscan.snp.vcf", "TUMOR"),
    'freebayes': ("freebayes.somatic.vcf", 'CPCT11111111T'),
    'strelka': ("/strelka/results/all.somatic.snvs.vcf", "TUMOR"),
    'mutect': ("mutect.vcf", 'CPCT11111111T'),
}
# Fail loudly on an unknown caller: with the previous if/elif chain the names below were
# left unbound (or, worse, kept their values from an earlier run of this cell).
if myCaller not in UNFILTERED_CALLER_CONFIG:
    raise ValueError("Unsupported caller: " + repr(myCaller))
UNFILTERED_VCF_FILE_NAME, unfilteredSampleColumn = UNFILTERED_CALLER_CONFIG[myCaller]
UNFILTERED_SAMPLE_NAMES = {unfilteredSampleColumn: myCaller}


dfUnfiltered = aVCF.loadVaraintsFromVCF(UNFILTERED_VCF_PATH,UNFILTERED_VCF_FILE_NAME,UNFILTERED_SAMPLE_NAMES,VCF_SAMPLE,False,True,unfilteredBed)
# NOTE(review): the truth-set cell keeps only chrom/pos/variantType/ref/truth_* and moves
# chrom/pos into the index, so dfTruth['chromPos'] looks like it will raise KeyError on a
# fresh run — confirm which truth frame this cell expects.
dfUnfiltered['Truth']=dfUnfiltered.chromPos.isin(dfTruth['chromPos'])
# Same exclusive chromFrac slice as every other frame in this notebook.
dfUnfiltered = filterByChromFrac(dfUnfiltered)

In [None]:
# Flag truth variants that myCaller called in the unfiltered VCF, and those present in dfSlice.
# NOTE(review): `dfSlice` is not defined in any visible cell, and dfTruth as prepared above
# has no 'chromPos' column — this cell appears to rely on state from an older notebook version.
dfTruth['inUnfiltered'+myCaller]=dfTruth.chromPos.isin(dfUnfiltered[dfUnfiltered[myCaller+'allele'] != '']['chromPos'])
dfTruth['inSlice']=dfTruth.chromPos.isin(dfSlice['chromPos'])

In [None]:
#False Positives - pre-somatic
# Row counts of the truth set grouped by variant type and by whether myCaller called the
# position in the unfiltered VCF.
# NOTE(review): selects a 'pos' column, but the truth-set cell moves pos into the index — confirm.
dfTruth[['inSlice','pos','inUnfiltered'+myCaller,'variantType']].groupby(['variantType','inUnfiltered'+myCaller]).agg('count')


In [None]:
# Flag unfiltered calls whose position was also called by myCaller in dfSlice, and those
# present in dfSlice at all.
# NOTE(review): `dfSlice` is not defined in any visible cell — confirm its source.
dfUnfiltered['inSlice'+myCaller]=dfUnfiltered.chromPos.isin(dfSlice[dfSlice[myCaller+'allele'] != '']['chromPos'])
dfUnfiltered['inSlice']=dfUnfiltered.chromPos.isin(dfSlice['chromPos'])

In [None]:
# Row counts of unfiltered calls by variant type, in-slice flag, truth flag and VCF filter value.
dfUnfiltered[['inSlice'+myCaller,'pos','Truth','variantType','filter']].groupby(['variantType','inSlice'+myCaller,'Truth','filter']).agg('count')


In [None]:

# Positions of SNPs that are in the truth set and were called by myCaller in the slice.
# NOTE(review): 'PETE' looks like a placeholder filter value that matches nothing, i.e. the
# last condition is effectively disabled — confirm.
dfUnfiltered[(dfUnfiltered['Truth'] == True)&
            (dfUnfiltered['inSlice'+myCaller] == True)& 
             (dfUnfiltered.variantType == 'SNP') &
             (dfUnfiltered['filter'] != 'PETE')]['pos']

In [None]:
#FALSE NEGATIVES IN UNFILTERED
# Unfiltered SNP calls that did not make it into the slice for myCaller, excluding rows with
# somatic genotype '2' or filter 'str10', ordered by descending qual score.
# NOTE(review): the meaning of somaticGT '2' and filter 'str10' is not visible here — confirm.
dfUnfiltered[(dfUnfiltered['inSlice'+myCaller] == False)& 
             (dfUnfiltered[myCaller+'somaticGT'] != '2')&
             (dfUnfiltered.variantType == 'SNP') &
             (dfUnfiltered['filter'] != 'str10')].sort_values([myCaller+'QS'],ascending=False)

In [None]:
# First 10 slice rows after position 37577400.
# NOTE(review): pos is compared against a string, so this is a lexicographic comparison
# (e.g. '4' > '37577400'); cast to int if a numeric cut-off is intended — confirm.
# NOTE(review): `dfSlice` is not defined in any visible cell.
dfSlice[dfSlice.pos>'37577400'].head(10)

In [None]:
#FALSE POSITIVES
# SNPs called by myCaller in the slice that are absent from the truth set,
# ordered by descending qual score.
dfUnfiltered[(dfUnfiltered['inSlice'+myCaller] == True) & (dfUnfiltered.variantType == 'SNP') &
             (dfUnfiltered['Truth'] == False)].sort_values([myCaller+'QS'],ascending=False)

### SNP Venn

In [None]:
# Four-way venn of SNP positions called by each caller in the sample frame.
snpdf = dfProd[dfProd['variantType'] == 'SNP']
vn.venn(
    [
        snpdf.loc[snpdf['mutectallele'] != '', 'chromPos'],
        snpdf.loc[snpdf['strelkaallele'] != '', 'chromPos'],
        snpdf.loc[snpdf['freebayesallele'] != '', 'chromPos'],
        snpdf.loc[snpdf['varscanallele'] != '', 'chromPos'],
    ],
    ['mutect', 'strelka', 'freebayes', 'varscan'],
    figsize=(6, 6)
)

In [None]:
# Four-way venn of SNP positions per caller in the truth frame.
# The original filtered dfProd with a boolean mask built from dfTruth; the two frames have
# different rows, so the mask cannot be applied. The frame and its mask now both come from
# dfTruth, mirroring the INDEL truth venn cell below.
# A separate name is used so the dfProd-based `snpdf` (which later cells read, including its
# Truth column) is not replaced.
# NOTE(review): dfTruth as prepared above only has truth_* columns; the per-caller
# '<caller>allele' and 'chromPos' columns exist only for a multi-caller truth frame — confirm.
snpdfTruth = dfTruth[(dfTruth.variantType == 'SNP')]
vn.venn([snpdfTruth[snpdfTruth.mutectallele != '']['chromPos'], \
         snpdfTruth[snpdfTruth.strelkaallele != '']['chromPos'], \
        snpdfTruth[snpdfTruth.freebayesallele != '']['chromPos'], \
        snpdfTruth[snpdfTruth.varscanallele != '']['chromPos'] \
        ],['mutect','strelka','freebayes','varscan'],figsize=(6,6))

### Indel Venn

In [None]:
# Three-way venn of INDEL positions called by each caller in the sample frame.
indeldf = dfProd[dfProd['variantType'] == 'INDEL']
vn.venn(
    [
        indeldf.loc[indeldf['strelkaallele'] != '', 'chromPos'],
        indeldf.loc[indeldf['freebayesallele'] != '', 'chromPos'],
        indeldf.loc[indeldf['varscanallele'] != '', 'chromPos'],
    ],
    ['strelka', 'freebayes', 'varscan'],
    figsize=(6, 6)
)

In [None]:
# Three-way venn of INDEL positions per caller in the truth frame.
indeldf = dfTruth[dfTruth['variantType'] == 'INDEL']
vn.venn(
    [
        indeldf.loc[indeldf['strelkaallele'] != '', 'chromPos'],
        indeldf.loc[indeldf['freebayesallele'] != '', 'chromPos'],
        indeldf.loc[indeldf['varscanallele'] != '', 'chromPos'],
    ],
    ['strelka', 'freebayes', 'varscan'],
    figsize=(6, 6)
)

### Read Depth

In [None]:
# One histogram of each '<caller>allelicFreq' column, split by the Truth flag.
# NOTE(review): `df` is not defined in any visible cell, and the frames built above name the
# frequency column '<caller>AF', not '<caller>allelicFreq' — this cell looks stale; confirm.
for columnName in list(df):
    if columnName.endswith('allelicFreq'):
        df[df[columnName] != ''].hist(column=columnName,by="Truth",bins=40,figsize=(10,5))
        

In [None]:
# Histogram of freebayes allelic frequency, split by Truth, for rows with a freebayes read depth.
# NOTE(review): `df` and the 'freebayesreadDepth' / 'freebayesallelicFreq' columns are not
# produced by any visible cell — confirm.
df[df.freebayesreadDepth != ''].hist(column="freebayesallelicFreq",by="Truth",bins=30)

## Qual Score

In [None]:
# QUAL SCORE CDF BY CALLER BY TRUTH (INDELs first, then SNPs, drawn onto the same axes)
caller = 'strelka'


def _plotQualScoreCdf(frame, caller, label, title):
    """Plot one empirical CDF of `<caller>QS` per distinct value of `frame.Truth`.

    frame  : DataFrame with `Truth`, `<caller>allele` and `<caller>QS` columns.
    caller : column prefix of the caller to plot.
    label  : variant-type tag used in the legend entry ("INDEL" / "SNP").
    title  : plot title.
    Only rows where the caller made a call ('<caller>allele' != '') are included.
    """
    for truth in frame.Truth.unique():
        ser = frame[(frame.Truth == truth) & (frame[caller+'allele'] != '')][caller + 'QS']
        ser = ser.sort_values()
        # Sorted scores on the x axis against evenly spaced quantiles on the y axis.
        cum_dist = np.linspace(0.,1.,len(ser))
        ser_cdf = pd.Series(cum_dist, index=ser,name=caller+" "+label+" "+str(truth)+": c="+str(ser.count())+" m="+str(round(ser.median(),2)))
        ser_cdf.plot(drawstyle='steps',legend=True,title=title,figsize=[15,6],xlim=[0,100],ylim=[0,1])


# NOTE(review): relies on `indeldf` / `snpdf` from earlier cells carrying a Truth column,
# i.e. on the dfProd-based versions — confirm after a fresh top-to-bottom run.
_plotQualScoreCdf(indeldf, caller, "INDEL", caller+" Qual Score CDF - INDELS")
# The SNP title previously lacked the space after the caller name ("strelkaQual Score CDF").
_plotQualScoreCdf(snpdf, caller, "SNP", caller+" Qual Score CDF")

### Allelic Depth

In [None]:
# Allele frequency CDF by caller (title includes the sample id)
for columnName in list(snpdf):
    # Per-caller allele columns only. 'truth_allele' also ends in 'allele' but has no
    # matching 'truth_AF' column, so including it raised KeyError.
    if columnName.endswith('allele') and not columnName.endswith('_allele'):
        afColumn = columnName[:-6] + 'AF'
        ser = snpdf[(snpdf[columnName] != '') & (snpdf[afColumn] > 0.0)][afColumn]
        ser = ser.sort_values()
        # Empirical CDF: sorted AF values on the x axis against evenly spaced quantiles.
        cum_dist = np.linspace(0.,1.,len(ser))
        ser_cdf = pd.Series(cum_dist, index=ser,name=columnName[:-6]+": count="+str(ser.count())+" med="+str(round(ser.median(),2)))
        ser_cdf.plot(drawstyle='steps',legend=True,title=VCF_SAMPLE+" Allelic Frequency by Caller (AllelicFreq > 0.0)",figsize=[10,5])

In [None]:
# Allele frequency CDF by caller (wide figure, short legend labels)
for columnName in list(snpdf):
    # Per-caller allele columns only. 'truth_allele' also ends in 'allele' but has no
    # matching 'truth_AF' column, so including it raised KeyError.
    if columnName.endswith('allele') and not columnName.endswith('_allele'):
        afColumn = columnName[:-6] + 'AF'
        ser = snpdf[(snpdf[columnName] != '') & (snpdf[afColumn] > 0.0)][afColumn]
        ser = ser.sort_values()
        # Empirical CDF: sorted AF values on the x axis against evenly spaced quantiles.
        cum_dist = np.linspace(0.,1.,len(ser))
        ser_cdf = pd.Series(cum_dist, index=ser,name=columnName[:-6]+": c="+str(ser.count())+" m="+str(round(ser.median(),2)))
        ser_cdf.plot(drawstyle='steps',legend=True,title=" Allelic Frequency by Caller (AllelicFreq > 0.0)",figsize=[15,6])

In [None]:
# ALLELE FREQUENCY CDF BY CALLER BY TRUTH
truths = snpdf.Truth.unique()
for truth in truths:
    for columnName in list(snpdf):
        # Per-caller allele columns only; 'truth_allele' has no frequency column.
        if columnName.endswith('allele') and not columnName.endswith('_allele'):
            # The frames built in this notebook name the frequency column '<caller>AF'
            # (the other CDF cells use it too); the old '<caller>allelicFreq' name raised KeyError.
            ser = snpdf[(snpdf.Truth == truth) &(snpdf[columnName] != '')][columnName[:-6] + 'AF']
            ser = ser.sort_values()
            # Empirical CDF: sorted AF values on the x axis against evenly spaced quantiles.
            cum_dist = np.linspace(0.,1.,len(ser))
            ser_cdf = pd.Series(cum_dist, index=ser,name=columnName[:-6]+" "+str(truth)+": c="+str(ser.count())+" m="+str(round(ser.median(),2)))
            ser_cdf.plot(drawstyle='steps',legend=True,title=" Allelic Frequency by Caller and Truth",figsize=[15,6])

In [None]:
#  BY VENN SEGMENT - Single Caller - FALSE POSITIVES
# For each venn segment containing false-positive calls by myCaller, plot the CDF of that
# caller's allelic frequency.
# NOTE(review): with the current SAMPLE_NAMES the frame only has 'melted*' columns and names
# the frequency column '<caller>AF'; 'freebayesallele' / 'freebayesallelicFreq' look stale — confirm.
myCaller = 'freebayes'
vennSegments = snpdf[(snpdf[myCaller+'allele'] != '') & (snpdf.Truth == False)].vennSegment.unique()
for vennSegment in vennSegments:
    ser = snpdf[(snpdf.vennSegment == vennSegment) & (snpdf[myCaller+'allele'] != '') & (snpdf.Truth == False) ][myCaller+'allelicFreq']
    ser = ser.sort_values()
    # Appends a copy of the largest value; the sibling CDF cells have this line commented out.
    # NOTE(review): the series is indexed by (chrom, pos), so assigning at an integer key may
    # not behave as intended — confirm.
    ser[len(ser)] = ser.iloc[-1]
    cum_dist = np.linspace(0.,1.,len(ser))
    ser_cdf = pd.Series(cum_dist, index=ser,name=vennSegment+": c="+str(ser.count())+" m="+str(round(ser.median(),2)))
    ser_cdf.plot(drawstyle='steps',legend=True,title=myCaller+" FP by Venn Segment",figsize=[15,6])

### Allelic Depth Scatter Plot

In [None]:
# Scatter of two callers' allelic frequencies for variants called by both.
# NOTE(review): `df` is not defined in any visible cell and the column suffix 'allelicFreq'
# does not match the '<caller>AF' columns built above — confirm.
caller1 = 'varscan'
caller2 = 'strelka'
tempdf = df[(df[caller1+'allele']!='')&(df[caller2+'allele']!='')]
# Not the last expression of the cell, so this head() is not displayed.
tempdf.head()
tempdf.plot.scatter(caller1+'allelicFreq',caller2+'allelicFreq' \
                        ,figsize=[6,6] \
                        ,title="Comparison of "+caller1+" and "+caller2+" Allelic Frequency" \
                        ,xlim=[0,1],ylim=[0,1])

### RAINBOW CHARTS

In [None]:
## RAINBOW
# Allelic frequency against chromFrac for freebayes calls on chromosome 22 with Truth == True.
# NOTE(review): `df` is not defined in any visible cell; it also needs 'chrom' as a column,
# whereas the frames built above move chrom into the index — confirm.
caller = 'freebayes'
plt.scatter(df[(df[caller+'allele']!='')& (df['Truth'] == True)&(df['chrom'] == '22')]['chromFrac'], \
            df[(df[caller+'allele']!='')& (df['Truth'] == True)&(df['chrom'] == '22')][caller+'allelicFreq'], \
            s=10, c='b', marker="s")

In [None]:
## RAINBOW
# Allelic frequency against chromFrac for freebayes calls on chromosome 22 with Truth == False.
# NOTE(review): `df` is not defined in any visible cell; it also needs 'chrom' as a column,
# whereas the frames built above move chrom into the index — confirm.
caller = 'freebayes'
plt.scatter(df[(df[caller+'allele']!='')& (df['Truth'] == False)&(df['chrom'] == '22')]['chromFrac'], \
            df[(df[caller+'allele']!='')& (df['Truth'] == False)&(df['chrom'] == '22')][caller+'allelicFreq'], \
            s=10, c='b', marker="s")

### Overlapping Analysis

In [None]:
#NEEDS REFACTORING TO SUPPORT NEW DF FORMAT
# Disabled: the code below is wrapped in a string literal, so the cell evaluates to a string
# and nothing is executed.
# It uses `df`, which is not defined in any visible cell, and DataFrame.sort, whereas the
# rest of this notebook uses sort_values.
'''
sorteddf = df.sort(['chromFrac'], ascending=1)
sorteddf['nextRef'] = sorteddf['ref'].shift(-2)
sorteddf['nextPos'] = sorteddf['pos'].shift(-2)
sorteddf['nextChrom'] = sorteddf['chrom'].shift(-2)
sorteddf['nextVT'] = sorteddf['variantType'].shift(-2)
sorteddf['nextAlleleTumor1'] = sorteddf['alleleTumor1'].shift(-2)
sorteddf['nextAlleleTumor2'] = sorteddf['alleleTumor2'].shift(-2)
sorteddf['nextVennSegment'] = sorteddf['vennSegment'].shift(-2)
sorteddf['lengthRef'] = sorteddf['ref'].str.len()
sorteddf['Dist2Next'] = -sorteddf['pos'].astype(int).diff(-2)

overlapdf = sorteddf[(sorteddf['lengthRef']+0>sorteddf['Dist2Next']) & (sorteddf['Dist2Next']>0)]
overlapdf.head(20)
'''

In [None]:
#NEEDS REFACTORING TO SUPPORT NEW DF FORMAT
# Disabled: the code below is wrapped in a string literal, so nothing is executed.
# It reads `overlapdf`, which is only assigned inside the preceding disabled cell.
'''
filtereddf = overlapdf[(~overlapdf.vennSegment.isin(['freebayes','varscan','strelka1'])) \
                       & (overlapdf.vennSegment.str.contains('varscan')) \
                       #& (~overlapdf.nextVennSegment.str.contains('varscan')) \
                       & (overlapdf.variantType == 'INDEL') \
                       #& (overlapdf.variantSubType == 'DELETE')
                       & (overlapdf.variantType == overlapdf.nextVT)]
filtereddf[['nextVennSegment','vennSegment','variantType','nextVT','Dist2Next','lengthRef','ref','alleleTumor2', \
            'nextRef','nextAlleleTumor2','variantSubType','chrom','pos']].count()
'''