In [60]:
import pandas as pd
import plotly.express as px
import plotly as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [5]:
# Load the summary table and discard its "Unnamed: 0" column
# (presumably an index saved along with the CSV -- confirm).
df = pd.read_csv("BR2TEST.csv").drop(columns=["Unnamed: 0"])

In [8]:
# Percentage of every variant class relative to the total number of variants.
for variant_class in ("SNV", "INDEL", "Substitution", "Insertion", "Deletion", "Others"):
    df[f"PCT_{variant_class}"] = df[f"VARIANT_{variant_class}"] * 100 / df["NB_total_variant"]

# Annotation families: for each one, the total column is the sum of the listed
# count columns, and every percentage is expressed against that total.
# Entry layout: (total column, {percentage column: count column}).
annotation_families = [
    ("TOTIMPAC", {
        "PCT_IMPACT_Modifier": "IMPACT_Modifier",
        "PCT_IMPACT_Low": "IMPACT_Low",
        "PCT_IMPACT_Moderate": "IMPACT_Moderate",
        "PCT_IMPACT_HIGH": "IMPACT_HIGH",
    }),
    ("TOTSIFT", {
        "PCT_SIFT_DELETARIOUS": "SIFT_deletarious",
        "PCT_SIFT_NOT-DELETARIOUS": "SIFT_not_deletarious",
    }),
    # Only the three classes listed here enter TOTPOLY.
    # The "PCT_POLYP_" prefix of the last column is kept exactly as spelled.
    ("TOTPOLY", {
        "PCT_POLY_benign": "POLYPHEN_benign",
        "PCT_POLY_possibly-damaging": "POLYPHEN_possibly_damaging",
        "PCT_POLYP_probably-damaging": "POLYPHEN_probably_damaging",
    }),
    ("TOTGERP", {
        "PCT_GERP_constrained": "GERP_constrained",
        "PCT_GERP_not-constrained": "GERP_not_constrained",
    }),
    ("TOTCADD", {
        "PCT_CADD_deletarious": "CADD_deletarious",
        "PCT_CADD_not-deletarious": "CADD_not_deletarious",
    }),
]

for total_column, pct_to_count in annotation_families:
    count_columns = list(pct_to_count.values())

    # Add the count columns left to right to build the family total.
    family_total = df[count_columns[0]]
    for count_column in count_columns[1:]:
        family_total = family_total + df[count_column]
    df[total_column] = family_total

    for pct_column, count_column in pct_to_count.items():
        df[pct_column] = df[count_column] * 100 / df[total_column]

In [10]:
# Expose the "Coverage" column under the name "Type".
df = df.rename(columns={"Coverage": "Type"})

In [14]:
# Display the total number of variants for every row of the table.
df["NB_total_variant"]

0    16489220
1    15711935
2    18639327
3    16483122
4    18639328
5    15708493
Name: NB_total_variant, dtype: int64

# Insight on the total number of variants

In [21]:
# Scatter of the total variant count per method; colour and symbol both encode "Type".
marker_style = dict(mode='markers', marker_line_width=2, marker_size=20, opacity=0.5)

fig = px.scatter(
    df,
    x="Method",
    y="NB_total_variant",
    color="Type",
    symbol="Type",
    title="NB total of variants for each methods and each type",
)
fig.update_traces(**marker_style)
fig.show()

The number of variants doesn't change much, which makes sense because the same VCFs are used and only a few variants were removed in the normalization step.

# Insight on the proportion of each TYPE OF VARIANT

In [55]:
# Slice labels for the variant-type pies, in the same order as the
# VARIANT_* count columns used for the slice values.
labels = [
    'SNV',
    'INDEL',
    'SUBSTITUTION',
    'INSERTION',
    'DELETION',
    'OTHERS',
]

In [56]:
# 2 x 3 grid of subplot specs, every cell of type 'domain'.
specs = [[{'type': 'domain'} for _col in range(3)] for _row in range(2)]

In [57]:
# Count columns that feed the variant-type pies.
variant_columns = ["VARIANT_SNV", "VARIANT_INDEL", "VARIANT_Substitution",
                   "VARIANT_Insertion", "VARIANT_Deletion", "VARIANT_Others"]

# For every (type, method) pair, take the first matching row and keep its
# counts as a plain list. Originals come first, then the normalized runs.
(val_bwaOG, val_vgbamOG, val_vggamOG,
 val_bwaNORM, val_vgbamnorm, val_vggamnorm) = (
    df.loc[(df["Method"] == method) & (df["Type"] == run_type), variant_columns].values[0].tolist()
    for run_type in ("original", "normalized")
    for method in ("BWA", "VGbam", "VGgam")
)


In [116]:
# One pie per (method, type) combination on a 2 x 3 grid:
# original runs on the first row, normalized runs on the second.
panel_names = ['BWA original', 'VGbam original', 'VGgam original',
               'BWA normalized', 'VGbam normalized', 'VGgam normalized']
panel_values = [val_bwaOG, val_vgbamOG, val_vggamOG,
                val_bwaNORM, val_vgbamnorm, val_vggamnorm]

fig = make_subplots(2, 3, specs=specs, subplot_titles=panel_names)

for panel_index, (panel_name, slice_values) in enumerate(zip(panel_names, panel_values)):
    grid_row, grid_col = divmod(panel_index, 3)
    fig.add_trace(
        go.Pie(labels=labels, values=slice_values, scalegroup='one', name=panel_name),
        grid_row + 1,
        grid_col + 1,
    )

# Move the six layout annotations vertically:
# y=0.58 for the first three, y=-0.05 for the last three.
for panel_index in range(len(panel_names)):
    title_y = 0.58 if panel_index < 3 else -0.05
    fig.layout.annotations[panel_index].update(y=title_y)

fig.update_layout(
    title_text='Variant type of each methods and type',
    title_x=0.45,
    autosize=False,
    width=1000,
    height=1000,
)
fig.show()



Here we can see that the differences observed previously were probably due to the fact that VGcall normalizes the VCF and FreeBayes doesn't.
The numbers are now more comparable.

# Insight on IMPACT

In [120]:
# Slice labels for the IMPACT pies, in the same order as the IMPACT_* count columns.
labels = [
    'Modifier',
    'Low',
    'Moderate',
    'High',
]
# 2 x 3 grid of subplot specs, every cell of type 'domain'.
specs = [[{'type': 'domain'} for _col in range(3)] for _row in range(2)]

In [121]:
# Count columns that feed the IMPACT pies.
impact_columns = ["IMPACT_Modifier", "IMPACT_Low", "IMPACT_Moderate", "IMPACT_HIGH"]

# For every (type, method) pair, take the first matching row and keep its
# counts as a plain list. Originals come first, then the normalized runs.
(val_bwaOG, val_vgbamOG, val_vggamOG,
 val_bwaNORM, val_vgbamnorm, val_vggamnorm) = (
    df.loc[(df["Method"] == method) & (df["Type"] == run_type), impact_columns].values[0].tolist()
    for run_type in ("original", "normalized")
    for method in ("BWA", "VGbam", "VGgam")
)


In [122]:
# One pie per (method, type) combination on a 2 x 3 grid:
# original runs on the first row, normalized runs on the second.
panel_names = ['BWA original', 'VGbam original', 'VGgam original',
               'BWA normalized', 'VGbam normalized', 'VGgam normalized']
panel_values = [val_bwaOG, val_vgbamOG, val_vggamOG,
                val_bwaNORM, val_vgbamnorm, val_vggamnorm]

fig = make_subplots(2, 3, specs=specs, subplot_titles=panel_names)

for panel_index, (panel_name, slice_values) in enumerate(zip(panel_names, panel_values)):
    grid_row, grid_col = divmod(panel_index, 3)
    fig.add_trace(
        go.Pie(labels=labels, values=slice_values, scalegroup='one', name=panel_name),
        grid_row + 1,
        grid_col + 1,
    )

# Move the six layout annotations vertically:
# y=0.58 for the first three, y=-0.05 for the last three.
for panel_index in range(len(panel_names)):
    title_y = 0.58 if panel_index < 3 else -0.05
    fig.layout.annotations[panel_index].update(y=title_y)

fig.update_layout(
    title_text='IMPACT of the variants for each methods and type',
    title_x=0.45,
    autosize=False,
    width=1000,
    height=1000,
)
fig.show()



IMPACT and the other metrics didn't change significantly. However, it seems important to normalize all the other VCFs in order to get the correct insight into which variant TYPE impacts our individuals the most.

# Insight on SIFT

In [123]:
# Show all column names of df: the raw count columns plus the derived PCT_* / TOT* columns.
df.columns

Index(['Sample', 'Method', 'Type', 'Age', 'NB_total_variant', 'VARIANT_SNV',
       'VARIANT_INDEL', 'VARIANT_Substitution', 'VARIANT_Insertion',
       'VARIANT_Deletion', 'VARIANT_Others', 'IMPACT_Modifier', 'IMPACT_Low',
       'IMPACT_Moderate', 'IMPACT_HIGH', 'SIFT_deletarious',
       'SIFT_not_deletarious', 'POLYPHEN_unknown', 'POLYPHEN_benign',
       'POLYPHEN_possibly_damaging', 'POLYPHEN_probably_damaging',
       'GERP_constrained', 'GERP_not_constrained', 'CADD_deletarious',
       'CADD_not_deletarious', 'PCT_SNV', 'PCT_INDEL', 'PCT_Substitution',
       'PCT_Insertion', 'PCT_Deletion', 'PCT_Others', 'TOTIMPAC',
       'PCT_IMPACT_Modifier', 'PCT_IMPACT_Low', 'PCT_IMPACT_Moderate',
       'PCT_IMPACT_HIGH', 'TOTSIFT', 'PCT_SIFT_DELETARIOUS',
       'PCT_SIFT_NOT-DELETARIOUS', 'TOTPOLY', 'PCT_POLY_benign',
       'PCT_POLY_possibly-damaging', 'PCT_POLYP_probably-damaging', 'TOTGERP',
       'PCT_GERP_constrained', 'PCT_GERP_not-constrained', 'TOTCADD',
       'PCT_CADD_del

In [124]:
# Slice labels for the SIFT pies, in the same order as the SIFT_* count columns.
labels = [
    'DELETARIOUS',
    'NOT DELETARIOUS',
]
# 2 x 3 grid of subplot specs, every cell of type 'domain'.
specs = [[{'type': 'domain'} for _col in range(3)] for _row in range(2)]

In [None]:
# NOTE(review): this cell carries no execution count in the saved notebook.
# It must run before the SIFT figure cell; otherwise the val_* names still
# hold whatever an earlier cell assigned to them.

# Count columns that feed the SIFT pies.
sift_columns = ["SIFT_deletarious", "SIFT_not_deletarious"]

# For every (type, method) pair, take the first matching row and keep its
# counts as a plain list. Originals come first, then the normalized runs.
(val_bwaOG, val_vgbamOG, val_vggamOG,
 val_bwaNORM, val_vgbamnorm, val_vggamnorm) = (
    df.loc[(df["Method"] == method) & (df["Type"] == run_type), sift_columns].values[0].tolist()
    for run_type in ("original", "normalized")
    for method in ("BWA", "VGbam", "VGgam")
)


In [125]:
# One pie per (method, type) combination on a 2 x 3 grid:
# original runs on the first row, normalized runs on the second.
panel_names = ['BWA original', 'VGbam original', 'VGgam original',
               'BWA normalized', 'VGbam normalized', 'VGgam normalized']
panel_values = [val_bwaOG, val_vgbamOG, val_vggamOG,
                val_bwaNORM, val_vgbamnorm, val_vggamnorm]

fig = make_subplots(2, 3, specs=specs, subplot_titles=panel_names)

for panel_index, (panel_name, slice_values) in enumerate(zip(panel_names, panel_values)):
    grid_row, grid_col = divmod(panel_index, 3)
    fig.add_trace(
        go.Pie(labels=labels, values=slice_values, scalegroup='one', name=panel_name),
        grid_row + 1,
        grid_col + 1,
    )

# Move the six layout annotations vertically:
# y=0.58 for the first three, y=-0.05 for the last three.
for panel_index in range(len(panel_names)):
    title_y = 0.58 if panel_index < 3 else -0.05
    fig.layout.annotations[panel_index].update(y=title_y)

fig.update_layout(
    title_text='SIFT of the variants for each methods and type',
    title_x=0.45,
    autosize=False,
    width=1000,
    height=1000,
)
fig.show()

