In [1]:
# Bioinformatics
import pysam
from pysam import VariantFile

# Data Analysis
import pandas as pd
import numpy as np
import math
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer

# General
import sys
import os
import re

# Plotting libraries
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from matplotlib.lines import Line2D

mpl.rcParams['figure.dpi'] = 300

In [2]:
# Per-condition RGBA colours. Plain keys hold the main colour of a condition
# (alpha 0.8); "<condition>_del" keys hold a lighter variant with alpha 0.25.
# NOTE(review): "30D" is (1.0,0.5,0.0) but "D" is (1.0,0.25,0.0), whereas the
# "30H"/"H" and "30D_del"/"D_del" pairs are identical — confirm this is intended.
conditions_colors = {
    "30H": (0.0,0.0,1.0,0.8), # blue
    "30H_del": (0.0,0.7,1.0,0.25), # turquoise
    "30D": (1.0,0.5,0.0,0.8), # orangered
    "30D_del": (1.0,0.75,0.0,0.25), # orange
    "0Gy": (0.5,0.5,0.5,0.8), # dark grey
    "0Gy_del": (0.7,0.7,0.7,0.25), # lighter grey
    "100Gy": (0.7,0.0,0.6,0.8), # purple
    "100Gy_del": (1.0,0.0,0.8,0.25), # pinkish
    "250Gy": (0.0,0.75,0.0,0.8), # darkgreen
    "250Gy_del": (0.0,1.0,0.0,0.25), # limegreen
    "500Gy": (1.0,0.0,0.0,0.8), # red
    "500Gy_del": (1.0,0.5,0.5,0.25), # pink
    "H": (0.0,0.0,1.0,0.8),
    "H_del": (0.0,0.7,1.0,0.25),
    "D": (1.0,0.25,0.0,0.8),
    "D_del": (1.0,0.75,0.0,0.25),
}

# Sequential matplotlib colormap associated with each condition
conditions_cmap = {
    "30H":plt.cm.Blues,
    "30D":plt.cm.Oranges,
    "0Gy":plt.cm.Greys,
    "100Gy":plt.cm.Purples,
    "250Gy":plt.cm.Greens,
    "500Gy":plt.cm.Reds,
    "H":plt.cm.Blues,
    "D":plt.cm.Oranges,
}

# Human-readable label of each condition (for legends / axis labels)
conditions_names = {
    "30H": "GR Hydrated",
    "30D": "GR Desiccated",
    "0Gy": "GR 0Gy IR",
    "100Gy": "GR 100Gy IR",
    "250Gy": "GR 250Gy IR",
    "500Gy": "GR 500Gy IR",
    "H": "MA Hydrated",
    "D": "MA Desiccated"
}

# Matplotlib marker of each condition: "o" for the GR conditions, "d" for "H"/"D"
conditions_shape = {
    "30H": "o",
    "30D": "o",
    "0Gy": "o",
    "100Gy": "o",
    "250Gy": "o",
    "500Gy": "o",
    "H": "d",
    "D": "d"
}

# COVERAGE
# NOTE: `median_coverage` is assigned again, with a complete per-sample table,
# in cell In [5]; that later assignment replaces this short dict.
median_coverage = {
    "ancestor": 341,
    "H5A2": 54,
    "H5A2_50G": 63,
    "H5C2": 148,
    "P500_C18_E3":291,
}

# Fixed colours as 0-1 RGB tuples (col_TE additionally carries an alpha of 0.65)
col_MA = (55/255, 126/255, 184/255)
col_ARC = (255/255, 127/255, 0/255)
col_TE = (152/255, 78/255, 163/255, 0.65)

# LENGTHS OF CHROMOSOMES
# Read from the FASTA index: first whitespace-separated field = sequence name,
# second field = sequence length.
reference = "/media/urbe/MyADrive1/Antoine/19-11-21_VariantCalling_ARC/input/reference.fa"
fai = "/media/urbe/MyADrive1/Antoine/19-11-21_VariantCalling_ARC/input/reference.fa.fai"
lengths = {}
with open(fai, 'r') as fai_handle : # context manager so the handle is always closed
    for line in fai_handle :
        s = line.strip().split()
        if not s : # tolerate blank lines instead of failing with IndexError
            continue
        lengths[s[0]] = int(s[1])

# TELOMERES
# For each chromosome: ([start, end] of the first region, [start, end] of the second region)
telomeres = {
    "Chrom_1":([0,1500000], [11000000, 18146847]),
    "Chrom_2":([0,2000000], [15000000, 16274841]),
    "Chrom_3":([0,4000000], [13500000, 20354777]),
    "Chrom_4":([0,1000000], [14000000, 15224634]),
    "Chrom_5":([0,5000000], [15500000, 16930519]),
    "Chrom_6":([0,2000000], [12000000, 13893210]),
}

# POSITIONS OF GENES, CDS AND REPEATS
genes = "/media/urbe/MyADrive1/Antoine/19-11-21_VariantCalling_ARC/input/Avaga.genes.bed"
cds = "/media/urbe/MyADrive1/Antoine/19-11-21_VariantCalling_ARC/input/Avaga.CDS.bed"
repeats = "/media/urbe/MyADrive1/Antoine/19-11-21_VariantCalling_ARC/input/avaga.TEannotation.26-02-20.defragmentedDraftAnnot.bed"

# REPEATS DATAFRAME
# Only the first three tab-separated columns of the BED file are kept.
reps = {"CHR":[], "START":[], "END":[]}
with open(repeats, "r") as bed_handle :
    for line in bed_handle :
        if not line.strip() : # skip blank lines
            continue
        s = line.strip().split("\t")
        reps["CHR"].append(s[0])
        reps["START"].append(int(s[1]))
        reps["END"].append(int(s[2]))

repdf = pd.DataFrame(reps)

In [3]:
# Colours of the active matplotlib property cycle. Indices up to 9 are used
# below, so the cycle must provide at least 10 colours. plot_cluster appends a
# two-character suffix (e.g. "BF", "80") to these values, so they are expected
# to be "#RRGGBB" strings.
ccycle = list(plt.rcParams['axes.prop_cycle'].by_key()['color'])

# Colour per event type / sample role.
# Note that "DUP" and "midpoint" share ccycle[2], and "DEL" and "nanopore" share ccycle[3].
colors = {
    "DEL":ccycle[3], # Heterozygous deletion
    "DUP":ccycle[2], # Duplication
    "LOH":ccycle[9], # LOH
    "RES":ccycle[8], # Restoration (cov = 1 previously 0.5)
    "HDE":ccycle[5], # Homozygous deletion
    "ancestor":ccycle[0],
    "midpoint":ccycle[2],
    "endpoint":ccycle[1], # and "offspring" for ARC
    "nanopore":ccycle[3],
}

# Get LOH clusters per sample

In [4]:
# Map each sample name (text before the first ".") to the absolute path of its
# LOH cluster pickle found in the current working directory.
samples = {
    entry.split(".")[0]: os.path.abspath(entry)
    for entry in os.listdir("./")
    if entry.endswith(".clusters.LOH.pkl.bz2")
}

for sample_name, pickle_path in samples.items() :
    print(sample_name, pickle_path)

D4A3 /media/urbe/MyADrive1/Antoine/2023-08-28_Analysis_GR_ME/D4A3.clusters.LOH.pkl.bz2
P0_C9_E4 /media/urbe/MyADrive1/Antoine/2023-08-28_Analysis_GR_ME/P0_C9_E4.clusters.LOH.pkl.bz2
30H_C36_E5 /media/urbe/MyADrive1/Antoine/2023-08-28_Analysis_GR_ME/30H_C36_E5.clusters.LOH.pkl.bz2
D5C3 /media/urbe/MyADrive1/Antoine/2023-08-28_Analysis_GR_ME/D5C3.clusters.LOH.pkl.bz2
P250_C17_E4 /media/urbe/MyADrive1/Antoine/2023-08-28_Analysis_GR_ME/P250_C17_E4.clusters.LOH.pkl.bz2
H2B4 /media/urbe/MyADrive1/Antoine/2023-08-28_Analysis_GR_ME/H2B4.clusters.LOH.pkl.bz2
P0_C9_E5 /media/urbe/MyADrive1/Antoine/2023-08-28_Analysis_GR_ME/P0_C9_E5.clusters.LOH.pkl.bz2
D2B3 /media/urbe/MyADrive1/Antoine/2023-08-28_Analysis_GR_ME/D2B3.clusters.LOH.pkl.bz2
D3A3 /media/urbe/MyADrive1/Antoine/2023-08-28_Analysis_GR_ME/D3A3.clusters.LOH.pkl.bz2
D3A1 /media/urbe/MyADrive1/Antoine/2023-08-28_Analysis_GR_ME/D3A1.clusters.LOH.pkl.bz2
P500_C30_E3 /media/urbe/MyADrive1/Antoine/2023-08-28_Analysis_GR_ME/P500_C30_E3.clusters

In [5]:
# COVERAGE
# Median coverage per sample, keyed by sample name ("ancestor" included).
# "<sample>_50G" keys are the midpoint ("50G") samples. This assignment replaces
# the shorter `median_coverage` dict defined in the configuration cell.
# The values are used to normalise depths (e.g. ODP_mean / median_coverage[sample]).
median_coverage = {
    "ancestor": 341,
    "D2A1": 47,
    "D2B3": 264, # has a 50G
    "D2B3_50G": 408,
    "D2C1": 110, # has a 50G
    "D2C1_50G": 137,
    "D2C3": 106, # has a 50G
    "D2C3_50G": 118,
    "D3A1": 49,
    "D3A3": 105,  # has a 50G
    "D3A3_50G": 177,
    "D4A3": 54,
    "D4B4": 144, # has a 50G
    "D4B4_50G":176,
    "D5B3": 73,
    "D5C1": 120, # has a 50G
    "D5C1_50G": 127,
    "D5C3": 156, # has a 50G
    "D5C3_50G": 194,
    "H2A3": 449,
    "H2B4": 122, # has a 50G
    "H2B4_50G": 27, # LOW COVERAGE SAMPLE
    "H2C3": 148,
    "H3A4": 90, # has a 50G
    "H3A4_50G": 150,
    "H3C4": 127, # has a 50G
    "H3C4_50G": 111,
    "H4A4": 409, # has a 50G
    "H4A4_50G": 404,
    "H4C2": 204,
    "H5A2": 54, # has a 50G
    "H5A2_50G": 63,
    "H5A3": 202,
    "H5A4": 126, # has a 50G
    "H5A4_50G": 131,
    "H5C2": 148, # has a 50G
    "H5C2_50G": 165,
    
    "30H_C3_E4":  199,
    "30H_C3_E5":  209,
    "30H_C36_E5": 244,
    "30H_C48_E5": 275,
    
    "30D_C13_E3": 259,
    "30D_C38_E4": 193,
    "30D_C38_E5": 236,
    "30D_C52_E5": 208,
    
    "P0_C9_E4":   186,
    "P0_C9_E5":   174,
    "P0_C27_E5":  242,
    "P0_C40_E5":  192,
    
    "P100_C8_E3": 413,
    "P100_C8_E4": 431,
    "P100_C30_E3":424,
    "P100_C30_E4":490,
    
    "P250_C8_E3": 390,
    "P250_C8_E4": 334,
    "P250_C17_E3":354,
    "P250_C17_E4":435,
    
    "P500_C16_E4":185,
    "P500_C16_E5":187,
    "P500_C18_E3":291,
    "P500_C30_E3":275,
}

### Extract detected LOH clusters

In [6]:
min_snps = 4
min_cov_perc = 0.25
max_share = 2
max_distance_between_cluster = 100000 # 100Kb for plotting

# Aggregation applied to every column when neighbouring clusters are merged
merge_aggregations = {
    "CHROM_first":"first", "POS_first":"min", "POS_last":"max",
    "ADP_min":"min", "ADP_max":"max", "ADP_mean":"mean",
    "ODP_min":"min", "ODP_max":"max", "ODP_mean":"mean",
    "GET_first":"first", "GET_size":"sum",
    "SH_min":"min", "SH_max":"max", "SH_mean":"mean",
}


def merge_close_clusters(cdf, max_distance) :
    """Merge consecutive clusters of one chromosome separated by at most `max_distance`.

    Parameters
    ----------
    cdf : pandas.DataFrame
        Clusters of a single chromosome, in the order in which they should be
        chained (the gap is computed between each row and the previous one).
    max_distance : int
        A gap (POS_first of a row minus POS_last of the previous row) strictly
        greater than this value starts a new merged group.

    Returns
    -------
    pandas.DataFrame
        One row per merged group (index "GROUP"), aggregated with
        `merge_aggregations`, plus a SIZE column (POS_last - POS_first + 1).
    """
    # Gap to the previous cluster; the first cluster of the chromosome gets 0
    gap = (cdf["POS_first"] - cdf["POS_last"].shift()).fillna(0)
    merged = (
        cdf.assign(DELTA=gap, GROUP=gap.gt(max_distance).cumsum())
        .groupby("GROUP")
        .agg(merge_aggregations)
    )
    return merged.assign(SIZE=merged["POS_last"] - merged["POS_first"] + 1)


all_clusters_merged = {}
all_dfs = {}

for sample, csr_f in samples.items() :

    # NOTE(security): read_pickle can execute arbitrary code; only load files produced by this pipeline
    df = pd.read_pickle(csr_f)
    # Flatten the two-level column index: ("POS", "first") -> "POS_first"
    df.columns = ['_'.join(col).strip() for col in df.columns.values]
    mn_cv = min_cov_perc*median_coverage[sample]
    clusters = df.query("SH_mean < @max_share & ODP_mean >= @mn_cv & GET_first == 'L'")

    all_dfs[sample] = clusters

    modified = [
        merge_close_clusters(cdf, max_distance_between_cluster)
        for _, cdf in clusters.groupby(by="CHROM_first")
    ]

    if modified :
        gdf = pd.concat(modified)
    else :
        # No cluster passed the filters: keep an empty table with the expected
        # columns so that downstream cells can still query it.
        gdf = pd.DataFrame(columns=list(merge_aggregations) + ["SIZE"], dtype=float)

    # Depth as a percentage of the median coverage of the sample (OPER) / ancestor (APER)
    gdf = gdf.assign(
        OPER=(gdf["ODP_mean"]/median_coverage[sample])*100,
        APER=(gdf["ADP_mean"]/median_coverage["ancestor"])*100,
    )
    gdf = gdf.assign(DIFF=gdf["OPER"]-gdf["APER"])
    # NOTE(review): strict ">" keeps clusters with at least min_snps + 1 SNPs — confirm intent
    gdf = gdf.query("GET_size > @min_snps")
    gdf = gdf.assign(RATIO=gdf["GET_size"]/gdf["SIZE"])
    all_clusters_merged[sample] = gdf

In [43]:
# Spot check: merged clusters of sample H5A4 on Chrom_6 with RATIO (GET_size / SIZE) above 0.001
all_clusters_merged["H5A4"].query("CHROM_first == 'Chrom_6' & RATIO > 0.001")[["POS_first", "POS_last", "GET_size", "RATIO"]]

Unnamed: 0_level_0,POS_first,POS_last,GET_size,RATIO
GROUP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
37,12903403,12903462,7,0.116667
39,13225074,13582122,490,0.001372


### Find coverage files per sample

In [46]:
# Map every sample (and its "_50G" midpoint, when a file exists) to its coverage file.
cov_data = {}

for sample in samples :

    # Samples whose name starts with "H" or "D" live in the MA directory, all others in the ARC one
    from_MA = sample.startswith(("H", "D"))
    if from_MA :
        covdir = "/media/urbe/MyBDrive1/Antoine/27-10-21_VariantCalling_MA/coverage/"
    else :
        covdir = "/media/urbe/MyADrive1/Antoine/19-11-21_VariantCalling_ARC/coverage/"

    covfile = os.path.join(covdir, sample + ".sorted.CALL.bam.cov.gz")
    if not os.path.isfile(covfile) :
        # Reported only: the path is still registered below
        print("ERROR: could not find file: {}".format(covfile))

    if from_MA :
        # The midpoint is optional and registered only when its file is present
        mid_covfile = os.path.join(covdir, sample + "_50G" + ".sorted.CALL.bam.cov.gz")
        if os.path.isfile(mid_covfile) :
            cov_data[sample+"_50G"] = mid_covfile

    cov_data[sample] = covfile

for name, path in cov_data.items() :
    print(name, path)

D4A3 /media/urbe/MyBDrive1/Antoine/27-10-21_VariantCalling_MA/coverage/D4A3.sorted.CALL.bam.cov.gz
P0_C9_E4 /media/urbe/MyADrive1/Antoine/19-11-21_VariantCalling_ARC/coverage/P0_C9_E4.sorted.CALL.bam.cov.gz
30H_C36_E5 /media/urbe/MyADrive1/Antoine/19-11-21_VariantCalling_ARC/coverage/30H_C36_E5.sorted.CALL.bam.cov.gz
D5C3_50G /media/urbe/MyBDrive1/Antoine/27-10-21_VariantCalling_MA/coverage/D5C3_50G.sorted.CALL.bam.cov.gz
D5C3 /media/urbe/MyBDrive1/Antoine/27-10-21_VariantCalling_MA/coverage/D5C3.sorted.CALL.bam.cov.gz
P250_C17_E4 /media/urbe/MyADrive1/Antoine/19-11-21_VariantCalling_ARC/coverage/P250_C17_E4.sorted.CALL.bam.cov.gz
H2B4_50G /media/urbe/MyBDrive1/Antoine/27-10-21_VariantCalling_MA/coverage/H2B4_50G.sorted.CALL.bam.cov.gz
H2B4 /media/urbe/MyBDrive1/Antoine/27-10-21_VariantCalling_MA/coverage/H2B4.sorted.CALL.bam.cov.gz
P0_C9_E5 /media/urbe/MyADrive1/Antoine/19-11-21_VariantCalling_ARC/coverage/P0_C9_E5.sorted.CALL.bam.cov.gz
D2B3_50G /media/urbe/MyBDrive1/Antoine/27-10-21

### Function to parse VCF

In [47]:
def _read_call(rec, name) :
    """Return (DP, AD, major allele fraction) of sample `name` in record `rec`.

    Any field that cannot be read or computed is returned as None. The allele
    fraction is max(AD) / DP and is None when AD is unavailable, DP is missing
    or DP is zero.
    """
    try :
        depth = rec.samples[name]["DP"]
    except Exception :
        depth = None

    try :
        allele_depths = rec.samples[name]["AD"]
    except Exception :
        return depth, None, None

    try :
        allele_fraction = max(allele_depths)/depth
    except Exception : # depth is None or 0, or AD holds missing values
        allele_fraction = None

    return depth, allele_depths, allele_fraction


def parse_vcf(vcf, ancestor, sample) :
    """Parse a VCF/BCF file into a per-variant table for an ancestor/offspring pair.

    Parameters
    ----------
    vcf : str
        Path to the variant file (format auto-detected by pysam).
    ancestor : str
        Name of the ancestor sample in the file.
    sample : str
        Name of the offspring sample in the file.

    Returns
    -------
    pandas.DataFrame
        One row per record with columns CHROM, POS, QUAL, then ADP / AAD / AAF
        (depth, allele depths, max(AD)/DP of the ancestor) and IDP / IAD / IAF
        (same for the offspring). Unavailable values are None.

    Notes
    -----
    Only `Exception` subclasses are treated as "value unavailable", so
    KeyboardInterrupt / SystemExit are no longer swallowed, and the file is
    closed even when iteration fails.
    """
    dc = {
        "CHROM":[], "POS":[], "QUAL":[], # general
        "ADP":[], "AAD":[], "AAF":[], # ancestor
        "IDP":[], "IAD":[], "IAF":[], # illumina offspring
    }

    # Parse variant file (auto-detect input format); closed on exit of the block
    with VariantFile(vcf) as vcf_in :
        vcf_in.subset_samples([ancestor, sample])

        for rec in vcf_in :
            dc["CHROM"].append(rec.chrom)
            dc["POS"].append(rec.pos)
            dc["QUAL"].append(rec.qual)

            anc_dp, anc_ad, anc_af = _read_call(rec, ancestor)
            dc["ADP"].append(anc_dp)
            dc["AAD"].append(anc_ad)
            dc["AAF"].append(anc_af)

            off_dp, off_ad, off_af = _read_call(rec, sample)
            dc["IDP"].append(off_dp)
            dc["IAD"].append(off_ad)
            dc["IAF"].append(off_af)

    df_illu = pd.DataFrame.from_dict(dc)

    return df_illu

In [48]:
# Merged BCF files passed to parse_vcf: vcf_MA for samples whose name starts
# with "H" or "D", vcf_GR for all other samples.
vcf_MA = "/media/urbe/MyBDrive1/Antoine/27-10-21_VariantCalling_MA/genotype_allsamples/merged.only_het.gets.bcf"
vcf_GR = "/media/urbe/MyADrive1/Antoine/19-11-21_VariantCalling_ARC/jointgenotyping/merged.only_het.gets.bcf"

### Get coverage of ancestor and controls (same for most samples)

Ancestor coverage

In [49]:
# Ancestor coverage table: first three columns of the gzip TSV (first row
# skipped), named CHROM / POS / COV_ANC directly at read time.
cov_anc = pd.read_csv(
    "/media/urbe/MyADrive1/Antoine/19-11-21_VariantCalling_ARC/coverage/ancestor.sorted.CALL.bam.cov.gz",
    sep="\t",
    compression="gzip",
    header=None,
    skiprows=1,
    usecols=range(3),
    names=["CHROM", "POS", "COV_ANC"],
)

ME control coverage (H5C2) (alternative = H3C4)

In [50]:
# Default control for the "H"/"D" samples and its coverage table
# (columns CHROM / POS / COV_ILU, first row of the file skipped).
ME_control_sample = "H5C2"
cov_control_ME = pd.read_csv(
    cov_data[ME_control_sample],
    sep="\t",
    compression="gzip",
    header=None,
    skiprows=1,
    usecols=range(3),
    names=["CHROM", "POS", "COV_ILU"],
)

ME control variants (H5C2) (alternative = H3C4)

In [51]:
# Variant table of the default ME control (ME_control_sample) against the ancestor, parsed once and reused
vcf_control_ME = parse_vcf(vcf_MA, "ancestor", ME_control_sample)

GR control coverage (desiccation control = 30H_C48_E5, irradiation control = P0_C27_E5)

In [52]:
# Coverage tables of the two GR controls (columns CHROM / POS / COV_ILU,
# first row of each file skipped).
GR_control_sample_irradiation = "P0_C27_E5" # Alternative control = P0_C40_E5
cov_GR_ctl_IR = pd.read_csv(
    cov_data[GR_control_sample_irradiation],
    sep="\t",
    compression="gzip",
    header=None,
    skiprows=1,
    usecols=range(3),
    names=["CHROM", "POS", "COV_ILU"],
)

GR_control_sample_desiccation = "30H_C48_E5" # Alternative control = 30H_C36_E5
cov_GR_ctl_DESIC = pd.read_csv(
    cov_data[GR_control_sample_desiccation],
    sep="\t",
    compression="gzip",
    header=None,
    skiprows=1,
    usecols=range(3),
    names=["CHROM", "POS", "COV_ILU"],
)

GR controls variants (desiccation control = 30H_C48_E5, irradiation control = P0_C27_E5)

In [53]:
# Variant table of the irradiation control (GR_control_sample_irradiation) against the ancestor
vcf_control_GR_IR = parse_vcf(vcf_GR, "ancestor", GR_control_sample_irradiation)

In [54]:
# Variant table of the desiccation control (GR_control_sample_desiccation) against the ancestor
vcf_control_GR_DESIC = parse_vcf(vcf_GR, "ancestor", GR_control_sample_desiccation)

### Functions to produce plots

In [69]:
def get_LOH_score(anc_AF, off_AF) :
    """Return twice the absolute difference between ancestor and offspring allele fractions.

    With allele fractions between 0.5 and 1.0 the score ranges from
    0.0 (no change) to 1.0 (LOH).
    """
    shift = abs(anc_AF - off_AF)
    return 2*shift

def get_COV_score(anc_COV, off_COV) :
    """Return the offspring coverage minus the ancestor coverage."""
    change = off_COV - anc_COV
    return change

def _window_variants(frame, chrom, start, end, min_qual) :
    """Rows of `frame` on `chrom` with start < POS < end and QUAL >= min_qual, without any missing value."""
    return frame.query("CHROM == @chrom & POS > @start & POS < @end & QUAL >= @min_qual").dropna()


def _loh_tracks(flt, win_snps) :
    """Return (LOH score of the rolling-mean allele fractions, per-SNP LOH score)."""
    smoothed = get_LOH_score(
        flt["AAF"].rolling(window=win_snps).mean(),
        flt["IAF"].rolling(window=win_snps).mean(),
    )
    per_snp = get_LOH_score(flt["AAF"], flt["IAF"])
    return smoothed, per_snp


def _coverage_delta(flt, coverage_key, win_cov) :
    """Rolling mean of the normalised sample coverage minus that of the ancestor."""
    anc = (flt["COV_ANC"]/median_coverage["ancestor"]).rolling(window=win_cov).mean()
    off = (flt["COV_ILU"]/median_coverage[coverage_key]).rolling(window=win_cov).mean()
    return get_COV_score(anc, off)


def plot_cluster(sample, control, chrom, start, end, df, df_ctl, df_mid=None, win_snps=5, win_cov=5, min_qual=1000, outdir="./all_events_check") :
    """Plot LOH score (top) and coverage change (bottom) around one cluster and save it as PNG.

    Parameters
    ----------
    sample, control : str
        Names of the plotted sample and of its control; both must be keys of
        `median_coverage` (as must sample + "_50G" when `df_mid` is given).
    chrom : str
        Chromosome of the window.
    start, end : int
        Window bounds; only variants with start < POS < end are plotted.
    df, df_ctl, df_mid : pandas.DataFrame
        Variant tables (columns CHROM, POS, QUAL, AAF, IAF, COV_ANC, COV_ILU)
        of the sample, the control and, optionally, the midpoint sample.
    win_snps, win_cov : int
        Rolling-window sizes (in variants) for the LOH score and the coverage.
    min_qual : float
        Minimum QUAL of plotted variants (default 1000).
    outdir : str
        Directory receiving the figure; created if missing.

    The file is named "<sample>_<chrom>_<first>_<last>.png", where first/last
    are the first and last plotted positions of the sample (or the requested
    start/end when the sample has no variant in the window).
    """
    has_mid = df_mid is not None

    # (legend label, variants, key in median_coverage, key in colors, zorder top, zorder bottom)
    tracks = [
        (sample, _window_variants(df, chrom, start, end, min_qual), sample, "endpoint", 5, 5),
        ("{} (control)".format(control), _window_variants(df_ctl, chrom, start, end, min_qual), control, "ancestor", 3, 2),
    ]
    if has_mid :
        tracks.append(
            ("{} (midpoint)".format(sample), _window_variants(df_mid, chrom, start, end, min_qual), sample+"_50G", "midpoint", 4, 3)
        )
    flt_end = tracks[0][1]

    plt.ioff()
    fig = None
    try :
        fig, axs = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(4,3))

        ### LOH SCORE ###
        ax = axs[0]
        for label, flt, _, color_key, z_top, _ in tracks :
            smoothed, per_snp = _loh_tracks(flt, win_snps)
            ax.plot(flt["POS"], smoothed, color=colors[color_key]+"BF", lw=1.5, zorder=z_top, label=label)
            ax.scatter(flt["POS"], per_snp, fc=colors[color_key]+"80", marker=".", s=6, lw=0.0, zorder=1)

        # Format plot (ticks spanning the requested window also fix the shared x range)
        xticks = np.arange(start, end+1, 1e4)
        ax.set_xticks(xticks)
        ax.set_xticklabels(["" for i in xticks])

        ax.set_ylim(-0.1, 1.1)
        ax.set_ylabel("LOH Score", fontsize=8)
        ax.set_yticks([0.0, 0.25, 0.5, 0.75, 1.0])

        ax.grid(axis='y', zorder=1, lw=0.5, color=[0.5,0.5,0.5])
        ax.legend(loc="upper right", ncol=1, fontsize=6, frameon=True,)

        ### Coverage ###
        ax = axs[1]
        # Same colour per track as in the top panel, so that its legend applies to both panels
        for label, flt, coverage_key, color_key, _, z_bottom in tracks :
            ax.plot(flt["POS"], _coverage_delta(flt, coverage_key, win_cov), color=colors[color_key]+"BF", lw=1.5, zorder=z_bottom, label=label)

        ax.yaxis.grid(True, zorder=1)
        ax.set_ylim(-0.6, 0.6)
        ax.set_yticks([-0.5, -0.25, 0.0, 0.25, 0.5])
        ax.tick_params(labelsize=8)
        ax.set_ylabel(r"$\Delta$ Coverage", fontsize=8)

        # Label the axis with the extent of the plotted sample data; keep the
        # requested window when no variant of the sample falls inside it
        if len(flt_end) > 0 :
            start = min(flt_end["POS"])
            end = max(flt_end["POS"])
        ticks = np.linspace(start, end, 8)
        ticklabels = ["{:.3f}M".format(round(t/1000000, 3)) for t in ticks]
        ax.set_xticks(ticks)
        ax.set_xticklabels(ticklabels, fontsize=8, rotation=45)
        ax.set_xlabel("{}: {:.2f}M - {:.2f}M ({:.2f}K)".format(chrom, start/1e6, end/1e6, (end-start)/1e3), fontsize=8)

        for ax in axs :
            ax.spines['top'].set_visible(False)
            ax.spines['right'].set_visible(False)
            ax.grid(axis='y', zorder=1, lw=0.5, color=[0.5,0.5,0.5])
            ax.tick_params(axis="y", length=0.5)
            ax.tick_params(labelsize=6)

        fig.subplots_adjust(hspace=0.03)
        fig.tight_layout()

        os.makedirs(outdir, exist_ok=True)
        fig.savefig(os.path.join(outdir, "{}_{}_{}_{}.png".format(sample, chrom, start, end)), dpi=300, format="png")
    finally :
        # Always release the figure and restore interactive mode, even on error
        if fig is not None :
            plt.close(fig)
        plt.ion()

# Run sample per sample, cluster per cluster

In [70]:
def _read_sample_coverage(path) :
    """Read a gzip TSV coverage file (first row skipped) as columns CHROM / POS / COV_ILU."""
    return pd.read_csv(
        path, sep="\t", compression="gzip",
        usecols=range(3), names=["CHROM", "POS", "COV_ILU"],
        header=None, skiprows=1,
    )


def _merge_with_coverage(variants, sample_cov) :
    """Attach ancestor and sample coverage to a variant table (inner joins on CHROM / POS)."""
    merged = pd.merge(variants, cov_anc, on=["CHROM", "POS"], how="inner")
    return pd.merge(merged, sample_cov, on=["CHROM", "POS"], how="inner")


# Sample family -> (default control, its parsed variants, its coverage, control used for the default control itself)
control_setup = {
    "MA": (ME_control_sample, vcf_control_ME, cov_control_ME, "H3C4"),
    "30": (GR_control_sample_desiccation, vcf_control_GR_DESIC, cov_GR_ctl_DESIC, "30H_C36_E5"),
    "P": (GR_control_sample_irradiation, vcf_control_GR_IR, cov_GR_ctl_IR, "P0_C40_E5"),
}
control_frames = {} # merged control tables, built once per control sample

# A cluster smaller than a threshold is padded by half that threshold on each side
padding_thresholds = [1000, 10000, 100000, 1000000]

for sample in samples.keys() :

    print("Sample: {}".format(sample))

    is_MA = sample.startswith(("H", "D"))
    vcf = vcf_MA if is_MA else vcf_GR

    if is_MA :
        family = "MA"
    elif sample.startswith("30") :
        family = "30"
    elif sample.startswith("P") :
        family = "P"
    else :
        # Previously such a sample silently reused the control of the preceding sample
        raise ValueError("No control defined for sample {}".format(sample))

    clusters = all_clusters_merged[sample].query("RATIO > 0.001")
    cov_end = _read_sample_coverage(cov_data[sample])

    # A midpoint exists only when its coverage file was registered in cov_data
    sample_mid = sample+"_50G"
    has_mid = sample_mid in cov_data
    if has_mid :
        cov_mid = _read_sample_coverage(cov_data[sample_mid])

    df = _merge_with_coverage(parse_vcf(vcf, "ancestor", sample), cov_end)

    if has_mid :
        print("Sample has a midpoint")
        df_50g = _merge_with_coverage(parse_vcf(vcf, "ancestor", sample_mid), cov_mid)

    # Select the control; the default control itself is compared to the alternative one
    default_control, ctl_vcf, ctl_cov, alternative_control = control_setup[family]
    current_control = alternative_control if sample == default_control else default_control

    if current_control not in control_frames :
        if current_control == alternative_control :
            ctl_cov = _read_sample_coverage(cov_data[alternative_control])
            ctl_vcf = parse_vcf(vcf, "ancestor", alternative_control)
        control_frames[current_control] = _merge_with_coverage(ctl_vcf, ctl_cov)
    df_ctl = control_frames[current_control]
    print("Selected control sample: {}".format(current_control))

    # Only MA samples are plotted with a midpoint
    df_mid = df_50g if (is_MA and has_mid) else None

    for i, cluster in clusters.iterrows() :

        chrom = cluster["CHROM_first"]
        start = cluster["POS_first"]
        end = cluster["POS_last"]
        size = end-start

        if size < 1000 :
            continue

        print(chrom, start, end, size)

        for threshold in padding_thresholds :
            if size < threshold :
                offset = int(threshold/2)
                break
        else :
            # Clusters of 1 Mb or more: use the largest padding instead of
            # an undefined / stale value from a previous cluster
            offset = int(padding_thresholds[-1]/2)

        start -= offset
        end += offset

        plot_cluster(sample, current_control, chrom, start, end, df, df_ctl, df_mid=df_mid)

Sample: D4A3
Selected control sample: H5C2
Chrom_1 12402885 12414163 11278
Chrom_1 15647234 15659203 11969
Chrom_2 14761866 14764935 3069
Chrom_3 4578037 4581104 3067
Chrom_5 7491315 7506284 14969
Chrom_5 11236272 11246751 10479
Chrom_6 984495 989647 5152
Chrom_6 13832344 13841164 8820
Sample: P0_C9_E4
Selected control sample: P0_C27_E5
Chrom_1 7434724 7438812 4088
Chrom_1 16355621 16359823 4202
Chrom_6 3937762 3941860 4098
Chrom_6 5684002 5687465 3463
Sample: 30H_C36_E5
Selected control sample: 30H_C48_E5
Chrom_1 9736375 9740606 4231
Chrom_1 13945929 13954181 8252
Sample: D5C3
Sample has a midpoint
Selected control sample: H5C2
Chrom_1 67473 117985 50512
Chrom_1 10321732 15446271 5124539
Chrom_1 15612507 16420902 808395
Chrom_1 16628527 16746183 117656
Chrom_2 3761565 3763040 1475
Chrom_3 17576357 17680853 104496
Chrom_3 17843688 17870564 26876
Chrom_4 262613 365562 102949
Chrom_4 2189336 2229239 39903
Chrom_4 12544960 12587573 42613
Chrom_5 30 67903 67873
Chrom_5 1310811 1330994 2018

Chrom_1 11126704 11163940 37236
Chrom_1 16022761 16024708 1947
Chrom_2 1012685 1042409 29724
Chrom_3 14134561 14147333 12772
Chrom_6 3940732 3942697 1965
Sample: H2C3
Selected control sample: H5C2
Chrom_1 12491776 13284254 792478
Chrom_1 17112776 17142866 30090
Chrom_3 10167869 10169489 1620
Chrom_3 13625475 13632254 6779
Chrom_3 20281041 20284215 3174
Chrom_4 9689134 9760554 71420
Chrom_4 13246883 13249046 2163
Chrom_5 2059490 2061892 2402
Chrom_5 2394192 2399176 4984
Chrom_5 2567555 2571170 3615
Chrom_6 2897866 4197158 1299292
Chrom_6 13581408 13588883 7475
Sample: 30D_C52_E5
Selected control sample: 30H_C48_E5
Chrom_2 13361917 13364189 2272
Chrom_3 714461 718577 4116
Sample: H5A2
Sample has a midpoint
Selected control sample: H5C2
Chrom_1 1661127 1672032 10905
Chrom_1 2928339 2985691 57352
Chrom_1 15647057 15659450 12393
Chrom_1 17179947 17203746 23799
Chrom_2 5665797 5670063 4266
Chrom_2 13647676 13650747 3071
Chrom_4 15010387 15215653 205266
Chrom_5 30 273596 273566
Chrom_5 157254