In [2]:
from tqdm import tqdm
import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from matplotlib.backends.backend_pdf import PdfPages
import os
from natsort import natsorted

In [4]:
def plot_snp_coverage(plot_df: pd.DataFrame,
                      sample_name: str,
                      chrom_sizes_dict: dict,
                      centromeres_dict: dict,
                      output_dir: str,) -> None:
    """
    Plot per-chromosome SNP coverage for one sample into a multi-page PDF.

    The PDF is written to ``<output_dir>/<sample_name>_coverage.pdf``. Its
    first page is a title page; every following page shows one chromosome
    (in natural sort order) with the windowed coverage as a line, the mean
    and median coverage as dashed horizontal lines, the end of the
    chromosome as a vertical line and, when an entry exists in
    ``centromeres_dict``, the centromere as a shaded band.

    Parameters
    ----------
    plot_df : pd.DataFrame
        DataFrame containing coverage data. The columns ``chrom``,
        ``window_start`` and ``coverage`` are read.
    sample_name : str
        Name of the sample; used in the page titles and the PDF file name.
    chrom_sizes_dict : dict
        Dictionary containing chromosome sizes, keyed by ``str(chrom)``.
    centromeres_dict : dict
        Dictionary containing centromere locations as ``(start, end)``,
        keyed by ``str(chrom)``. Chromosomes without an entry are plotted
        without a centromere band.
    output_dir : str
        Directory to save the plots. Created if it does not exist.

    Returns
    -------
    None
        The function saves the plots to a PDF file.

    Raises
    ------
    KeyError
        If a chromosome present in ``plot_df`` is missing from
        ``chrom_sizes_dict``.

    """

    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Open the PDF file to save the plots to
    pdf_path = os.path.join(output_dir,
                            f'{sample_name}_coverage.pdf')

    with PdfPages(pdf_path) as pdf:

        # Title page
        title_fig = plt.figure(figsize=(8.5, 11))
        plt.axis('off')  # Hide axes
        plt.text(0.5, 0.9, f"SNP Coverage Analysis for {sample_name}",
                 ha='center', fontsize=20, weight='bold')
        pdf.savefig(title_fig)
        plt.close(title_fig)

        # Loop through each chromosome and plot the coverage
        chroms_list = natsorted(list(plot_df['chrom'].unique()))
        for chrom in tqdm(chroms_list):

            # The dictionaries are keyed by the string form of the chromosome
            # name; use the same key for the membership test and the lookups.
            chrom_key = str(chrom)
            chrom_size = chrom_sizes_dict[chrom_key]

            # Select this chromosome's rows once. Comparing against the value
            # taken from the column itself (rather than its string form) also
            # matches when the column is not string-typed.
            chrom_df = plot_df[plot_df['chrom'] == chrom]
            chrom_coverage = chrom_df['coverage']
            max_coverage = chrom_coverage.max()

            chrom_fig = plt.figure(figsize=(10, 5), dpi=300)
            sns.lineplot(data=chrom_df,
                         x='window_start',
                         y='coverage')

            # Add another lineplot for the mean quality. It should be opaque,
            # on top of the coverage lineplot, and with a different color.
            # sns.lineplot(data=chrom_df,
            #                 x='window_start',
            #                 y='quality',
            #                 color='orange',
            #                 alpha=0.5)

            plt.title(f'{sample_name} coverage - chromosome {chrom}')
            plt.xlabel('Base position (based on window start position)')
            plt.ylabel('Mean sliding window coverage')

            plt.vlines(x=chrom_size,
                       ymin=0,
                       ymax=max_coverage,
                       color='black',
                       linestyle='-',
                       label='End of chromosome')

            plt.hlines(y=chrom_coverage.mean(),
                       xmin=0,
                       xmax=chrom_size,
                       color='red',
                       linestyle='--',
                       label='Mean coverage')

            plt.hlines(y=chrom_coverage.median(),
                       xmin=0,
                       xmax=chrom_size,
                       color='orange',
                       linestyle='--',
                       label='Median coverage')

            if chrom_key in centromeres_dict:
                # Add a semi-transparent rectangle to highlight the centromere region
                centromere_start, centromere_end = centromeres_dict[chrom_key][:2]
                plt.fill_betweenx(y=[0, max_coverage],
                                  x1=centromere_start,
                                  x2=centromere_end,
                                  color='grey',
                                  alpha=0.5,
                                  label='Centromere region')

            # Move the legend outside the plot
            plt.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize=8)
            plt.tight_layout()
            pdf.savefig(chrom_fig)
            plt.close(chrom_fig)

def simple_plot_snp_coverage(plot_df: pd.DataFrame,
                      sample_name: str,
                      chrom_sizes_dict: dict,
                      output_dir: str,) -> None:
    """
    Function to plot SNP coverage for a given sample. Same as plot_snp_coverage
    but without centromeres.

    Parameters
    ----------
    plot_df : pd.DataFrame
        DataFrame containing coverage data.
    sample_name : str
        Name of the sample.
    chrom_sizes_dict : dict
        Dictionary containing chromosome sizes.
    output_dir : str
        Directory to save the plots.

    Returns
    -------
    None
        The function saves the plots to a PDF file
        (``<output_dir>/<sample_name>_coverage.pdf``).

    """

    # Delegate to plot_snp_coverage instead of duplicating its body. With an
    # empty centromere dictionary no chromosome passes the centromere lookup,
    # so the pages are drawn without a centromere band.
    plot_snp_coverage(plot_df=plot_df,
                      sample_name=sample_name,
                      chrom_sizes_dict=chrom_sizes_dict,
                      centromeres_dict={},
                      output_dir=output_dir)

To generate the input data for this notebook, first create a sliding-window BED file of the genome or chromosome over which you want to assess coverage.

```bash
bedtools makewindows -g <( cut -f 1,2 ~/Projects/REFERENCES/UOA_WAGYU/UOA_Wagyu_1.withY.fa.fai) \
    -w 100000 \
    -s 50000 \
    -i srcwinnum > UOA_Wagyu_1.withY.100Kb.50Kbstep.bed
```

Then intersect the VCF file of interest with this BED file. The `-wb` option appends the columns of the overlapping window to each VCF record.

```bash
bedtools intersect -a sample01.vcf.gz \
    -b UOA_Wagyu_1.withY.100Kb.50Kbstep.bed \
    -wb > sample01.SNP.bins.tsv
```

In [5]:
# Set the paths to the chrom sizes and centromere files.

CHROM_SIZES = '/Users/callummacphillamy/Projects/REFERENCES/ARS_UCD2.0/ARS_UCD_v2.0.chrom.sizes'
# CENTROMERES = '~/Projects/REFERENCES/UOA_WAGYU/wagyu_centromere_location.txt'

# Load the chrom sizes and make a dictionary mapping chromosome name -> size.
# The chromosome names are forced to strings: the plotting functions look the
# sizes up with str(chrom), and without an explicit dtype a file containing
# only numeric chromosome names would be parsed as integers.
chrom_sizes = pd.read_csv(CHROM_SIZES, sep='\t', header=None,
                          names=['chrom', 'start', 'size'],
                          dtype={'chrom': str})
chrom_sizes_dict = dict(zip(chrom_sizes['chrom'], chrom_sizes['size']))


# Load the centromeres and make a dictionary.
# centromeres = pd.read_csv(CENTROMERES, sep='\t', header=None,
#                           names=['chromosome','start','end'])
# centromeres_dict = {k:(v) for k, v in zip(centromeres['chromosome'], zip(centromeres['start'], centromeres['end']))}

# Set the output directory for the plots.
os.makedirs('../SNP_analyses/coverage_plots/ARS_Ref', exist_ok=True)

In [6]:
# Display the dictionary to check the chromosome names and sizes that were loaded.
chrom_sizes_dict

{'1': 158534110,
 '2': 136231102,
 '3': 121005158,
 '4': 120000601,
 '5': 120089316,
 '6': 117806340,
 '7': 110682743,
 '8': 113319770,
 '9': 105454467,
 '10': 103308737,
 '11': 106982474,
 '12': 87216183,
 '13': 83472345,
 '14': 82403003,
 '15': 85007780,
 '16': 81013979,
 '17': 73167244,
 '18': 65820629,
 '19': 63449741,
 '20': 71974595,
 '21': 69862954,
 '22': 60773035,
 '23': 52498615,
 '24': 62317253,
 '25': 42350435,
 '26': 51992305,
 '27': 45612108,
 '28': 45940150,
 '29': 51098607,
 'X': 139009144,
 'Y': 59476289}

In [7]:
# Column names assigned to the intersected TSV when it is loaded: the VCF
# record fields first, then the fields of the overlapping window.
col_names = [
    # VCF record fields
    'chrom', 'pos', 'id', 'ref', 'alt',
    'qual', 'filter', 'info', 'format', 'sample',
    # Sliding-window fields
    'window_chrom', 'window_start', 'window_end', 'window_id',
]

# Example of loading a single file, kept for reference:
# coverage = pd.read_csv('../SNP_analyses/genotypes/clair3_rerun/sample01.clair3.sliding.coverage.tsv',
#                        sep='\t', header=0, names=col_names,
#                        dtype={0:str,
#                               10:str})

In [8]:
# sample01 ... sample20 (sample numbers zero-padded to two digits).
SAMPLE_NAMES = [f'sample{i:02d}' for i in range(1, 21)]

# Update with the path to the directory holding the TSV files you made earlier.
GENOTYPES_DIR = '/Users/callummacphillamy/Projects/tuwa_manuscript/TuWa_manuscript/SNP_analyses/genotypes/clair3_rerun'


for SAMPLE_NAME in SAMPLE_NAMES:
    print(f'Processing {SAMPLE_NAME}')

    # Load the coverage data
    print('Loading coverage data')
    # header=None: `bedtools intersect` writes no header line unless -header
    # is given, so treating the first line as a header would silently drop the
    # first variant. The chromosome columns are read as strings so numeric
    # and non-numeric chromosome names are handled uniformly.
    coverage = pd.read_csv(os.path.join(GENOTYPES_DIR, f'{SAMPLE_NAME}.SNP.bins.tsv'),
                           sep='\t', header=None, names=col_names,
                           dtype={'chrom': str,
                                  'window_chrom': str})
    # Extract the DEPTH from the coverage data
    # This takes the third ':'-separated field of the sample column and may
    # change depending on how the VCF file is formatted.
    print('Extracting depth from coverage data')
    coverage['coverage'] = coverage['sample'].str.split(':').str[2].astype(int)
    coverage = coverage.rename(columns={'qual':'quality'})


    # One row per window: mean depth and mean quality of the records that
    # fall in the window, plus the window's chromosome and start position.
    print('Generating plotting dataframe')
    plot_df = coverage.groupby(['window_id']).agg({'coverage':'mean',
                                                   'quality':'mean',
                                                   'chrom':'first',
                                                   'window_start':'first'}).reset_index()

    # Uncomment this block if you want to plot the centromeres
    # plot_snp_coverage(plot_df=plot_df,
    #                   sample_name=SAMPLE_NAME,
    #                   chrom_sizes_dict=chrom_sizes_dict,
    #                   centromeres_dict=centromeres_dict,
    #                   output_dir='../SNP_analyses/coverage_plots/Wagyu_Ref')

    # Plot the coverage without centromeres. If you want the centromeres,
    # comment out this block and uncomment the one above.
    simple_plot_snp_coverage(plot_df=plot_df,
                              sample_name=SAMPLE_NAME,
                              chrom_sizes_dict=chrom_sizes_dict,
                              output_dir='../SNP_analyses/coverage_plots/ARS_Ref')

Processing sample01
Loading coverage data
Extracting depth from coverage data
Generating plotting dataframe


100%|██████████| 31/31 [00:02<00:00, 11.06it/s]


Processing sample02
Loading coverage data
Extracting depth from coverage data
Generating plotting dataframe


100%|██████████| 31/31 [00:02<00:00, 12.44it/s]


Processing sample03
Loading coverage data
Extracting depth from coverage data
Generating plotting dataframe


100%|██████████| 31/31 [00:02<00:00, 11.52it/s]


Processing sample04
Loading coverage data
Extracting depth from coverage data
Generating plotting dataframe


100%|██████████| 31/31 [00:02<00:00, 12.32it/s]


Processing sample05
Loading coverage data
Extracting depth from coverage data
Generating plotting dataframe


100%|██████████| 31/31 [00:02<00:00, 12.00it/s]


Processing sample06
Loading coverage data
Extracting depth from coverage data
Generating plotting dataframe


100%|██████████| 31/31 [00:02<00:00, 11.89it/s]


Processing sample07
Loading coverage data
Extracting depth from coverage data
Generating plotting dataframe


100%|██████████| 31/31 [00:02<00:00, 12.45it/s]


Processing sample08
Loading coverage data
Extracting depth from coverage data
Generating plotting dataframe


100%|██████████| 31/31 [00:02<00:00, 12.55it/s]


Processing sample09
Loading coverage data
Extracting depth from coverage data
Generating plotting dataframe


100%|██████████| 31/31 [00:02<00:00, 12.45it/s]


Processing sample10
Loading coverage data
Extracting depth from coverage data
Generating plotting dataframe


100%|██████████| 31/31 [00:02<00:00, 11.85it/s]


Processing sample11
Loading coverage data
Extracting depth from coverage data
Generating plotting dataframe


100%|██████████| 31/31 [00:02<00:00, 12.82it/s]


Processing sample12
Loading coverage data
Extracting depth from coverage data
Generating plotting dataframe


100%|██████████| 31/31 [00:02<00:00, 12.76it/s]


Processing sample13
Loading coverage data
Extracting depth from coverage data
Generating plotting dataframe


100%|██████████| 31/31 [00:02<00:00, 12.46it/s]


Processing sample14
Loading coverage data
Extracting depth from coverage data
Generating plotting dataframe


100%|██████████| 31/31 [00:02<00:00, 12.72it/s]


Processing sample15
Loading coverage data
Extracting depth from coverage data
Generating plotting dataframe


100%|██████████| 31/31 [00:02<00:00, 12.54it/s]


Processing sample16
Loading coverage data
Extracting depth from coverage data
Generating plotting dataframe


100%|██████████| 31/31 [00:02<00:00, 12.28it/s]


Processing sample17
Loading coverage data
Extracting depth from coverage data
Generating plotting dataframe


100%|██████████| 31/31 [00:02<00:00, 12.42it/s]


Processing sample18
Loading coverage data
Extracting depth from coverage data
Generating plotting dataframe


100%|██████████| 31/31 [00:02<00:00, 12.51it/s]


Processing sample19
Loading coverage data
Extracting depth from coverage data
Generating plotting dataframe


100%|██████████| 31/31 [00:02<00:00, 12.52it/s]


Processing sample20
Loading coverage data
Extracting depth from coverage data
Generating plotting dataframe


100%|██████████| 31/31 [00:02<00:00, 12.62it/s]


The resulting PDF should look something like [this](../docs/sample01_coverage.pdf).