Clinical-Genomics · mathiasbio · Oct 27, 2023 · Sep 27, 2023 · Sep 27, 2023 · Oct 2, 2023
@@ -0,0 +1,395 @@
+#!/usr/bin/env python
+import click
+import io
+import logging
+from pathlib import Path
+from typing import Dict, List, Tuple, Union
+from statistics import mean
+
+from BALSAMIC.constants.analysis import SequencingType
+from BALSAMIC.constants.tools import GENS_PARAMS
+from BALSAMIC.utils.io import read_vcf_file
+
+LOG = logging.getLogger(__name__)
+LOG.setLevel(logging.INFO)
+
+
+@click.group()
+@click.option(
+    "-o",
+    "--output-file",
+    required=True,
+    type=click.Path(exists=False),
+    help="Name of output-file.",
+)
+@click.option(
+    "-s",
+    "--sequencing-type",
+    required=True,
+    type=click.Choice([SequencingType.WGS]),
+    help="Sequencing type used.",
+)
+@click.pass_context
+def cli(ctx: click.Context, output_file: str, sequencing_type: SequencingType):
+    """GENS pre-processing tool."""
+    ctx.ensure_object(dict)
+    ctx.obj["output_file"] = output_file
+    ctx.obj["sequencing_type"] = sequencing_type
+
+
+@cli.command()
+@click.pass_context
+@click.option(
+    "-v",
+    "--vcf-file-path",
+    required=True,
+    type=click.Path(exists=True),
+    help="Input VCF from germline-caller with SNVs & InDels from DNAscope, called with --given gnomad_af_0.05.vcf ",
+)
+def calculate_bafs(ctx: click.Context, vcf_file_path: str):
+    """
+    Processes vcf-file from DNAscope into a bed-file format for GENS, with different number of variants for each zoom-level.
+
+    Args:
+        vcf_file: From DNAscope created with --given argument using gnomad_af_min_0.05.vcf.
+
+    Outputs bed-file in file-name specified in output-file.
+    """
+
+    LOG.info("Calculating BAFs from VCF.")
+    LOG.info("Reading VCF file...")
+    vcf_lines: List = read_vcf_file(vcf_file_path)
+    LOG.info("Extracting and processing variant info...")
+    variants: Dict = get_valid_variants(vcf_lines)
+    LOG.info("Writing variant b-allele-frequencies to output...")
+    output_file: Path = Path(ctx.obj["output_file"])
+    sequencing_type: SequencingType = ctx.obj["sequencing_type"]
+    write_b_allele_output(variants, output_file, sequencing_type)
+
+
+def get_valid_variants(vcf_lines: List[str]) -> Dict:
+    """
+    Process VCF lines to extract valid variants.
+
+    Args:
+        vcf_lines (List[str]): List of VCF lines to be processed.
+
+    Returns:
+        dict: A dictionary containing valid variants with variant IDs as keys and variant info as values.
+
+    This function takes a list of VCF lines, processes them, and extracts valid variants.
+    Variants are stored in a dictionary with variant IDs as keys and variant information as values.
+    If any invalid variants are encountered, appropriate warnings are printed.
+
+    Example usage:
+        vcf_lines: List = [...]  # List of VCF lines
+        valid_variants: Dict = get_valid_variants(vcf_lines)
+    """
+    count_invalid_vars: int = 0
+    illegal_chromosomes: Dict = {}
+    variants = {}
+    variant_id: int = 0
+    for variant_line in vcf_lines:
+        if variant_line.startswith("#"):
+            continue
+
+        variant: Dict = extract_variant_info(variant_line)
+        if not variant:
+            count_invalid_vars += 1
+            continue
+
+        v_chrom = variant["chr"]
+        if v_chrom not in GENS_PARAMS["ALLOWED_CHR_LIST"]:
+            illegal_chromosomes[v_chrom] = illegal_chromosomes.get(v_chrom, 0) + 1
+            continue
+
+        variants[variant_id] = variant
+        variant_id += 1
+
+    if count_invalid_vars:
+        LOG.warning(
+            f"Warning: Can't calc AF for a number of variants: {count_invalid_vars}."
+        )
+    if illegal_chromosomes:
+        LOG.warning(
+            f"Warning: A number of variants have illegal chromosomes and will be skipped: {illegal_chromosomes}."
+        )
+    return variants
+
+
+def extract_variant_info(variant: str) -> Union[Dict, None]:
+    """
+    Extracts genetic variant information.
+
+    Args:
+        variant (str): Tab-separated string representing a genetic variant.
+
+    Returns:
+        dict or None: Dictionary with variant details ('chr', 'start', 'ref', 'alt', 'sample', 'af').
+        Returns None for uninformative samples or division by zero.
+
+    Raises:
+        ValueError: If variant string lacks expected fields.
+
+    Example:
+        variant_string = "chr1\t1000\t.\tA\tT\t.\t.\t.\tGT:AD:DP:GQ:PL\t0/1:10,5:15:45:45,0,20"
+        extract_variant_info(variant_string)
+        {'chr': 'chr1', 'start': '1000', 'ref': 'A', 'alt': 'T', 'sample': '0/1:10,5:15:45:45,0,20', 'af': 0.333333}
+    """
+    fields: List[str] = variant.split("\t")
+    try:
+        variant_info = {
+            "chr": fields[0],
+            "start": fields[1],
+            "ref": fields[3],
+            "alt": fields[4],
+            "sample": fields[9],
+        }
+    except IndexError:
+        error_message = "Invalid variant string format"
+        LOG.error(f"{error_message}")
+        raise ValueError(f"{error_message}")
+
+    if variant_info["sample"].split(":")[0] == "./.":
+        return None
+
+    try:
+        allele_depths: List[str] = variant_info["sample"].split(":")[1]
+        VD = int(allele_depths.split(",")[1])
+        DP = int(variant_info["sample"].split(":")[2])
+        variant_info["af"] = round(VD / DP, 6)
+
+    except (ValueError, ZeroDivisionError, IndexError):
+        return None
+
+    return variant_info
+
+
+def write_b_allele_output(
+    variants: Dict, output_file: Path, sequencing_type: SequencingType
+):
+    """
+    Writes B-allele frequency (BAF) output to a file for each level of GENS zoom specified in prefix of BAF_SKIP_N.
+
+    Args:
+        variants (Dict): A dictionary containing variant information. Each variant should have the following keys:
+            - 'start' (int): The start position of the variant.
+            - 'chr' (str): The chromosome where the variant is located.
+            - 'af' (float): The allele frequency of the variant.
+
+        output_file (Path): The file path where the BAF output will be written.
+
+        sequencing_type (SequencingType): The sequencing type used in analysis.
+
+    Returns:
+        None
+
+    Writes BAF information for each variant in the provided `variants` dictionary to the specified `output_file`.
+    The output is formatted as follows:
+        <prefix>_<chromosome>    <start>    <end>    <allele_frequency>
+
+    Note:
+        This function uses a predefined constant BAF_SKIP_N, which determines how many variants to skip before writing.
+    """
+    SEQ_GENS_PARAMS: Dict = GENS_PARAMS["SEQUENCING_TYPE"][sequencing_type]
+    BAF_SKIP_N: Dict = SEQ_GENS_PARAMS["BAF_SKIP_N"]
+
+    with open(output_file.as_posix(), "w") as baf_out:
+        for prefix, req_skip_count in BAF_SKIP_N.items():
+            skip_count = int(req_skip_count)
+            for variant_id in variants:
+                variant: Dict = variants[variant_id]
+                if skip_count == req_skip_count:
+                    v_start: int = int(variant["start"])
+                    v_chrom: str = variant["chr"]
+                    v_af: float = variant["af"]
+                    baf_out.write(
+                        f"{prefix}_{v_chrom}\t{v_start}\t{v_start + 1}\t{v_af}\n"
+                    )
+                    skip_count = 0
+                else:
+                    skip_count += 1
+
+
+@cli.command()
+@click.pass_context
+@click.option(
+    "-c",
+    "--normalised-coverage-path",
+    required=True,
+    type=click.Path(exists=True),
+    help="Input normalised coverage from GATK DenoiseReadCounts.",
+)
+def create_coverage_regions(ctx: click.Context, normalised_coverage_path: str) -> None:
+    """
+    Calculate coverage data.
+
+    Args:
+        normalised_coverage_path: Path to normalised coverage file.
+
+    Returns:
+        None
+    """
+    LOG.info("Creating coverage regions for GENS.")
+    normalised_coverage_path = Path(normalised_coverage_path)
+    output_file: Path = Path(ctx.obj["output_file"])
+    sequencing_type: SequencingType = ctx.obj["sequencing_type"]
+    SEQ_GENS_PARAMS: Dict = GENS_PARAMS["SEQUENCING_TYPE"][sequencing_type]
+    COV_REGION_SIZES: Dict = SEQ_GENS_PARAMS["COV_REGION_SIZES"]
+    with open(output_file.as_posix(), "w") as cov_out:
+        for prefix, region_size in COV_REGION_SIZES.items():
+            LOG.info(
+                f"Creating regions for prefix: {prefix}, region_size: {region_size}."
+            )
+            generate_cov_bed(normalised_coverage_path, region_size, prefix, cov_out)
+
+
+def write_coverage_region(
+    prefix: str,
+    region_chr: str,
+    region_start: int,
+    region_end: int,
+    reg_ratios: List[float],
+    cov_out: io.TextIOWrapper,
+) -> None:
+    """
+    Write coverage region information to an output file.
+
+    Args:
+        prefix (str): Prefix for the output.
+        region_chr (str): Chromosome for the region.
+        region_start (int): Start position of the region.
+        region_end (int): End position of the region.
+        reg_ratios (List[float]): List of log2 ratios.
+        cov_out (io.TextIOWrapper): Output file.
+
+    Returns:
+        None
+    """
+    mid_point = region_start + (region_end - region_start) // 2
+    cov_out.write(
+        f"{prefix}_{region_chr}\t{mid_point - 1}\t{mid_point}\t{mean(reg_ratios)}\n"
+    )
+
+
+def extract_coverage_line_values(coverage_line: str) -> Tuple[str, int, int, float]:
+    """
+    Extract coverage region values from a coverage line.
+
+    Args:
+        coverage_line (str): A line containing coverage and genomic position information.
+
+    Returns:
+        Tuple[str, int, int, float]: Extracted values (chr, start, end, log2_ratio).
+    """
+    # Extract coverage region values
+    chr_start_stop_ratio: List = coverage_line.strip().split("\t")
+    chrom: str = chr_start_stop_ratio[0]
+    start, end = int(chr_start_stop_ratio[1]), int(chr_start_stop_ratio[2])
+    log2_ratio: float = float(chr_start_stop_ratio[3])
+    return chrom, start, end, log2_ratio
+
+
+def generate_cov_bed(
+    normalised_coverage_path: Path,
+    region_size_requested: int,
+    prefix: str,
+    cov_out: io.TextIOWrapper,
+) -> None:
+    """
+    Merge coverage data into coverage regions for GENS.
+
+    Args:
+        normalised_coverage_path: Path to normalised coverage file.
+        region_size_requested: Size of the coverage region.
+        prefix: Prefix for the output.
+        cov_out: Output file.
+
+    Returns:
+        None
+    """
+
+    normalised_coverage: List = normalised_coverage_path.read_text().splitlines()
+    minimum_region_size: int = GENS_PARAMS["MINIMUM_REGION_SIZE"]
+
+    region_chrom, chrom, region_start, region_end, end, log2_ratio = [None] * 6
+    first_cov_line: bool = True
+    start_new_region: bool = False
+    for coverage_line in normalised_coverage:
+        if coverage_line.startswith("@") or coverage_line.startswith("CONTIG"):
+            continue
+
+        if first_cov_line or start_new_region:
+            (
+                region_chrom,
+                region_start,
+                region_end,
+                log2_ratio,
+            ) = extract_coverage_line_values(coverage_line)
+            reg_ratios: List = [log2_ratio]
+            first_cov_line: bool = False
+            start_new_region: bool = False
+            if region_size_requested == minimum_region_size:
+                write_coverage_region(
+                    prefix, region_chrom, region_start, region_end, reg_ratios, cov_out
+                )
+                start_new_region: bool = True
+            continue
+        else:
+            chrom, _, end, log2_ratio = extract_coverage_line_values(coverage_line)
+
+        region_size: int = end - region_start + 1
+        if region_size == region_size_requested:
+            # Region size matches requested region size
+            # Step 1: Write region from current line
+            # Step 2: Start new region from new line
+            reg_ratios.append(log2_ratio)
+            write_coverage_region(
+                prefix, region_chrom, region_start, end, reg_ratios, cov_out
+            )
+            start_new_region: bool = True
+            continue
+
+        if region_size > region_size_requested:
+            # Region size larger due to incomplete genome reference
+            # Step 1:  Write region from previous line
+            # Step 2: Start new region from current line
+            # Conceptual example: window_size = 300:
+            # start1 -- 100bases -- end1 + (region_start = start1, region_end = end1)
+            # start2 -- 100bases -- end2 + (region_start = start1, region_end = end2)
+            # -- gap 500 bases --
+            # start3 -- 100bases -- end3
+            write_coverage_region(
+                prefix, region_chrom, region_start, region_end, reg_ratios, cov_out
+            )
+            start_new_region: bool = True
+
+        if chrom != region_chrom:
+            # New chromosome:
+            # Step 1: Write region from previous line
+            # Step 2: Start new region from current line
+            write_coverage_region(
+                prefix, region_chrom, region_start, region_end, reg_ratios, cov_out
+            )
+            start_new_region: bool = True
+
+        if start_new_region:
+            (
+                region_chrom,
+                region_start,
+                region_end,
+                log2_ratio,
+            ) = extract_coverage_line_values(coverage_line)
+            reg_ratios: List = [log2_ratio]
+        else:
+            region_end: int = end
+            reg_ratios.append(log2_ratio)
+
+    # Output last line:
+    write_coverage_region(
+        prefix, region_chrom, region_start, region_end, reg_ratios, cov_out
+    )
+
+
+if __name__ == "__main__":
+    cli()