Skip to content

Commit

Permalink
Attempt to fix #3 issue with mismatching data type
Browse files Browse the repository at this point in the history
Closes #3.
  • Loading branch information
danpolanco committed Jun 7, 2024
1 parent 50b1e02 commit 48f0d4a
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 44 deletions.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Pylint
name: Code Analysis

on: [push]

Expand All @@ -7,7 +7,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10"]
python-version: ["3.12"]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
Expand All @@ -17,7 +17,10 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install pylint
- name: Analysing the code with pylint
run: |
pylint $(git ls-files '*.py')
pip install ruff mypy pandas-stubs
- name: Analyze code with ruff
run: ruff check .
- name: Check code format with ruff
run: ruff format --check .
- name: Analyze code with mypy
run: mypy .
117 changes: 80 additions & 37 deletions scripts/calc_percent_coverage.py
Original file line number Diff line number Diff line change
@@ -1,70 +1,113 @@
#! /usr/bin/env python

"""Calculate percent coverage."""

import argparse
import sys
import pandas as pd
import logging
import numpy as np
from datetime import date
import pandas as pd
import re
import sys

from Bio import SeqIO

__author__ = "CDPHE"
__copyright__ = "State of Colorado"
__license__ = "GPL-3.0.0-or-later"

log = logging.getLogger(__name__)


def parse_args(args: list[str]) -> argparse.Namespace:
    """Parse command line arguments.

    Args:
        args: Argument strings to parse, typically ``sys.argv[1:]``.

    Returns:
        Namespace with the attributes ``log_level``, ``sample_name``,
        ``fasta_file`` and ``reference_file``. ``log_level`` defaults to
        ``"INFO"``; the other three are ``None`` when not supplied.
    """
    # One parser, one registration per option: registering the same option
    # string twice makes argparse raise a conflicting-option error.
    parser = argparse.ArgumentParser(description="Calculate percent coverage.")

    parser.add_argument(
        "--log_level",
        help="the level to log at",
        choices=["CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"],
        default="INFO",
    )
    parser.add_argument("--sample_name", help="sample name")
    parser.add_argument("--fasta_file", help="fasta file")
    parser.add_argument("--reference_file", help="reference file")

    return parser.parse_args(args)


def calculate_percent_coverage(sample_name, fasta_file, reference_file):
def setup_logging(log_level: str) -> None:
    """Configure logging so timestamped records are written to stdout.

    Args:
        log_level: Name of the logging threshold (for example ``"INFO"``),
            passed unchanged to ``logging.basicConfig``.
    """
    logging.basicConfig(
        format="[%(asctime)s] %(levelname)s:%(name)s:%(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        stream=sys.stdout,
        level=log_level,
    )


def calculate_percent_coverage(
    sample_name: str, fasta_file: str, reference_file: str
) -> None:
    """Calculate percent coverage for one sample and write it to a CSV file.

    Writes ``<sample_name>_consensus_cvg_stats.csv`` (a relative path, so it
    lands in the current working directory) containing a single row with the
    columns ``sample_name``, ``aligned_bases``, ``N_bases``,
    ``non_ambiguous_bases`` and ``percent_coverage``.

    When ``fasta_file`` does not hold exactly one record, every numeric
    column is written as NaN.

    Args:
        sample_name: Sample identifier; stored in the first column and used
            as the prefix of the output file name.
        fasta_file: Path to the FASTA file with the sample's sequence.
        reference_file: Path to a FASTA file with the reference sequence;
            it is read with ``SeqIO.read``.
    """
    # Count records by header lines instead of by every ">" character, so a
    # ">" inside a header description is not mistaken for an extra record.
    with open(fasta_file, "r") as fasta_handle:
        num_records = sum(1 for line in fasta_handle if line.startswith(">"))

    ref_genome = SeqIO.read(reference_file, "fasta")
    ref_genome_length = len(ref_genome.seq)

    # Declared as float so the NaN placeholders type-check without
    # "type: ignore"; the counts stay plain ints at runtime when computable.
    aligned_bases: float
    n_bases: float
    num_non_ambiguous_bases: float
    coverage: float

    if num_records == 1:
        record = SeqIO.read(fasta_file, "fasta")
        aligned_bases = len(record.seq)
        if aligned_bases == 0:
            # Empty sequence: the whole reference counts as missing.
            n_bases = ref_genome_length
            num_non_ambiguous_bases = 0
            coverage = 0
        else:
            uncorrected_ns = record.seq.count("N")
            # Positions absent from the sequence are counted as N as well.
            n_bases = (ref_genome_length - aligned_bases) + uncorrected_ns
            num_non_ambiguous_bases = aligned_bases - uncorrected_ns
            coverage = round((1 - (n_bases / ref_genome_length)) * 100, 2)
    else:
        # np.nan is the supported spelling; the np.NaN alias was removed in
        # NumPy 2.0 and raises AttributeError there.
        aligned_bases = np.nan
        n_bases = np.nan
        num_non_ambiguous_bases = np.nan
        coverage = np.nan

    df = pd.DataFrame(
        {
            "sample_name": [sample_name],
            "aligned_bases": [aligned_bases],
            "N_bases": [n_bases],
            "non_ambiguous_bases": [num_non_ambiguous_bases],
            "percent_coverage": [coverage],
        }
    )

    outfile = f"{sample_name}_consensus_cvg_stats.csv"
    df.to_csv(outfile, index=False)


def main(args: argparse.Namespace) -> None:
    """Configure logging, then compute and write the coverage statistics.

    Args:
        args: Parsed command line options; the attributes ``log_level``,
            ``sample_name``, ``fasta_file`` and ``reference_file`` are read.
    """
    setup_logging(log_level=args.log_level)

    # Each keyword is passed exactly once, taken from the parsed namespace.
    calculate_percent_coverage(
        sample_name=args.sample_name,
        fasta_file=args.fasta_file,
        reference_file=args.reference_file,
    )


if __name__ == "__main__":
    # Script entry point: parse the command line and hand off to main().
    main(args=parse_args(sys.argv[1:]))
3 changes: 2 additions & 1 deletion scripts/summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@


def parse_args(args: list[str]) -> argparse.Namespace:
"""Parses command line arguments."""
"""Parse command line arguments."""
parser = argparse.ArgumentParser(description="Sequencing results summary.")

parser.add_argument(
"--log_level",
help="the level to log at",
Expand Down

0 comments on commit 48f0d4a

Please sign in to comment.