Skip to content

Commit

Permalink
Merge pull request #14 from AlfonsoJan/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
AlfonsoJan committed Mar 14, 2024
2 parents 1300402 + 38360c6 commit 86f1192
Show file tree
Hide file tree
Showing 4 changed files with 192 additions and 34 deletions.
4 changes: 4 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

All notable changes to this project will be documented in this file.

## [1.0.5] - 2024-03-14

* Made the rust code faster by not opening the ref genome file for each mutation but read the mutation per chromosome

## [1.0.4] - 2024-02-21

* Fix for the reverse complementary translating
Expand Down
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "sbsgenerator"
version = "1.0.4"
version = "1.0.5"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
Expand Down
218 changes: 186 additions & 32 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,15 @@ use pyo3::types::PyList;
use pyo3::Python;
use std::collections::HashMap;
use std::fs::File;
use std::io::{BufRead, BufReader, Read, Seek, SeekFrom};
use std::io::{BufRead, BufReader, Read, Seek, SeekFrom, Write};

/// Parses VCF files and generates a NumPy array containing the parsed data.
///
/// # Arguments
///
/// * `py` - A Python interpreter session.
/// * `vcf_files` - A list of paths to VCF files.
/// * `ref_genome` - A string indicating the reference genome.
/// * `ref_genome_path` - A string indicating the reference genome.
/// * `context` - An integer specifying the context size.
///
/// # Returns
Expand All @@ -34,18 +34,19 @@ use std::io::{BufRead, BufReader, Read, Seek, SeekFrom};
fn parse_vcf_files(
py: Python,
vcf_files: &PyList,
ref_genome: &str,
ref_genome_path: &str,
context: usize,
) -> PyResult<PyObject> {
// All the collected data from the VCF files
let all_data = read_and_collect_vcf_data(vcf_files, ref_genome, context)?;
let all_data = read_and_collect_vcf_data(vcf_files, ref_genome_path, context)?;
// Reshape the data into a 2D array
let s = all_data.len();
let t = if !all_data.is_empty() {
all_data[0].len()
} else {
0
};
// Convert the data to Python objects
let py_objects: Vec<PyObject> = all_data
.into_iter()
.flatten()
Expand All @@ -63,7 +64,7 @@ fn parse_vcf_files(
/// # Arguments
///
/// * `vcf_files` - A list of paths to VCF files.
/// * `ref_genome` - A string indicating the reference genome.
/// * `ref_genome_path` - A string indicating the reference genome.
/// * `context` - An integer specifying the context size.
///
/// # Returns
Expand All @@ -75,40 +76,173 @@ fn parse_vcf_files(
/// Returns an I/O error if there is an issue reading the VCF files.
fn read_and_collect_vcf_data(
vcf_files: &PyList,
ref_genome: &str,
ref_genome_path: &str,
context: usize,
) -> Result<Vec<Vec<String>>, PyErr> {
let mut all_data = Vec::new();

// Read and collect the data from each VCF file
for vcf_file in vcf_files {
let vcf_file_path: String = vcf_file.extract()?;
let data = read_vcf_file_contents(&vcf_file_path, &ref_genome, &context)?;
let data = read_vcf_file_contents(&vcf_file_path)?;
all_data.extend(data);
}

let genome_chromosome_indices = process_vcf_data(&all_data);
update_all_data_based_on_indices(
&mut all_data,
&genome_chromosome_indices,
ref_genome_path,
context,
)?;
Ok(all_data)
}

/// Reads the contents of a single VCF file and processes the data.
/// Updates the collected data based on chromosome indices to include contextual information.
///
/// # Arguments
/// * `all_data` - Mutable reference to all collected data to be updated.
/// * `genome_chromosome_indices` - Mapping from genome and chromosome to indices in `all_data`.
/// * `ref_genome_path` - Path to the reference genome directory.
/// * `context` - Number of base pairs to include around the mutation for context.
///
/// * `vcf_file` - Path to the VCF file.
/// * `ref_genome` - A string indicating the reference genome.
/// * `context` - An integer specifying the context size.
/// # Returns
/// Ok if the update is successful, Err otherwise.
fn update_all_data_based_on_indices(
all_data: &mut Vec<Vec<String>>,
genome_chromosome_indices: &HashMap<String, HashMap<String, Vec<usize>>>,
ref_genome_path: &str,
context: usize,
) -> Result<(), PyErr> {
let grand_total_indices: usize = genome_chromosome_indices
.values()
.map(|chromosomes_indices| {
chromosomes_indices
.values()
.map(|indices| indices.len())
.sum::<usize>()
})
.sum();
let mut processed_indices_count = 0;
let n = 1000; // Update frequency

for (ref_genome, chromosomes_indices) in genome_chromosome_indices {
for (chromosome, indices) in chromosomes_indices {
for &index in indices {
processed_indices_count += 1;
update_progress(processed_indices_count, grand_total_indices, n);
process_mutation_row(
index,
all_data,
ref_genome,
chromosome,
ref_genome_path,
context,
)?;
}
}
}
println!();
Ok(())
}

/// Processes a specific index in the all_data vector based on the chromosome indices.
///
/// # Arguments
/// * `index` - The index in `all_data` to be processed.
/// * `all_data` - The collected data to be updated.
/// * `ref_genome` - The reference genome identifier.
/// * `chromosome` - The chromosome identifier.
/// * `ref_genome_path` - Path to the reference genome directory.
/// * `context` - Number of base pairs to include around the mutation for context.
///
/// # Returns
/// Ok if the mutation data at the specified index is successfully processed, Err otherwise.
fn process_mutation_row(
index: usize,
all_data: &mut Vec<Vec<String>>,
ref_genome: &str,
chromosome: &str,
ref_genome_path: &str,
context: usize,
) -> Result<(), PyErr> {
if let Some(row) = all_data.get(index) {
let total_path = format!("{}/{}/{}.txt", ref_genome_path, ref_genome, chromosome);
let translate_complement = row[4].parse::<bool>().unwrap();
let position = row[1].parse::<usize>().unwrap();
let (left, right) =
read_bytes_file_contents(&total_path, position, &context, translate_complement)?;
let translated_reference = row[5].clone();
let translated_alternate = row[6].clone();
let new_mutation_type = format!(
"{}[{}>{}]{}",
left, translated_reference, translated_alternate, right
);
all_data[index] = vec![row[0].clone(), new_mutation_type]
.iter()
.map(|s| s.into())
.collect();
}
Ok(())
}

/// Updates the progress of data processing to the console.
///
/// A vector containing the processed data.
/// # Arguments
/// * `processed` - The number of items processed so far.
/// * `total` - The total number of items to process.
/// * `n` - The frequency of progress updates (e.g., update every `n` items).
///
/// # Errors
/// Displays the current progress as a percentage of the total work done.
fn update_progress(processed: usize, total: usize, n: usize) {
let current_percentage = (processed as f64 / total as f64 * 100.0).round() as usize;
if processed % n == 0 || processed == total {
print!(
"\rProcessed: {}% ({}/{})",
current_percentage, processed, total
);
std::io::stdout().flush().unwrap();
}
}

/// Processes the collected VCF data to map genome and chromosomes to indices in the data vector.
///
/// Returns an I/O error if there is an issue reading the VCF file.
fn read_vcf_file_contents(
vcf_file: &str,
ref_genome: &str,
context: &usize,
) -> Result<Vec<Vec<String>>, std::io::Error> {
/// # Arguments
/// * `all_data` - The collected data from VCF files.
///
/// # Returns
/// A mapping from reference genomes and chromosomes to indices in `all_data`.
fn process_vcf_data(all_data: &Vec<Vec<String>>) -> HashMap<String, HashMap<String, Vec<usize>>> {
let mut genome_chromosome_indices: HashMap<String, HashMap<String, Vec<usize>>> =
HashMap::new();
let ref_genome_index = 2;
let chromosome_index = 3;

for (row_index, row) in all_data.iter().enumerate() {
if let (Some(ref_genome), Some(chromosome)) =
(row.get(ref_genome_index), row.get(chromosome_index))
{
genome_chromosome_indices
.entry(ref_genome.clone())
.or_default()
.entry(chromosome.clone())
.or_default()
.push(row_index);
}
}

genome_chromosome_indices
}

/// Reads the contents of a specified VCF file and collects relevant mutation data.
///
/// # Arguments
/// * `vcf_file` - The path to the VCF file to be read.
///
/// # Returns
/// A vector of vectors, where each inner vector contains strings representing collected mutation data.
///
/// # Errors
/// Returns an I/O error if the file cannot be read.
fn read_vcf_file_contents(vcf_file: &str) -> Result<Vec<Vec<String>>, std::io::Error> {
let nucleotides = vec!["A", "C", "G", "T"];
let translate_purine_to_pyrimidine: HashMap<char, char> =
[('A', 'T'), ('G', 'C')].iter().cloned().collect();
Expand All @@ -118,10 +252,8 @@ fn read_vcf_file_contents(
.cloned()
.collect();
let mut data = Vec::new();

let file = File::open(vcf_file)
.map_err(|_| pyo3::exceptions::PyIOError::new_err("Failed to open the VCF file"))?;

for line in BufReader::new(file).lines() {
let line = line.map_err(|_| {
pyo3::exceptions::PyIOError::new_err("Error reading line from the VCF file")
Expand All @@ -131,9 +263,9 @@ fn read_vcf_file_contents(
if fields.len() >= 10 {
let reference_genome = fields[3];
let mutation_type = fields[4];
let chromosome = fields[5];
let reference_allele = fields[8];
let alternate_allele = fields[9];


let translated_alternate: String = if (reference_allele == "A"
|| reference_allele == "G")
Expand All @@ -157,18 +289,19 @@ fn read_vcf_file_contents(
&& (translated_reference != translated_alternate)
{
let position = fields[6].parse::<usize>().unwrap() - 1;
let total_path = format!("{}/{}/{}.txt", ref_genome, reference_genome, fields[5]);
let (left, right) = read_bytes_file_contents(&total_path, position, context, translate_complement)?;
let sample = format!("{}::{}", fields[0], fields[1]);
let new_mutation_type = format!(
"{}[{}>{}]{}",
left, translated_reference, translated_alternate, right
);
data.push(vec![sample, new_mutation_type]);
data.push(vec![
sample,
position.to_string(),
reference_genome.to_string(),
chromosome.to_string(),
translate_complement.to_string(),
translated_reference,
translated_alternate,
]);
}
}
}

Ok(data)
}

Expand Down Expand Up @@ -247,16 +380,37 @@ fn read_bytes_file_contents(
Ok((left, right))
}

/// Generates the reverse complement of a nucleotide sequence.
///
/// # Arguments
/// * `nucleotide_str` - The nucleotide sequence for which to generate the reverse complement.
///
/// # Returns
/// A string representing the reverse complement of the input sequence.
fn reverse_complement(nucleotide_str: &str) -> String {
let reversed = reverse_string(nucleotide_str);
let complemented: String = reversed.chars().map(complement).collect();
complemented
}

/// Reverses a string.
///
/// # Arguments
/// * `s` - The string to be reversed.
///
/// # Returns
/// The reversed string.
fn reverse_string(s: &str) -> String {
s.chars().rev().collect()
}

/// Returns the complement of a given nucleotide.
///
/// # Arguments
/// * `nucleotide` - The nucleotide (A, T, C, or G) for which to find the complement.
///
/// # Returns
/// The complement nucleotide (A<>T, C<>G).
fn complement(nucleotide: char) -> char {
match nucleotide {
'A' => 'T',
Expand Down

0 comments on commit 86f1192

Please sign in to comment.