## Description

Simulate reads with SNPs, indels and sequencing errors.

## Data & modules

In [None]:
from Bio import SeqIO, AlignIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import numpy.random as random
from scipy.stats import geom, binom, poisson
import numpy as np
from copy import deepcopy
from pysam import VariantFile, FastaFile


In [2]:
seed = 42  
random.seed(seed)
np.random.seed(seed)

NameError: name 'random' is not defined

In [129]:
genes = ['PKD1'] + ['PKD1P%i' % i for i in range(1, 7)]
NUCL = ['A', 'C', 'T', 'G']
NUCL_extd = NUCL + ['N']

In [130]:
refseq_list = list(SeqIO.parse('reference_seqs.fa', 'fasta'))
refseq_dict = {s.id: s for s in refseq_list}

In [131]:
refseq_lens = [len(refseq_dict[g]) for g in genes]
refseq_len_dict = {g: len(refseq_dict[g]) for g in genes}

## Parameters

In [132]:
# Genomic
SNP_prob = 0.001
INDEL_prob = 0.0001  # joint for insertions and deletions
INDEL_len_mean = 2
# Sequencing
ERR_prob = 0.01
FRAGMENT_LEN_mean = 350

## VCF file setup

In [133]:
# Create a new VCF file with header
vcf_header = VariantFile("reference_mutations.vcf", 'w')

# Set up the header
header = vcf_header.header

# Add the required fields to header to match FreeBayes format
header.add_line('##fileformat=VCFv4.2')
header.add_line('##FILTER=<ID=PASS,Description="All filters passed">')
header.add_line('##reference=refseq.fa')

# Add contigs to header
for gname, length in refseq_len_dict.items():
    header.add_line(f'##contig=<ID={gname},length={length}>')

# Open the reference
reference = FastaFile("reference_seqs.fa")

def create_vcf_record(vcf_header, gname, pos, ref, alt, mut_type):
    # Helper function for VCF writing - simplified format
    record = vcf_header.new_record()
    record.chrom = gname
    record.pos = pos + 1  # VCF is 1-based
    record.id = None
    record.ref = ref
    record.alts = (alt,)
    record.qual = None
    record.filter.add('PASS')
    vcf_header.write(record)

## Mutate the reference genome, track mutations and write VCF file

In [None]:
# Mutate the reference genome, track mutations and write VCF file
mutseq_list = []
for gname in genes:
    print('Gene', gname)
    gseq = refseq_dict[gname]
    print('Initial length:', len(gseq))
    
    # Store original sequence for reference
    ref_seq = str(gseq.seq)
    # Convert the gene sequence to a list for mutability
    gseq = list(gseq.seq)
    
    # Add insertions
    insert_nb = binom.rvs(n=refseq_len_dict[gname], p=INDEL_prob/2)
    for _ in range(insert_nb):
        insert_loc = rd.choice(range(len(gseq)), replace=False)
        insert_size = max(1, geom.rvs(p=1/INDEL_len_mean))
        insert_seq = rd.choice(NUCL, size=insert_size)
        
        # FreeBayes format: Include preceding base in both REF and ALT
        if insert_loc > 0:
            preceding_base = ref_seq[insert_loc-1]
            ref = preceding_base
            alt = preceding_base + ''.join(insert_seq)
            create_vcf_record(vcf_header, gname, insert_loc-1, ref, alt, 'INS')
        else:
            # If at position 0, use standard format
            ref_base = ref_seq[insert_loc]
            alt = ref_base + ''.join(insert_seq)
            create_vcf_record(vcf_header, gname, insert_loc, ref_base, alt, 'INS')
            
        gseq[insert_loc:insert_loc] = insert_seq
    print('Length after insertions:', len(gseq))
    
    # Add deletions
    delete_nb = binom.rvs(n=refseq_len_dict[gname], p=INDEL_prob/2)
    for _ in range(delete_nb):
        delete_loc = rd.choice(range(len(gseq) - 1))
        delete_size = max(1, min(geom.rvs(p=1/INDEL_len_mean), len(gseq) - delete_loc - 1))
        
        # FreeBayes format: Position is reported as the base before the deletion
        # and both REF and ALT include this base
        if delete_loc > 0 and delete_loc + delete_size < len(ref_seq):
            delete_pos = delete_loc - 1
            # Get the preceding base plus the deleted sequence
            ref = ref_seq[delete_pos:delete_pos + delete_size + 1]
            # ALT is just the preceding base
            alt = ref_seq[delete_pos]
            
            # Only create record if ref and alt are different
            if len(ref) > len(alt):
                create_vcf_record(vcf_header, gname, delete_pos, ref, alt, 'DEL')
                gseq[delete_loc:delete_loc + delete_size] = []
        elif delete_loc + delete_size < len(ref_seq):
            # Handle case where deletion is at the start
            ref = ref_seq[delete_loc:delete_loc + delete_size + 1]
            alt = ref_seq[delete_loc]
            if len(ref) > len(alt):
                create_vcf_record(vcf_header, gname, delete_loc, ref, alt, 'DEL')
                gseq[delete_loc:delete_loc + delete_size] = []
    print('Length after deletions:', len(gseq))
    
    # Add SNPs
    is_mutated = rd.choice(2, size=len(gseq), p=[1-SNP_prob, SNP_prob])
    for pos, (should_mutate, ref_base) in enumerate(zip(is_mutated, ref_seq)):
        if should_mutate and pos < len(gseq):
            possible_bases = [b for b in NUCL if b != ref_base]
            if possible_bases:
                new_base = rd.choice(possible_bases)
                create_vcf_record(vcf_header, gname, pos, ref_base, new_base, 'SNP')
                gseq[pos] = new_base
    
    # Create a SeqRecord object and append
    gseq = ''.join(gseq)
    gseq = Seq(gseq)
    gseq = SeqRecord(gseq, id=gname, description='', name='')
    mutseq_list.append(gseq)

vcf_header.close()
reference.close()

## Sort the VCF file by gene and position, and validate and normalise its format using bcftools norm command. -f compares it and validates against the reference, - c w outputs any warnings and errors

In [None]:
!bcftools sort reference_mutations.vcf | bcftools norm -f  reference_seqs.fa -c w -o reference_mutations_normalised.vcf

In [136]:
mutseq_lens = [len(s) for s in mutseq_list]

## Simulate reads
Using mutseq_list as reference

In [137]:
N_reads = 100000

In [138]:
r1_list = []
r2_list = []
# Select genes to simulate reads from 
selected_genes = rd.choice(len(genes), size=N_reads)  
start_locations = []
for rid, gid in enumerate(selected_genes):
    # Sample the DNA fragment length
    # We use a shifted geometric distribution with minimum value = 150, mean value = FRAGMENT_LEN_mean
    FRAGLEN = 150 + geom.rvs(p=1/(FRAGMENT_LEN_mean-150))
    # Choose the read starting location. We just sample uniformly, this could be made 
    # more realistically but whatever. Note that this depends on FRAGLEN.
    START = rd.choice(mutseq_lens[gid]-FRAGLEN+1)
    read_description = genes[gid] + '; ' + str(START)+'-'+str(START+FRAGLEN)
    # Take the genome sequences, add sequencing errors.
    # r1 is the forward read, r2 is the backward read
    r1 = list(mutseq_list[gid][START:(START+150)])
    is_mutated1 = rd.choice(2, size=len(r1), p=[1-ERR_prob, ERR_prob])
    r1 = [rd.choice(NUCL) if mut else c for c,mut in zip(r1, is_mutated1)]
    r1 = SeqRecord(Seq(''.join(r1)), id = 'Read'+str(rid), name='', description = read_description)
    r1.letter_annotations['phred_quality'] = [42]*150
    assert len(r1) == 150
    # Note: we reverse complement r2
    r2 = list(mutseq_list[gid][(START+FRAGLEN-150):(START+FRAGLEN)].reverse_complement())
    is_mutated2 = rd.choice(2, size=len(r2), p=[1-ERR_prob, ERR_prob])
    r2 = [rd.choice(NUCL) if mut else c for c,mut in zip(r2, is_mutated2)]
    r2 = SeqRecord(Seq(''.join(r2)), id = 'Read'+str(rid), name='', description = read_description)
    r2.letter_annotations['phred_quality'] = [42]*150
    assert len(r2) == 150
    r1_list.append(r1)
    r2_list.append(r2)

## Save the reads and SNP locations

In [139]:
with open('simulated_r1.fq', 'w') as h:
    SeqIO.write(r1_list, h, 'fastq')
with open('simulated_r2.fq', 'w') as h:
    SeqIO.write(r2_list, h, 'fastq')