# Description

In [1]:
#1 get GC content from contigs
    #get GC content in set chunk sizes
    
#2 get coverage for contigs
    #get coverage in chunk sizes

#3 blob plot gc vs average coverage

#4 plot gc content/coverage for each contig (if significant in some way)

#5 kmeans clusters

In [2]:
import pandas as pd
import numpy as np
import csv
import os

# Parameters

In [3]:
assembly = "./data/final.p_ctg_test.fasta"
coverage_per_contig = "./data/heliopora_coerulea_hifi_2_p_assembly.coverage"
coverage_per_base = "./data/heliopora_coerulea_hifi_2_p_assembly_test.depth"


# Naive GC content for each content

In [4]:
def get_gc_content(fasta_file):
    """
    Reads in a FASTA file and returns a dictionary mapping contig IDs to their GC content
    """
    gc_content = {}
    current_id = None
    current_seq = ""
    
    with open(fasta_file, "r") as f:
        for line in f:
            if line.startswith(">"):
                # If this is a new contig, calculate the GC content for the previous one (if there was one)
                if current_id is not None:
                    gc_content[current_id] = (current_seq.count("G") + current_seq.count("C")) / len(current_seq)
                
                # Start the new contig
                current_id = line.strip()[1:]
                current_seq = ""
            else:
                current_seq += line.strip()
    
    # Calculate GC content for the final contig
    gc_content[current_id] = (current_seq.count("G") + current_seq.count("C")) / len(current_seq)
    
    return gc_content

In [5]:
naive_gc = get_gc_content(assembly)
print(naive_gc)

{'ctg/p/l/000409/0': 0.4361125376490347, 'ctg/p/l/000412/0': 0.3682002202177709}


# GC content for each contig by averaging in given chunk sizes

In [26]:
def get_gc_content_chunks(fasta_file, chunk_size):
    """
    Reads in a FASTA file and returns a dictionary mapping contig IDs to a list of gc content per chunk size
    """
    gc_content = {}
    current_id = None
    current_seq = ""
    
    with open(fasta_file, "r") as f:
        for line in f:
            if line.startswith(">"):
                chunks = []
                # If this is a new contig, calculate the GC content for the previous one (if there was one)
                if current_id is not None:
                    i = 0
                    while i < len(current_seq):
                        
                        #check for last
                        if (i + chunk_size) > len(current_seq):
                            end = len(current_seq)
                        else:
                            end = i + chunk_size
                        sub = current_seq[i:end]  
                        #print("chunk length is " + str(i) + " : " + str(end))
                        gc = (sub.count("G") + sub.count("C")) / chunk_size
                        chunks.append(gc)
                        #add to dictionary in form {current_id : [array of GC content of given chunk size]}
                        gc_content[current_id] = chunks
                        i = end
                
                # Start the new contig
                current_id = line.strip()[1:]
                current_seq = ""
            else:
                current_seq += line.strip()
                print("length of current sequence " + str(len(current_seq)))
                print(round(len(current_seq)/chunk_size))
        
    return gc_content

In [25]:
chunks = get_gc_content_chunks(assembly, 10000)
print(chunks)

length of current sequence 360235
36
length of current sequence 359644
36
length of current sequence 359644
36
{'ctg/p/l/000409/0': [0.4599, 0.404, 0.3717, 0.4597, 0.4598, 0.4354, 0.3869, 0.4369, 0.4386, 0.4352, 0.4407, 0.4418, 0.4431, 0.4429, 0.4387, 0.4448, 0.4506, 0.4588, 0.4525, 0.3831, 0.401, 0.4619, 0.4554, 0.4494, 0.3779, 0.4361, 0.4385, 0.4445, 0.4427, 0.4433, 0.4413, 0.4394, 0.4441, 0.4481, 0.4441, 0.4473, 0.0102]}
