In [22]:
from pyjaspar import jaspardb
import pyjaspar.jaspar
import requests
import xml.etree.ElementTree as ET
from src.cluster_description import Cluster, ClustersDescription
import os

def extract_sequence_from_position(chromosome : str, start : int, end : int, timeout : float = 30.0) -> str:
    """
    Returns the sequence from the given position in the given chromosome using UCSC.

    The hg19 DAS endpoint is queried with ``segment=chromosome:start,end`` and the
    text of the first ``DNA`` element of the XML reply is returned with all
    whitespace removed (the payload may be wrapped over several lines).

    Args:
        chromosome: Chromosome name as used in the segment query (e.g. "chr16").
        start: Start coordinate placed in the ``segment`` parameter.
        end: End coordinate placed in the ``segment`` parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The sequence as one string without any whitespace.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the reply contains no non-empty ``DNA`` element.
    """

    url = "https://genome.ucsc.edu/cgi-bin/das/hg19/dna?segment={}:{},{}".format(chromosome, start, end)

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, timeout=timeout)
    # Fail on HTTP errors here instead of trying to parse an error page as XML.
    response.raise_for_status()

    # Decode the response as XML
    root = ET.fromstring(response.text)

    # First DNA element in document order (the root itself is considered too).
    dna = next(root.iter("DNA"), None)
    raw_text = dna.text if dna is not None and dna.text else ""

    # split()/join drops line breaks inside the payload, not just at its ends.
    sequence = "".join(raw_text.split())
    if not sequence:
        raise ValueError(
            "No DNA sequence found in the UCSC response for {}:{},{}".format(chromosome, start, end)
        )
    return sequence

def _cache_file_from_position(chromosome : str, start : int, end : int) -> str:    
    return "cache/{}_{}_{}.fa".format(chromosome, start, end)

class SequenceUtils:
    """ Static helpers to fetch genomic sequences (with an on-disk cache in "cache") and to count motifs in them. """

    @staticmethod
    def _strip_whitespace(text : str) -> str:
        """ Removes every whitespace character (including line breaks) from the given text. """
        return "".join(text.split())

    @staticmethod
    def get_coordinates(chromosome : str, start : int, end : int) -> tuple[str, bool]:
        """ Checks if the given coordinates are cached and returns the sequence if they are, otherwise it downloads the sequence from UCSC and caches it.
        Returns a tuple with the sequence and a boolean indicating whether the sequence was cached or not.

        The sequence is returned without whitespace on both paths, so a cache hit and a
        fresh download of the same coordinates yield the same string.

        Raises ValueError if the download yields no sequence; nothing is written to the cache in that case. """

        cached_file = _cache_file_from_position(chromosome, start, end)

        # Try to retrieve it from the cache first.
        if os.path.isfile(cached_file):
            with open(cached_file, "r") as f:
                cached_sequence = SequenceUtils._strip_whitespace(f.read())
            # An empty file (e.g. left behind by an interrupted run) is treated as a miss.
            if cached_sequence:
                return cached_sequence, True

        # Not cached: download first, so a failed download never leaves a file behind.
        downloaded = extract_sequence_from_position(chromosome, start, end)
        if not downloaded:
            raise ValueError(
                "No sequence could be downloaded for {}:{}-{}".format(chromosome, start, end)
            )
        sequence = SequenceUtils._strip_whitespace(downloaded)

        # Make sure the cache directory exists before writing into it.
        os.makedirs("cache", exist_ok=True)
        with open(cached_file, "w") as f:
            f.write(sequence)
        return sequence, False

    @staticmethod
    def query_human_motifs():
        """ Fetches the motifs of species "9606" from the JASPAR2022 release and returns them. """
        jaspar_database = jaspardb(release="JASPAR2022")
        motifs = jaspar_database.fetch_motifs(species=["9606"])
        return motifs

    @staticmethod
    def query_cluster_sequence(chromosome : str, cluster : "Cluster") -> list[str]:
        """ Returns one sequence per (start, end) pair yielded by iterating over the given cluster.

        Each pair is resolved through get_coordinates, i.e. read from the cache when
        possible and downloaded otherwise. A size estimate based on cluster.total_length
        is printed before the loop, and the number of cache hits after it. """
        out = []

        print("The following action will generate approximately {:.1f} MB of data. Please wait...".format(cluster.total_length / (1024 * 1024)))
        cached_num = 0

        for start, end in cluster:
            seq, cached = SequenceUtils.get_coordinates(chromosome, start, end)
            if cached:
                cached_num += 1
            out.append(seq)

        # The counter holds the sequences that were served from the cache.
        print("{} sequences were cached.".format(cached_num))

        return out

    @staticmethod
    def clear_cache():
        """ Deletes every file in the "cache" directory and then the directory itself; does nothing if it does not exist. """
        if os.path.exists("cache"):
            for file in os.listdir("cache"):
                os.remove(os.path.join("cache", file))
            os.rmdir("cache")

    @staticmethod
    def count_motif(motif : str, sequence : str) -> int:
        """ Counts the number of times the given motif occurs in the given sequence.

        Matching is exact and case-sensitive, and occurrences are counted without
        overlap (str.count semantics). An empty motif counts as 0 occurrences. """

        # str.count("") would report len(sequence) + 1 matches, which is meaningless here.
        if not motif:
            return 0
        return sequence.count(motif)

    @staticmethod
    def count_all_motifs(sequence : str) -> dict[str, int]:
        """ Counts the number of times all the motifs occur in the given sequence.

        For every motif returned by query_human_motifs, the exact occurrences of its
        consensus sequence are counted, ignoring letter case. Returns a dictionary that
        maps the motif name to that count.

        NOTE(review): exact consensus matching is only a rough proxy for motif hits; a
        position-weight-matrix scan would be needed for scored matches.
        NOTE(review): if several motifs share a name, the last one wins - consider
        keying by matrix id if that matters. """

        motifs = SequenceUtils.query_human_motifs()

        # Normalise case once; downloaded sequences may be lower-case while consensus strings are upper-case - TODO confirm.
        haystack = sequence.upper()

        out = {}
        for motif in motifs:
            # Search for the motif's consensus sequence; its name is only a label and never occurs in DNA.
            consensus = str(motif.consensus).upper()
            out[motif.name] = SequenceUtils.count_motif(consensus, haystack)
        return out

In [23]:
# Chromosome under study; it is passed both to ClustersDescription and to the sequence query in the next cell.
chromo = "chr16"
# Cluster descriptions built from the JSON results file for this chromosome
# (ClustersDescription is project code; how it parses the file is not shown here).
clusters = ClustersDescription("../data/clusters/HMEC/chr16_spec_res.json", chromo)
# Key used to select one cluster from `clusters` in the next cell.
CLUSTER_NAME = "50kb_13_78_85050000_85650000"

In [24]:
# Fetch one sequence per (start, end) interval of the selected cluster (read from the cache when present, downloaded otherwise).
sequences = SequenceUtils.query_cluster_sequence(chromo, clusters[CLUSTER_NAME])
# Concatenate the per-interval sequences, in iteration order, into a single string.
full_sequence = "".join(sequences)
# Total number of characters fetched for the cluster.
print(len(full_sequence))

The following action will generate approximately 0.6 MB of data. Please wait...
13 sequences were cached.
650013


In [25]:
# Deletes every file in "cache" and the directory itself, so the next run has to download the sequences again.
SequenceUtils.clear_cache()