In [2]:
# Constants: paths of the CLUS files that are parsed below into the CTCF and
# CAGE cluster collections.
CTCF_CLUS_FILE_PATH = "../analysis_results/ctcf.CLUS"
CAGE_CLUS_FILE_PATH = "../analysis_results/cage.CLUS"

In [3]:
from src.clus_files_io import parse_clus_file
from src.utils import sort_by_chromosome

### Parse both CLUS files (CTCF first, then CAGE) into their cluster collections
ctcf_clusters, cage_clusters = map(
    parse_clus_file,
    (CTCF_CLUS_FILE_PATH, CAGE_CLUS_FILE_PATH),
)

In [22]:
# plot the data using plotly
import plotly.graph_objects as go

In [44]:
### Grouped bar chart: number of clusters found on each chromosome
# Count the clusters of every chromosome and order the counts by chromosome.
ctcf_counts = sort_by_chromosome(
    {chrom: len(clusters) for chrom, clusters in ctcf_clusters.items()}
)
cage_counts = sort_by_chromosome(
    {chrom: len(clusters) for chrom, clusters in cage_clusters.items()}
)

# One bar series per data set; CAGE is added first so it is drawn first.
fig = go.Figure(
    data=[
        go.Bar(
            x=list(cage_counts.keys()),
            y=list(cage_counts.values()),
            name="CAGE",
        ),
        go.Bar(
            x=list(ctcf_counts.keys()),
            y=list(ctcf_counts.values()),
            name="CTCF",
        ),
    ]
)
fig.update_layout(title="Number of clusters per chromosome", barmode="group")
fig.show()

# Export the chart as an SVG file
fig.write_image("../analysis_results/cluters_counts.svg")

# Cluster Morphology

In [38]:
from src.utils import RESOLUTIONS as RESOLUTIONS_VALUES

def _extract_resolution(cluster_name : str):
    """Extract the resolution from the cluster name"""
    return (cluster_name.split("_")[0])

def _count_clusters_per_resolution(clusters_by_chromosome):
    """Count how many clusters carry each resolution label.

    Parameters
    ----------
    clusters_by_chromosome : mapping
        Maps a chromosome name to an iterable of cluster names. Each cluster
        name is passed to ``_extract_resolution`` to obtain its label.

    Returns
    -------
    dict
        ``{resolution_label: number_of_clusters}``, with entries ordered by
        ``RESOLUTIONS_VALUES[resolution_label]`` (ascending).

    Notes
    -----
    Every label must be a valid key of ``RESOLUTIONS_VALUES``; otherwise the
    lookup used as sort key fails.
    """
    counts = {}
    for clusters in clusters_by_chromosome.values():
        for cluster in clusters:
            resolution = _extract_resolution(cluster)
            counts[resolution] = counts.get(resolution, 0) + 1
    # Order the labels by the value RESOLUTIONS_VALUES associates with them.
    return dict(sorted(counts.items(), key=lambda item: RESOLUTIONS_VALUES[item[0]]))


# Same counting and ordering for both data sets, through one shared helper.
CAGE_RESOLUTIONS = _count_clusters_per_resolution(cage_clusters)
CTCF_RESOLUTIONS = _count_clusters_per_resolution(ctcf_clusters)

In [40]:
# Bar chart: number of CAGE clusters at each resolution
fig = go.Figure(
    data=[
        go.Bar(
            x=list(CAGE_RESOLUTIONS.keys()),
            y=list(CAGE_RESOLUTIONS.values()),
            name="CAGE",
        )
    ]
)
fig.update_layout(title="Number of clusters per resolution", barmode="group")
fig.show()

# Export the chart as an SVG file
fig.write_image("../analysis_results/cage_cluters_resolutions.svg")

In [45]:
# Bar chart: number of CTCF clusters at each resolution
fig = go.Figure(
    data=[
        go.Bar(
            x=list(CTCF_RESOLUTIONS.keys()),
            y=list(CTCF_RESOLUTIONS.values()),
            name="CTCF",
        )
    ]
)
fig.update_layout(title="Number of clusters per resolution", barmode="group")
fig.show()

# Export the chart as an SVG file
fig.write_image("../analysis_results/ctcf_cluters_resolutions.svg")

# Enrichment Data.

In [69]:
import math
import pandas as pd

# Tab-separated file holding the enrichment results
ENRICHMENT_DATA = "../analysis_results/enrichment_results.tsv"

# Load the enrichment table into a DataFrame
enrichment_data = pd.read_csv(
    ENRICHMENT_DATA,
    sep="\t",
)

In [84]:
# Scatter plot of -log10(q_value) for the first gene sets of the enrichment table

# -log10 of every q-value, in table order.
# NOTE(review): math.log10 raises ValueError for q-values <= 0 -- confirm the
# table never contains such values.
q_values = list(enrichment_data["q_value"].map(lambda q: -math.log10(q)).to_numpy())
# Gene-set names, in table order.
genes_set = enrichment_data["gene_set"].tolist()
# Name -> score; a repeated name keeps its first position and its last score.
enrich_dict = dict(zip(genes_set, q_values))

# Keep only the first 10 entries
enrich_dict = {
    name: score
    for position, (name, score) in enumerate(enrich_dict.items())
    if position < 10
}

# Draw one marker per retained gene set
fig = go.Figure(
    data=[
        go.Scatter(
            x=list(enrich_dict.keys()),
            y=list(enrich_dict.values()),
            mode="markers",
        )
    ]
)
fig.update_layout(title="Enrichment data")
fig.show()

# Export the chart as an SVG file
fig.write_image("../analysis_results/enrichment_data.svg")

# CAGE enrichment

In [12]:
import json

# Directory that holds one "<chromosome>_spec_res.json" file per chromosome
CLUSTERS_PATH = "../data/clusters/HMEC/"
# chr1 ... chr22
CHROMOSOMES = [f"chr{number}" for number in range(1, 23)]

# Per chromosome: number of entries stored under "cl_member" in its JSON file
CLUSTER_COUNTS = {}

for chromo in CHROMOSOMES:
    with open(f"{CLUSTERS_PATH}{chromo}_spec_res.json", "r") as f:
        data = json.load(f)
    # The file is fully parsed at this point, so it can be closed before counting.
    CLUSTER_COUNTS[chromo] = len(data["cl_member"])

In [13]:
CLUSTER_COUNTS

{'chr1': 45046,
 'chr2': 46806,
 'chr3': 39706,
 'chr4': 35670,
 'chr5': 34462,
 'chr6': 33820,
 'chr7': 28442,
 'chr8': 27214,
 'chr9': 21900,
 'chr10': 24744,
 'chr11': 25732,
 'chr12': 26878,
 'chr13': 18182,
 'chr14': 16916,
 'chr15': 14868,
 'chr16': 13554,
 'chr17': 15226,
 'chr18': 14524,
 'chr19': 10972,
 'chr20': 12736,
 'chr21': 5688,
 'chr22': 6038}

In [15]:
# Number of CAGE clusters per chromosome, in the iteration order of cage_clusters
cage_enriched_counts = dict(
    zip(cage_clusters.keys(), map(len, cage_clusters.values()))
)
cage_enriched_counts

{'chr10': 398,
 'chr11': 490,
 'chr12': 526,
 'chr13': 223,
 'chr14': 347,
 'chr15': 306,
 'chr16': 210,
 'chr17': 344,
 'chr18': 178,
 'chr19': 420,
 'chr1': 779,
 'chr20': 209,
 'chr21': 121,
 'chr22': 123,
 'chr2': 661,
 'chr3': 521,
 'chr4': 365,
 'chr5': 478,
 'chr6': 565,
 'chr7': 444,
 'chr8': 320,
 'chr9': 331}

In [29]:
# For every chromosome: number of CAGE clusters (cage_enriched_counts) divided
# by the number of "cl_member" entries counted for it (CLUSTER_COUNTS).
CAGE_RATIO = {chrom : cage_enriched_counts[chrom] / CLUSTER_COUNTS[chrom] for chrom in CHROMOSOMES}

# Bar chart of the ratio, one bar per chromosome
fig = go.Figure()
fig.add_trace(go.Bar(x=list(CAGE_RATIO.keys()), y=list(CAGE_RATIO.values()), name="CAGE"))
fig.update_layout(barmode='group', title="Ratio of clusters per chromosome")

# Horizontal reference line at the unweighted mean of the per-chromosome ratios.
avg = sum(CAGE_RATIO.values()) / len(CAGE_RATIO)
fig.add_shape(
    type="line",
    # x is expressed in "paper" coordinates (0 = left edge, 1 = right edge of
    # the plotting area) so the line spans the whole chart. Anchoring it to the
    # first and last category made it stop at the centres of the outer bars.
    xref="paper",
    x0=0,
    x1=1,
    # y stays in data coordinates so the line sits at the average value.
    yref="y",
    y0=avg,
    y1=avg,
    line=dict(
        color="red",
        width=2,
        dash="dash"
    )
)
fig.show()

# Save the figure as an SVG file
fig.write_image("../analysis_results/cage_ratio.svg")

In [34]:
# Placeholder values: the pie chart below is built from cage_counts, not from DATA.
DATA = {"Hey": 200, "A": 1000}

# Pie chart of the number of CAGE clusters per chromosome
fig = go.Figure(
    data=[
        go.Pie(
            labels=list(cage_counts.keys()),
            values=list(cage_counts.values()),
        )
    ]
)
fig.show()

# Export the chart as an SVG file
fig.write_image("../analysis_results/cage_pie.svg")

In [42]:
import pandas as pd

# Tab-separated, header-less file with two columns
GENOME_FILE = "../data/hg19.genome"

# Read the table and name its two columns
_genome_file = pd.read_csv(GENOME_FILE, sep="\t", header=None)
_genome_file.columns = ["chrom", "size"]

# Lookup table: value of the "chrom" column -> value of the "size" column
_genome_dict = _genome_file.set_index("chrom")["size"].to_dict()