# Spatial Correlation Plotter

#### Note: Use epi-paint kernel. 

What does this do? 
- Builds a scatter plot of the data taking the DoC value of each localization into consideration. 

In [None]:
# Import Dependencies

import os as _os
import os.path as _ospath
import numpy as _np
import pandas as _pd
import h5py as _h5py
import yaml as _yaml
from PyQt5.QtWidgets import QMessageBox as _QMessageBox
import matplotlib.pyplot as _plt
import seaborn as _sns
import itertools
from tqdm import tqdm
from matplotlib.colors import LogNorm
from matplotlib.colors import LinearSegmentedColormap, to_hex

In [None]:
# Define the folder with the data and the correlation files, plus the radius
# sweep (min / step / max) that names the correlation sub-folders.
folder = '' # Folder name for specific cell.
min_radius = 100
step_size = 100
max_radius = 1000

# "<min>_<step>_<max>" tag shared by the correlation input folder and the plot
# output folder, built once so the two names cannot drift apart.
radius_tag = f'{min_radius}_{step_size}_{max_radius}'

# Define the data folder and the correlation data folder
data_folder = _ospath.join(folder, 'Masked')
correlation_data_folder = _ospath.join(folder, 'Analysis', 'Correlations', radius_tag)

# Define the data files and the correlation files.
# os.listdir returns entries in arbitrary order; sorting keeps the order in
# which the later cells process the files reproducible between runs.
data_file_extn = '.hdf5'
correlation_file_extn = '.csv'
data_files = sorted(f for f in _os.listdir(data_folder) if f.endswith(data_file_extn))
correlation_files = sorted(f for f in _os.listdir(correlation_data_folder) if f.endswith(correlation_file_extn))
# Scale factor applied to locs.x / locs.y in the cells below.
# NOTE(review): presumably the camera pixel size in nm - confirm.
pixel_size = 130

# Define and make the output folder.
output_folder = _ospath.join(folder, 'Analysis', 'Correlations', 'Plots_' + radius_tag)
# exist_ok avoids the check-then-create race of a separate exists() test.
_os.makedirs(output_folder, exist_ok=True)

In [None]:
# Saturated end colour of the colormap of each protein. Both colormap tables
# below are generated from this single table, so they always share the same
# keys and end colours and differ only in the start colour.
_PROTEIN_END_COLORS = {
    'S2P': '#FF0000',
    'S5P': '#FFAA00',
    'SC35': '#AAFF00',
    'H3K4me3': '#00FF00',
    'H3K27ac': '#00FFAA',
    'CTCF': '#00AAFF',
    'H3K27me3': '#0000FF',
    'H3K9me3': '#AA00FF',
    'Lamin': '#FF00AA',
}

# Colormaps running from white to the protein colour.
cmap_proteins_white = {
    protein: LinearSegmentedColormap.from_list(protein, ['#FFFFFF', end_color])
    for protein, end_color in _PROTEIN_END_COLORS.items()
}

# Colormaps running from black to the protein colour.
cmap_proteins_black = {
    protein: LinearSegmentedColormap.from_list(protein, ['#000000', end_color])
    for protein, end_color in _PROTEIN_END_COLORS.items()
}

In [None]:
# Functions used here

def load_locs(path, qt_parent=None):
    """Read the ``locs`` dataset of an HDF5 file together with its metadata.

    Args:
        path: Path of the HDF5 file; it must contain a ``locs`` dataset.
        qt_parent: Passed through unchanged to ``load_info``.

    Returns:
        A ``(locs, info)`` tuple: ``locs`` is the dataset as a numpy record
        array, ``info`` is whatever ``load_info(path)`` returns.
    """
    with _h5py.File(path, "r") as hdf_file:
        raw_locs = hdf_file["locs"][...]
    # Re-wrap as a record array so the fields are reachable as attributes.
    locs = _np.rec.array(raw_locs, dtype=raw_locs.dtype)
    return locs, load_info(path, qt_parent=qt_parent)

class NoMetadataFileError(FileNotFoundError):
    """Raised by ``load_info`` when the ``.yaml`` metadata file is missing."""


def load_info(path, qt_parent=None):
    """Load the YAML metadata stored next to a data file.

    The metadata file name is ``path`` with its extension replaced by
    ``.yaml``.

    Args:
        path: Path of the data file (any extension).
        qt_parent: Optional Qt parent widget. When given, a missing metadata
            file is additionally reported through a critical message box.

    Returns:
        A list with one entry per YAML document found in the metadata file.

    Raises:
        NoMetadataFileError: If the ``.yaml`` file does not exist. The
            original ``FileNotFoundError`` is attached as ``__cause__``.
    """
    path_base, _ = _ospath.splitext(path)
    filename = path_base + ".yaml"
    try:
        with open(filename, "r") as info_file:
            # NOTE(review): UnsafeLoader can construct arbitrary Python
            # objects while parsing. Only open metadata files from trusted
            # sources, or confirm that a safe loader can read these files.
            info = list(_yaml.load_all(info_file, Loader=_yaml.UnsafeLoader))
    except FileNotFoundError as e:
        print("\nAn error occured. Could not find metadata file:\n{}".format(filename))
        if qt_parent is not None:
            _QMessageBox.critical(
                qt_parent,
                "An error occured",
                "Could not find metadata file:\n{}".format(filename),
            )
        # Chain explicitly so the traceback marks the missing file as the
        # direct cause instead of "during handling ... another exception".
        raise NoMetadataFileError(e) from e
    return info

def spatial_data_mask(data, center, window):
    """Return the rows of ``data`` that lie strictly inside a rectangle.

    Args:
        data: 2-D array; column 0 is compared against the x bounds and
            column 1 against the y bounds.
        center: ``(x, y)`` centre of the rectangle.
        window: ``(width, height)`` of the rectangle.

    Returns:
        The subset of rows of ``data`` inside the rectangle. Points lying
        exactly on an edge are excluded.
    """
    half_width, half_height = window[0] / 2, window[1] / 2
    xs, ys = data[:, 0], data[:, 1]
    inside_x = (xs > center[0] - half_width) & (xs < center[0] + half_width)
    inside_y = (ys > center[1] - half_height) & (ys < center[1] + half_height)
    return data[inside_x & inside_y]

def plot_kde_DoC_in_window(folder, data_2, corr_2, center, window, protein_1, protein_2, threshold):
    """Plot the KDE of the ``corr_2`` values of the points inside a window.

    The part of the curve to the right of ``threshold`` is shaded, the figure
    is saved as ``<protein_2>_from_<protein_1>_fraction_kde.svg`` in
    ``folder`` and shown, and the percentage of in-window values above
    ``threshold`` is printed.

    Args:
        folder: Existing directory the SVG is written to.
        data_2: 2-D coordinate array; column 0 is x and column 1 is y.
        corr_2: One value per row of ``data_2``.
        center: ``(x, y)`` centre of the window.
        window: ``(width, height)`` of the window.
        protein_1: Name used in the output file name.
        protein_2: Name used in the output file name and as the key into
            ``cmap_proteins_white`` for the curve colour.
        threshold: Values strictly above it are shaded and counted.

    Returns:
        None. When no point lies inside the window a message is printed and
        nothing is plotted or saved.
    """
    half_width, half_height = window[0] / 2, window[1] / 2
    in_window = (
        (data_2[:, 0] > center[0] - half_width)
        & (data_2[:, 0] < center[0] + half_width)
        & (data_2[:, 1] > center[1] - half_height)
        & (data_2[:, 1] < center[1] + half_height)
    )
    corr_in_window = corr_2[in_window]

    # With an empty window there is no curve to read back and the percentage
    # below would divide by zero, so stop before creating any figure.
    if len(corr_in_window) == 0:
        print(f'No {protein_2} localizations inside the window; skipping the KDE plot.')
        return

    curve_color = to_hex(cmap_proteins_white[protein_2](1.0))
    fig, ax = _plt.subplots(figsize=(5, 1.5))
    _sns.kdeplot(corr_in_window, fill=False, ax=ax, color=curve_color, linewidth=2)
    # NOTE(review): the KDE call may draw no line when it cannot estimate a
    # density (presumably e.g. identical values) - only shade when a curve exists.
    if ax.lines:
        x, y = ax.lines[0].get_data()
        above = x > threshold
        ax.fill_between(x[above], y[above], color=curve_color, alpha=0.5)
    ax.set_yticks([0.0, 0.5, 1.0])
    ax.set_ylim(0, 1.5)
    ax.set_xlim(-1.5, 1.5)
    fig.savefig(_ospath.join(folder, protein_2 + '_from_' + protein_1 + '_fraction_kde.svg'), format='svg', bbox_inches='tight')
    _plt.show()
    # NOTE(review): non-finite values, if any, stay in the denominator here,
    # while plot_spatial_corr_map filters them with isfinite - confirm intent.
    percent_above = len(corr_in_window[corr_in_window > threshold]) / len(corr_in_window) * 100
    print(f'Percentage of points above threshold = {percent_above:.2f}%')

def plot_spatial_corr_map(data_1, data_2, corr_1, corr_2, protein_1, protein_2, threshold, folder, color, zoom, center, window, reference_alpha):
    """Scatter ``data_2`` coloured by ``corr_2`` on top of the reference ``data_1``.

    Only the ``data_2`` points whose ``corr_2`` value is finite and strictly
    above ``threshold`` are drawn; every ``data_1`` point is drawn underneath
    in one flat colour. The figure is saved as
    ``<folder>/<Zoomed|Full_FOV>/<protein_2>_from_<protein_1>.png`` and shown.
    When ``zoom`` is truthy, ``plot_kde_DoC_in_window`` is called first for
    the same window and the axes are limited to that window.

    Args:
        data_1: Reference coordinates; column 0 is x and column 1 is y.
        data_2: Target coordinates, same layout.
        corr_1: Not used for drawing; kept so existing callers keep working.
        corr_2: One value per row of ``data_2``.
        protein_1: Reference name, used in the output file name.
        protein_2: Target name; key into the colormap tables and part of the
            output file name.
        threshold: Lower bound (exclusive) on ``corr_2`` for a point to be drawn.
        folder: Base output directory; the sub-folder is created if needed.
        color: Axes background, ``'black'`` or ``'white'``.
        zoom: Truthy to crop to the window, falsy for the full field of view.
        center: ``(x, y)`` centre of the zoom window.
        window: ``(width, height)`` of the zoom window.
        reference_alpha: Opacity of the reference points.

    Raises:
        ValueError: If ``color`` is neither ``'black'`` nor ``'white'``.
            Raised before any folder is created or any figure is drawn.
    """
    # Validate first so that an invalid choice leaves no folders or KDE
    # figures behind.
    if color not in ('black', 'white'):
        raise ValueError("Invalid color choice. Choose either 'black' or 'white'.")

    if color == 'black':
        cmap = cmap_proteins_black[protein_2]
        reference_color = 'white'
        alpha_constant = 0.5 if zoom else 0.1  # Zoomed FoV / Full FoV
    else:
        cmap = cmap_proteins_white[protein_2]
        reference_color = 'black'
        alpha_constant = 0.7 if zoom else 0.1  # Zoomed FoV (Mock = 1; ActD = 0.7) / Full FoV

    folder = _ospath.join(folder, 'Zoomed' if zoom else 'Full_FOV')
    _os.makedirs(folder, exist_ok=True)
    if zoom:
        xlim = (center[0] - (window[0]/2), center[0] + (window[0]/2))
        ylim = (center[1] - (window[1]/2), center[1] + (window[1]/2))
        plot_kde_DoC_in_window(folder, data_2, corr_2, center, window, protein_1, protein_2, threshold)

    # Keep only the target points with a finite value above the threshold.
    selected = _np.isfinite(corr_2) & (corr_2 > threshold)
    data_2_masked = data_2[selected]
    corr_2_masked = corr_2[selected]

    # Scale the target opacity by the relative number of localizations in the
    # two channels. An empty target set has nothing to draw, so any alpha
    # works there and the division by zero is skipped.
    size = 20
    data_2_size = len(data_2)
    smaller_size = min(len(data_1), data_2_size)
    alpha_2 = alpha_constant * (smaller_size/data_2_size) if data_2_size else alpha_constant

    fig, ax = _plt.subplots(figsize=(5, 5))
    ax.set_facecolor(color)
    ax.scatter(data_1[:, 0], data_1[:, 1], alpha=reference_alpha, c=reference_color, s=size, marker='.', edgecolors='none')
    # Optional density overlays of the reference channel (disabled). The
    # zoomed variants need
    # data_window_masked = spatial_data_mask(data_1, center, window) first.
    # _sns.kdeplot(x = data_window_masked[:, 0], y = data_window_masked[:, 1], fill = True, thresh = 0.65, cmap = 'Grays', alpha = 0.3)
    # _sns.histplot(x = data_window_masked[:, 0], y = data_window_masked[:, 1], bins = 25, pthresh = 0.6, cmap = 'Grays', alpha = 0.8)
    # _sns.histplot(x = data_1[:, 0], y = data_1[:, 1], bins = 250, pthresh = .7, cmap = 'mako_r')
    # _sns.kdeplot(x = data_1[:, 0], y = data_1[:, 1], fill = True, thresh = 0.7, cmap = 'mako', alpha = 0.5)
    ax.scatter(data_2_masked[:, 0], data_2_masked[:, 1], alpha=alpha_2, c=corr_2_masked, s=size * 2, cmap=cmap, marker='.', edgecolors='none')

    ax.axis('equal')
    if zoom:
        ax.set_xlim(xlim)
        ax.set_ylim(ylim)
        ax.set_aspect('equal', adjustable='box')
    ax.invert_yaxis()
    # axis('off') doesn't work with black background. Hence, ticks and spines
    # are hidden explicitly.
    ax.set_xticks([])
    ax.set_yticks([])
    ax.tick_params(bottom=False, left=False, labelbottom=False, labelleft=False)
    for spine in ax.spines.values():
        spine.set_visible(False)
    fig.savefig(_ospath.join(folder, protein_2 + '_from_' + protein_1 + '.png'), format='png', dpi=400, bbox_inches='tight')
    _plt.show()

def plot_spatial_corr_map_three_targets(data_1, data_2, data_3, corr_1, corr_2, corr_3, protein_1, protein_2, protein_3, folder, color, zoom, center, window, reference_alpha):
    """Scatter two target channels, coloured by their values, over a reference.

    Only the ``data_2`` / ``data_3`` points whose ``corr_2`` / ``corr_3`` value
    is finite and strictly positive are drawn; every ``data_1`` point is drawn
    underneath in one flat colour. The figure is saved as
    ``<folder>/<Zoomed|Full_FOV>/<protein_2>_and_<protein_3>_from_<protein_1>.png``
    and shown.

    Args:
        data_1: Reference coordinates; column 0 is x and column 1 is y.
        data_2: First target coordinates, same layout.
        data_3: Second target coordinates, same layout.
        corr_1: Not used for drawing; kept so existing callers keep working.
        corr_2: One value per row of ``data_2``.
        corr_3: One value per row of ``data_3``.
        protein_1: Reference name, used in the output file name.
        protein_2: First target name; colormap key and part of the file name.
        protein_3: Second target name; colormap key and part of the file name.
        folder: Base output directory; the sub-folder is created if needed.
        color: Axes background, ``'black'`` or ``'white'``.
        zoom: Truthy to crop to the window, falsy for the full field of view.
        center: ``(x, y)`` centre of the zoom window.
        window: ``(width, height)`` of the zoom window.
        reference_alpha: Opacity of the reference points.

    Raises:
        ValueError: If ``color`` is neither ``'black'`` nor ``'white'``.
            Raised before any folder is created or any figure is drawn.
    """
    # Validate first so that an invalid choice leaves no folders behind.
    if color not in ('black', 'white'):
        raise ValueError("Invalid color choice. Choose either 'black' or 'white'.")

    if color == 'black':
        cmap_2 = cmap_proteins_black[protein_2]
        cmap_3 = cmap_proteins_black[protein_3]
        reference_color = 'white'
        alpha_constant = 0.5 if zoom else 0.1  # Zoomed FoV / Full FoV
    else:
        cmap_2 = cmap_proteins_white[protein_2]
        cmap_3 = cmap_proteins_white[protein_3]
        reference_color = 'black'
        alpha_constant = 1 if zoom else 0.2  # Zoomed FoV / Full FoV

    folder = _ospath.join(folder, 'Zoomed' if zoom else 'Full_FOV')
    _os.makedirs(folder, exist_ok=True)
    if zoom:
        xlim = (center[0] - (window[0]/2), center[0] + (window[0]/2))
        ylim = (center[1] - (window[1]/2), center[1] + (window[1]/2))
        size = 3
    else:
        size = 20

    # Keep only the target points with a finite, positive value.
    selected_2 = _np.isfinite(corr_2) & (corr_2 > 0)
    selected_3 = _np.isfinite(corr_3) & (corr_3 > 0)
    data_2_masked = data_2[selected_2]
    data_3_masked = data_3[selected_3]
    corr_2_masked = corr_2[selected_2]
    corr_3_masked = corr_3[selected_3]

    # Scale each target opacity by the relative number of localizations in
    # the two target channels. An empty channel has nothing to draw, so any
    # alpha works there and the division by zero is skipped.
    data_2_size = len(data_2)
    data_3_size = len(data_3)
    smaller_size = min(data_2_size, data_3_size)
    alpha_2 = alpha_constant * (smaller_size/data_2_size) if data_2_size else alpha_constant
    alpha_3 = alpha_constant * (smaller_size/data_3_size) if data_3_size else alpha_constant

    fig, ax = _plt.subplots()
    ax.set_facecolor(color)
    ax.scatter(data_1[:, 0], data_1[:, 1], alpha=reference_alpha, c=reference_color, s=size, marker='.', edgecolors='none')
    ax.scatter(data_2_masked[:, 0], data_2_masked[:, 1], alpha=alpha_2, c=corr_2_masked, s=size, cmap=cmap_2, marker='.', edgecolors='none')
    ax.scatter(data_3_masked[:, 0], data_3_masked[:, 1], alpha=alpha_3, c=corr_3_masked, s=size, cmap=cmap_3, marker='.', edgecolors='none')
    ax.axis('equal')
    if zoom:
        ax.set_xlim(xlim)
        ax.set_ylim(ylim)
    ax.invert_yaxis()
    # Ticks and spines are intentionally left visible in this plot.
    fig.savefig(_ospath.join(folder, protein_2 + '_and_' + protein_3 + '_from_' + protein_1 + '.png'), dpi=400, bbox_inches='tight')
    _plt.show()

def plot_original_data(data_1, data_2, protein_1, protein_2, folder, color, zoom, center, window, reference_alpha):
    """Scatter every point of ``data_2`` on top of the reference ``data_1``.

    No filtering is applied: all ``data_2`` points are drawn in the end colour
    of the ``protein_2`` colormap, all ``data_1`` points underneath in one
    flat colour. The figure is saved as
    ``<folder>/<Zoomed|Full_FOV>/<protein_2>_and_<protein_1>_all_locs.png``
    and shown.

    Args:
        data_1: Reference coordinates; column 0 is x and column 1 is y.
        data_2: Target coordinates, same layout.
        protein_1: Reference name, used in the output file name.
        protein_2: Target name; colormap key and part of the file name.
        folder: Base output directory; the sub-folder is created if needed.
        color: Axes background, ``'black'`` or ``'white'``.
        zoom: Truthy to crop to the window, falsy for the full field of view.
        center: ``(x, y)`` centre of the zoom window.
        window: ``(width, height)`` of the zoom window.
        reference_alpha: Opacity of the reference points.

    Raises:
        ValueError: If ``color`` is neither ``'black'`` nor ``'white'``.
            Raised before any folder is created or any figure is drawn.
    """
    # Validate first so that an invalid choice leaves no folders behind.
    if color not in ('black', 'white'):
        raise ValueError("Invalid color choice. Choose either 'black' or 'white'.")

    if color == 'black':
        cmap = cmap_proteins_black[protein_2]
        reference_color = 'white'
        alpha_constant = 0.5 if zoom else 0.1  # Zoomed FoV / Full FoV
    else:
        cmap = cmap_proteins_white[protein_2]
        reference_color = 'black'
        alpha_constant = 0.7 if zoom else 0.1  # Zoomed FoV (Mock = 1; ActD = 0.7) / Full FoV

    folder = _ospath.join(folder, 'Zoomed' if zoom else 'Full_FOV')
    _os.makedirs(folder, exist_ok=True)
    if zoom:
        xlim = (center[0] - (window[0]/2), center[0] + (window[0]/2))
        ylim = (center[1] - (window[1]/2), center[1] + (window[1]/2))

    # Scale the target opacity by the relative number of localizations in the
    # two channels. An empty target set has nothing to draw, so any alpha
    # works there and the division by zero is skipped.
    size = 20
    data_2_size = len(data_2)
    smaller_size = min(len(data_1), data_2_size)
    alpha_2 = alpha_constant * (smaller_size/data_2_size) if data_2_size else alpha_constant

    fig, ax = _plt.subplots(figsize=(5, 5))
    ax.set_facecolor(color)
    ax.scatter(data_1[:, 0], data_1[:, 1], alpha=reference_alpha, c=reference_color, s=size, marker='.', edgecolors='none')
    ax.scatter(data_2[:, 0], data_2[:, 1], alpha=alpha_2, s=size * 2, c=to_hex(cmap(1.0)), marker='.', edgecolors='none')

    ax.axis('equal')
    if zoom:
        ax.set_xlim(xlim)
        ax.set_ylim(ylim)
        ax.set_aspect('equal', adjustable='box')
    ax.invert_yaxis()
    # axis('off') doesn't work with black background. Hence, ticks and spines
    # are hidden explicitly.
    ax.set_xticks([])
    ax.set_yticks([])
    ax.tick_params(bottom=False, left=False, labelbottom=False, labelleft=False)
    for spine in ax.spines.values():
        spine.set_visible(False)
    fig.savefig(_ospath.join(folder, protein_2 + '_and_' + protein_1 + '_all_locs.png'), format='png', dpi=400, bbox_inches='tight')
    _plt.show()

#### Regions used in Figure 3. 

Mock:
- For cell 3 'SC35' center = (43750, 82500), window = (5000, 5000)
- For cell 3 'Lamin' center = (60000, 79500), window = (5000, 5000)

Treated:
- For cell 5 'SC35' center = (89000, 49000), window = (5000, 5000)
- For cell 5 'Lamin' center = (76000, 45500), window = (5000, 5000)
    

In [None]:
# Plot every non-reference channel against the reference channel: first all
# localizations, then the map of localizations above the threshold.
reference = 'SC35'
background = 'white' # Select 'black' or 'white'
zoom = 0
center = (89000, 49000)
window = (5000, 5000)
threshold = 0.7
reference_alpha = 0.2 # Mock: 'SC35' Full FoV = 0.01; 'SC35' Zoom = 0.05; 'Lamin' Full FoV = ; 'Lamin' Zoom = 0.2 # ActD: 'SC35' Zoom = 0.2

for file_1 in data_files:
    if reference not in file_1:
        continue
    protein_1 = file_1.split('_')[0]
    # The reference channel is the same for every partner file, so it is
    # loaded and scaled once here instead of once per partner.
    locs_1, info_1 = load_locs(_ospath.join(data_folder, file_1))
    data_1 = _np.column_stack((locs_1.x, locs_1.y)) * pixel_size
    for file_2 in data_files:
        if reference in file_2:
            continue
        protein_2 = file_2.split('_')[0]
        print(f'Plotting {protein_1} and {protein_2} sets')
        locs_2, info_2 = load_locs(_ospath.join(data_folder, file_2))
        data_2 = _np.column_stack((locs_2.x, locs_2.y)) * pixel_size
        # Correlation files are named '<A>_vs_<B>.csv'.
        corr_1 = _np.loadtxt(_ospath.join(correlation_data_folder, protein_1 + '_vs_' + protein_2 + correlation_file_extn), delimiter = ',')
        corr_2 = _np.loadtxt(_ospath.join(correlation_data_folder, protein_2 + '_vs_' + protein_1 + correlation_file_extn), delimiter = ',')
        plot_original_data(data_1, data_2, protein_1, protein_2, folder = output_folder, color = background, zoom = zoom, center = center, window = window, reference_alpha = reference_alpha)
        plot_spatial_corr_map(data_1, data_2, corr_1, corr_2, protein_1, protein_2, threshold = threshold, folder = output_folder, color = background, zoom = zoom, center = center, window = window, reference_alpha = reference_alpha)

# for file in data_files:
#     if reference in file:
#         view_point = file.split('_')[0]
#         locs, info = load_locs(_ospath.join(data_folder, file))

In [None]:
# Quick look at a single localization file, optionally cropped to a window.
center = (89000, 49000)
window = (5000, 5000)
zoom = 0
alpha = 1 if zoom else 0.15

# Window bounds; printed at the end and applied to the axes only when zooming.
half_width, half_height = window[0] / 2, window[1] / 2
xlim = (center[0] - half_width, center[0] + half_width)
ylim = (center[1] - half_height, center[1] + half_height)

file_locs = '/Users/abhinav/Library/CloudStorage/OneDrive-IndianInstituteofScience/AnalysisFolder/Epi/Kyoto_Multiplex/ActD/10kFrames/ActD_11-03-25_Kyoto/Cleaned/Final/Cell5/Masked/SC35_Aligned_cleaned_cell5_Masked.hdf5'
locs, info = load_locs(file_locs)
data = _np.column_stack((locs.x, locs.y)) * pixel_size

_plt.scatter(data[:, 0], data[:, 1], alpha=alpha, s=0.1, c='black')
_plt.axis('equal')
if zoom:
    _plt.xlim(xlim)
    _plt.ylim(ylim)
    _plt.axis('off')
_plt.gca().invert_yaxis()
_plt.show()
print(xlim, ylim, sep='\n')

In [None]:
# Plot three targets together: two target channels over one reference channel.

reference = 'SC35'
target_1 = 'S2P'
target_2 = 'S5P'
background = 'white' # Select 'black or 'white'
reference_alpha = 0.1
zoom = 0
center = (89000, 49000)
window = (5000, 5000)

for file_1 in data_files:
    # Outer loop: files of the reference channel.
    if reference not in file_1:
        continue
    locs_1, info = load_locs(_ospath.join(data_folder, file_1))
    protein_1 = file_1.split('_')[0]
    for file_2 in data_files:
        # Middle loop: files of the first target.
        if target_1 not in file_2:
            continue
        locs_2, info = load_locs(_ospath.join(data_folder, file_2))
        protein_2 = file_2.split('_')[0]
        for file_3 in data_files:
            # Inner loop: files of the second target.
            if target_2 not in file_3:
                continue
            locs_3, info = load_locs(_ospath.join(data_folder, file_3))
            protein_3 = file_3.split('_')[0]
            print(f'Plotting {target_1} and {target_2} sets with {reference} as the viewpoint')
            # Coordinates of each channel scaled by pixel_size.
            data_1 = _np.column_stack((locs_1.x, locs_1.y)) * pixel_size
            data_2 = _np.column_stack((locs_2.x, locs_2.y)) * pixel_size
            data_3 = _np.column_stack((locs_3.x, locs_3.y)) * pixel_size
            # Correlation files are named '<A>_vs_<B>.csv'.
            corr_1_path = _ospath.join(correlation_data_folder, protein_1 + '_vs_' + protein_2 + correlation_file_extn)
            corr_2_path = _ospath.join(correlation_data_folder, protein_2 + '_vs_' + protein_1 + correlation_file_extn)
            corr_3_path = _ospath.join(correlation_data_folder, protein_3 + '_vs_' + protein_1 + correlation_file_extn)
            corr_1 = _np.loadtxt(corr_1_path, delimiter = ',')
            corr_2 = _np.loadtxt(corr_2_path, delimiter = ',')
            corr_3 = _np.loadtxt(corr_3_path, delimiter = ',')
            plot_spatial_corr_map_three_targets(
                data_1, data_2, data_3,
                corr_1, corr_2, corr_3,
                protein_1, protein_2, protein_3,
                folder = output_folder,
                color = background,
                zoom = zoom,
                center = center,
                window = window,
                reference_alpha = reference_alpha,
            )

