In [1]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
import seaborn as sns

In [2]:
preds = np.load("/scratch1/smaruj/test_mouse_fold0_AkitaV2/combined_pred_matrices.npy", allow_pickle=True)

In [3]:
targets = np.load("/scratch1/smaruj/test_mouse_fold0_AkitaV2/combined_target_matrices.npy", allow_pickle=True)

In [None]:
def plot_hexbin_accumulated(preds, targets, cell_type_idx, cell_type_name, save_path=None):
    # Concatenate all predictions and targets across the 725 test windows
    pred_vec_all = []
    target_vec_all = []
    
    for i in range(preds.shape[0]):
        # Flatten the 512x512 maps and concatenate
        pred_vec_all.append(preds[i, :, :, cell_type_idx].flatten())
        target_vec_all.append(targets[i, :, :, cell_type_idx].flatten())
    
    # Convert the list of arrays into a single array
    pred_vec_all = np.concatenate(pred_vec_all)
    target_vec_all = np.concatenate(target_vec_all)

    # Filter out NaN values
    mask = np.isfinite(pred_vec_all) & np.isfinite(target_vec_all)
    pred_vec_filtered = pred_vec_all[mask]
    target_vec_filtered = target_vec_all[mask]

    # Calculate Pearson correlation only on finite values
    corr, _ = pearsonr(pred_vec_filtered, target_vec_filtered) if len(pred_vec_filtered) > 0 else (np.nan, np.nan)

    # Create hexbin plot
    plt.figure(figsize=(7, 6))
    hb = plt.hexbin(pred_vec_filtered, target_vec_filtered, gridsize=50, cmap='viridis', bins='log')
    plt.colorbar(label='log10(frequency)')
    
    plt.xlabel(f'pred-{cell_type_name}')
    plt.ylabel(f'expt-{cell_type_name}')
    plt.title(f'corr: {corr:.2f}')
    plt.grid(True)
    
    if save_path:
        plt.savefig(save_path, bbox_inches='tight', format="pdf")
    
    plt.show()

In [4]:
cell_types = ['mESC (Hsieh2019)', 'mESC (Bonev2017)', 'cortical neuron', 'neocortex cortical neuron', 'neural progenitor cell', 'neocortex neural progenitor cell']

In [None]:
for idx, cell_type in enumerate(cell_types):
    print(f"Plotting for {cell_type}...")
    plot_hexbin_accumulated(preds, targets, cell_type_idx=idx, cell_type_name=cell_type) #, save_path=f"./plots_mouse/{cell_type}_corr.pdf")

In [None]:
def calculate_cell_type_correlation(matrix):
    num_cell_types = matrix.shape[-1]
    correlation_matrix = np.zeros((num_cell_types, num_cell_types))
    
    # Iterate over all pairs of cell types
    for i in range(num_cell_types):
        for j in range(i, num_cell_types):
            # Flatten the 512x512 maps for both cell types
            cell_type_i = matrix[:, :, :, i].flatten()
            cell_type_j = matrix[:, :, :, j].flatten()
            
            # Create a mask for valid (non-NaN) entries
            mask = ~np.isnan(cell_type_i) & ~np.isnan(cell_type_j)
            
            if np.any(mask):  # Check if there are any valid pairs
                # Calculate correlation between the two cell types using valid entries
                corr, _ = pearsonr(cell_type_i[mask], cell_type_j[mask])
            else:
                corr = np.nan  # Set correlation to NaN if no valid pairs
            
            correlation_matrix[i, j] = corr
            correlation_matrix[j, i] = corr  # Symmetric matrix
    
    return correlation_matrix

In [None]:
# Function to calculate average correlations between cell types
def average_cell_type_correlations(matrix):
    num_cell_types = matrix.shape[-1]
    
    # Initialize matrix to accumulate correlations across all windows
    corr_sum = np.zeros((num_cell_types, num_cell_types))
    
    # Iterate over all windows
    for i in range(matrix.shape[0]):
        # Calculate correlation matrix for current window
        corr = calculate_cell_type_correlation(matrix[i:i+1])
        
        # Accumulate the correlations
        corr_sum += corr
    
    # Average correlations over all windows
    corr_avg = corr_sum / matrix.shape[0]
    
    return corr_avg

In [None]:
# Function to plot heatmap of correlations
def plot_heatmap(corr_matrix, title, cell_type_names, save_path=None):
    plt.figure(figsize=(8, 6))
    
    mask = np.tril(np.ones_like(corr_matrix, dtype=bool))
    
    # vmin = np.min(corr_matrix)
    # vmax = np.max(corr_matrix)
    
    vmin = 0.09
    vmax = 0.30
    
    # Create the heatmap
    sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='viridis', 
                vmin=vmin, vmax=vmax, mask=mask, cbar_kws={"shrink": .8},
                xticklabels=cell_type_names, yticklabels=cell_type_names)
    
    plt.title(title)
    
    if save_path:
        plt.savefig(save_path, bbox_inches='tight', format="pdf")
    
    plt.show()

In [None]:
# Calculate the average correlations for preds
pred_corr_avg = average_cell_type_correlations(preds)

In [None]:
plot_heatmap(pred_corr_avg, "Average Correlation Matrix for Predictions", cell_type_names=cell_types) #, save_path="./plots_mouse/predictions_cell_type_corr.pdf")

In [None]:
del pred_corr_avg

In [None]:
# Calculate the average correlations for preds
targ_corr_avg = average_cell_type_correlations(targets)

In [None]:
plot_heatmap(targ_corr_avg, "Average Correlation Matrix for Targets", cell_type_names=cell_types) #, save_path="./plots_mouse/targets_cell_type_corr.pdf")

In [None]:
del targ_corr_avg

In [None]:
def calculate_cell_type_differences_and_correlations(preds, targets):
    num_cell_types = preds.shape[3]
    correlations = np.zeros((num_cell_types, num_cell_types))  # Store correlations between differences

    for i in range(num_cell_types):
        for j in range(num_cell_types):
            # Calculate differences between cell types i and j for preds and targets
            preds_diff = preds[:, :, :, i] - preds[:, :, :, j]  # Difference between predictions
            targets_diff = targets[:, :, :, i] - targets[:, :, :, j]  # Difference between targets

            # Flatten the differences for correlation calculation
            preds_diff_flat = preds_diff.reshape(preds_diff.shape[0], -1)
            targets_diff_flat = targets_diff.reshape(targets_diff.shape[0], -1)

            # Compute the correlation only for finite values
            mask = np.isfinite(preds_diff_flat) & np.isfinite(targets_diff_flat)
            if np.any(mask):  # Check if there are any valid values
                corr, _ = pearsonr(preds_diff_flat[mask], targets_diff_flat[mask])
                correlations[i, j] = corr
            else:
                correlations[i, j] = np.nan  # No valid values to compute correlation

    return correlations

In [None]:
diff_corr = calculate_cell_type_differences_and_correlations(preds, targets)

In [None]:
plot_heatmap(diff_corr, "Average Cell-Type Difference Correlation Matrix", cell_type_names=cell_types) #, save_path="./plots_mouse/cell_type_differences_corr.pdf")

In [7]:
def calculate_average_differences_and_correlations(preds, targets):
    num_windows, height, width, num_cell_types = preds.shape
    ave_correlations = np.zeros(num_windows, dtype=float)
    total_pairs = num_cell_types * (num_cell_types - 1) // 2  # Number of cell type pairs (i, j)

    # Loop over each window
    for k in range(num_windows):
        if k % 100 == 0:
            print(f"Calculating window {k}")
        total_corr = 0  # Accumulate correlations for this window
        pair_count = 0  # Track number of valid pairs

        # Calculate pairwise differences between cell types for preds and targets
        for i in range(num_cell_types):
            for j in range(i + 1, num_cell_types):
                preds_diff = preds[k, :, :, i] - preds[k, :, :, j]
                targets_diff = targets[k, :, :, i] - targets[k, :, :, j]

                # Flatten and mask finite values for correlation calculation
                preds_diff_flat = preds_diff.ravel()
                targets_diff_flat = targets_diff.ravel()
                mask = np.isfinite(preds_diff_flat) & np.isfinite(targets_diff_flat)

                # Calculate correlation if there are valid values
                if np.any(mask):
                    corr, _ = pearsonr(preds_diff_flat[mask], targets_diff_flat[mask])
                    total_corr += corr
                    pair_count += 1

        # Average the correlations for this window
        ave_correlations[k] = total_corr / pair_count if pair_count > 0 else np.nan

    return ave_correlations


In [8]:
window_correlations = calculate_average_differences_and_correlations(preds, targets)

Calculating window 0
Calculating window 100
Calculating window 200
Calculating window 300
Calculating window 400
Calculating window 500
Calculating window 600
Calculating window 700


In [9]:
import pandas as pd
test_path = "/project/fudenber_735/tensorflow_models/akita/v2/data/mm10/sequences.bed"
sequences_V2 = pd.read_csv(test_path, sep='\t', names=['chr','start','stop','type'])
fold0 = sequences_V2[sequences_V2["type"] == "fold0"]

In [11]:
fold0["ave_corr"] = window_correlations

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fold0["ave_corr"] = window_correlations


In [13]:
fold0.to_csv('mouse_fold0_avecorr.tsv', sep='\t', index=False) 