# In this notebook we visualize the effects of inserting a single CTCF motif into multiple background sequences

In [1]:
import bioframe
import pandas as pd
import numpy as np
import tensorflow as tf
from basenji import dna_io
import akita_utils
import matplotlib.pyplot as plt
import glob
import seaborn as sns

# sns.set_theme()
import akita_utils.format_io

# Importing scores from simulations

In [56]:
# def load_data(h5_dirs, scd_stats, models=[1]):
#     dfs = []
#     for h5_file in glob.glob(h5_dirs):
        
#         dfs.append(
#             akita_utils.format_io.h5_to_df(h5_file, scd_stats, drop_duplicates_key=None)
#         )
#     dfs = pd.concat(dfs)

#     for stat in scd_stats:
#         for model in models:
#             dfs[f"mean_{stat}_score_m{model}"] = (
#                 dfs[f"{stat}_h1_m{model}_t0"]
#                 + dfs[f"{stat}_h1_m{model}_t1"]
#                 + dfs[f"{stat}_h1_m{model}_t2"]
#                 + dfs[f"{stat}_h1_m{model}_t3"]
#                 + dfs[f"{stat}_h1_m{model}_t4"]
#                 + dfs[f"{stat}_h1_m{model}_t5"]
#             ) / 6
        
#     return dfs.reset_index(drop=True)


# import numpy as np
def load_data(h5_dirs, scd_stats, models=(1,), num_targets=6):
    """Load per-model score tables from h5 files and add mean-score columns.

    Every path matched by the glob pattern ``h5_dirs`` whose full path contains
    the text ``model{m}`` for some ``m`` in ``models`` is read with
    ``akita_utils.format_io.h5_to_df``. Tables that belong to the same model
    are stacked row-wise; the tables of different models are then inner-joined
    on the ``background_seqs`` column.

    For every statistic ``stat`` in ``scd_stats`` the following columns are added:

    * ``mean_{stat}_score_m{m}``: average of the columns
      ``{stat}_h1_m{m}_t0`` ... ``{stat}_h1_m{m}_t{num_targets - 1}``.
    * ``mean_{stat}_score``: average of the per-model means over ``models``.

    Parameters
    ----------
    h5_dirs : str
        Glob pattern of the h5 files to read.
    scd_stats : list of str
        Statistic names, forwarded to ``h5_to_df`` and used to build column names.
    models : iterable of int, optional
        Model indices to load. Defaults to ``(1,)``.
    num_targets : int, optional
        Number of target columns (``t0`` ... ``t{num_targets - 1}``) averaged
        per model. Defaults to 6, the value that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        The combined table with the mean columns added and a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``num_targets`` is smaller than 1, or if no matched file path
        contains any of the requested ``model{m}`` tags.
    KeyError
        If a score column needed for the averages is missing from the table.

    Notes
    -----
    The model test is a plain substring test on the whole path, so ``model1``
    also matches ``model10`` and matches directory names containing that text.
    NOTE(review): when ``background_seqs`` values repeat within a model's table,
    the inner join pairs every matching row combination -- confirm this is the
    intended behaviour before loading more than one model.
    """
    if num_targets < 1:
        raise ValueError("num_targets must be at least 1")

    models = list(models)

    # Collect the tables per model first and concatenate once afterwards;
    # concatenating inside the loop re-copies all accumulated rows for every file.
    frames_by_model = {}
    # Sorted so the row order does not depend on the arbitrary order of glob.glob.
    for h5_file in sorted(glob.glob(h5_dirs)):
        for model in models:
            if f"model{model}" in h5_file:
                frames_by_model.setdefault(model, []).append(
                    akita_utils.format_io.h5_to_df(
                        h5_file, scd_stats, drop_duplicates_key=None
                    )
                )

    if not frames_by_model:
        model_tags = ", ".join(f"model{model}" for model in models)
        raise ValueError(
            f"No file matching {h5_dirs!r} contains any of the model tags: {model_tags}"
        )

    merged_df = None
    for df_num, frames in enumerate(frames_by_model.values()):
        model_df = frames[0] if len(frames) == 1 else pd.concat(frames)
        if merged_df is None:
            merged_df = model_df
        else:
            # Columns shared by both tables keep their name on the left side and
            # get the suffix "_{df_num}" on the right side.
            merged_df = pd.merge(
                merged_df,
                model_df,
                on="background_seqs",
                how="inner",
                suffixes=("", f"_{df_num}"),
            )

    for stat in scd_stats:
        for model in models:
            target_cols = [f"{stat}_h1_m{model}_t{t}" for t in range(num_targets)]
            missing_cols = [col for col in target_cols if col not in merged_df.columns]
            if missing_cols:
                raise KeyError(
                    f"Missing score columns for model {model}: {missing_cols}"
                )

            # Plain column-by-column sum, so dtype and NaN propagation stay the
            # same as with the explicit t0 + ... + t5 expression.
            total = merged_df[target_cols[0]]
            for col in target_cols[1:]:
                total = total + merged_df[col]
            merged_df[f"mean_{stat}_score_m{model}"] = total / num_targets

        # Mean over models for the current statistic
        merged_df[f"mean_{stat}_score"] = np.mean(
            [merged_df[f"mean_{stat}_score_m{model}"] for model in models], axis=0
        )

    return merged_df.reset_index(drop=True)


In [57]:
# def load_data(h5_dirs, scd_stats, models=[1], batch_size=1000):
#     batches = []
#     for h5_file in glob.glob(h5_dirs):
#         df = akita_utils.format_io.h5_to_df(h5_file, scd_stats, drop_duplicates_key=None)
#         batches.append(df)
#         if len(batches) == batch_size:
#             batch_df = pd.concat(batches, ignore_index=True)
#             batches = []
#             for stat in scd_stats:
#                 for model in models:
#                     batch_df[f"mean_{stat}_score_m{model}"] = (
#                         batch_df[f"{stat}_h1_m{model}_t0"]
#                         + batch_df[f"{stat}_h1_m{model}_t1"]
#                         + batch_df[f"{stat}_h1_m{model}_t2"]
#                         + batch_df[f"{stat}_h1_m{model}_t3"]
#                         + batch_df[f"{stat}_h1_m{model}_t4"]
#                         + batch_df[f"{stat}_h1_m{model}_t5"]
#                     ) / 6

#                 # Calculate mean score over models for current statistic
#                 mean_score = np.mean([
#                     batch_df[f"mean_{stat}_score_m{model}"] for model in models
#                 ], axis=0)

#                 # Add mean score column to DataFrame
#                 batch_df[f"mean_{stat}_score"] = mean_score
#             batches.append(batch_df)

#     if len(batches) > 0:
#         batch_df = pd.concat(batches, ignore_index=True)
#         for stat in scd_stats:
#             for model in models:
#                 batch_df[f"mean_{stat}_score_m{model}"] = (
#                     batch_df[f"{stat}_h1_m{model}_t0"]
#                     + batch_df[f"{stat}_h1_m{model}_t1"]
#                     + batch_df[f"{stat}_h1_m{model}_t2"]
#                     + batch_df[f"{stat}_h1_m{model}_t3"]
#                     + batch_df[f"{stat}_h1_m{model}_t4"]
#                     + batch_df[f"{stat}_h1_m{model}_t5"]
#                 ) / 6

#             # Calculate mean score over models for current statistic
#             mean_score = np.mean([
#                 batch_df[f"mean_{stat}_score_m{model}"] for model in models
#             ], axis=0)

#             # Add mean score column to DataFrame
#             batch_df[f"mean_{stat}_score"] = mean_score
#         batches.append(batch_df)

#     return pd.concat(batches, ignore_index=True)


In [58]:
# Glob pattern of the h5 files to load.
uniformly_selcted_model1_motifs_h5_dirs = "/scratch1/kamulege/insert_experiments/1000_uniformly_selcted_model1_motifs/*/*/*.h5"

# Read every matching file into one table with SCD and SSD scores for model 1.
uniformly_selcted_model1_motifs_df = load_data(
    h5_dirs=uniformly_selcted_model1_motifs_h5_dirs,
    scd_stats=["SCD", "SSD"],
    models=[1],
)

# Bare last expression so the notebook renders the table.
uniformly_selcted_model1_motifs_df

Unnamed: 0,CTCF_1_genomic_SCD,CTCF_1_insert,SCD_h1_m1_t0,SCD_h1_m1_t1,SCD_h1_m1_t2,SCD_h1_m1_t3,SCD_h1_m1_t4,SCD_h1_m1_t5,SSD_h1_m1_t0,SSD_h1_m1_t1,SSD_h1_m1_t2,SSD_h1_m1_t3,SSD_h1_m1_t4,SSD_h1_m1_t5,background_seqs,mean_SCD_score_m1,mean_SCD_score,mean_SSD_score_m1,mean_SSD_score
0,14.490,"chr4,110020474,110020493,-,20,0,>",22.031250,26.937500,15.382812,13.351562,18.953125,14.101562,22.031250,26.937500,15.382812,13.351562,18.953125,14.101562,0,18.468750,18.468750,18.468750,18.468750
1,14.570,"chr2,17842151,17842170,-,20,0,>",12.351562,15.625000,9.875000,8.562500,11.476562,8.531250,12.351562,15.625000,9.875000,8.562500,11.476562,8.531250,0,11.062500,11.062500,11.062500,11.062500
2,14.620,"chr6,106800147,106800166,-,20,0,>",29.375000,34.875000,18.515625,16.140625,23.921875,17.859375,29.375000,34.875000,18.515625,16.140625,23.921875,17.859375,0,23.437500,23.437500,23.437500,23.437500
3,14.695,"chr10,122090659,122090678,+,20,0,>",12.156250,16.031250,11.757812,10.164062,12.773438,9.468750,12.156250,16.031250,11.757812,10.164062,12.773438,9.468750,0,12.062500,12.062500,12.062500,12.062500
4,14.720,"chr2,76114953,76114972,-,20,0,>",31.953125,39.218750,24.031250,21.015625,28.921875,21.546875,31.953125,39.218750,24.031250,21.015625,28.921875,21.546875,0,27.765625,27.765625,27.765625,27.765625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
630,6.715,"chr10,69656507,69656526,+,20,0,>",15.585938,18.156250,8.218750,7.179688,11.328125,8.507812,15.585938,18.156250,8.218750,7.179688,11.328125,8.507812,0,11.500000,11.500000,11.500000,11.500000
631,6.758,"chr8,32003714,32003733,-,20,0,>",7.218750,9.437500,6.800781,5.914062,7.429688,5.511719,7.218750,9.437500,6.800781,5.914062,7.429688,5.511719,0,7.050781,7.050781,7.050781,7.050781
632,6.816,"chr11,109866716,109866735,+,20,0,>",9.429688,11.695312,6.886719,6.007812,8.265625,6.160156,9.429688,11.695312,6.886719,6.007812,8.265625,6.160156,0,8.078125,8.078125,8.078125,8.078125
633,6.880,"chr17,65541198,65541217,+,20,0,>",15.625000,18.125000,8.070312,7.054688,11.218750,8.437500,15.625000,18.125000,8.070312,7.054688,11.218750,8.437500,0,11.414062,11.414062,11.414062,11.414062


In [None]:
# Show the column labels of the loaded table (bare expression, rendered by the notebook).
uniformly_selcted_model1_motifs_df.columns

In [None]:
# Thresholds on the max-normalized scores that pick the points drawn in the scatter.
MIN_INSERTION_SCD_NORM = 0.8
MAX_DELETION_SCD_NORM = 0.5

# normalize the data: divide each score by its maximum so the largest value becomes 1
max_ctcf = uniformly_selcted_model1_motifs_df["CTCF_1_genomic_SCD"].max()
max_mean_scd = uniformly_selcted_model1_motifs_df["mean_SCD_score"].max()
uniformly_selcted_model1_motifs_df["CTCF_1_genomic_SCD_norm"] = (
    uniformly_selcted_model1_motifs_df["CTCF_1_genomic_SCD"] / max_ctcf
)
uniformly_selcted_model1_motifs_df["mean_SCD_score_norm"] = (
    uniformly_selcted_model1_motifs_df["mean_SCD_score"] / max_mean_scd
)

# create the plot
fig, ax = plt.subplots(figsize=(8, 8))  # , dpi=200

# Rows with a high normalized mean_SCD_score but a low normalized CTCF_1_genomic_SCD.
data = uniformly_selcted_model1_motifs_df[
    (uniformly_selcted_model1_motifs_df["mean_SCD_score_norm"] > MIN_INSERTION_SCD_NORM)
    & (uniformly_selcted_model1_motifs_df["CTCF_1_genomic_SCD_norm"] < MAX_DELETION_SCD_NORM)
]

display(data)

sns.scatterplot(
    data=data,
    ax=ax,
    x="CTCF_1_genomic_SCD_norm",
    y="mean_SCD_score_norm",
    label="mean_SCD_score",
)

# One regression fit over ALL rows (not only the selected ones). This cell used to
# draw the very same fit twice, in red and blue, labelled "High Correlation" and
# "Low Correlation"; both calls produced the identical line, so the legend was wrong.
sns.regplot(
    data=uniformly_selcted_model1_motifs_df,
    ax=ax,
    x="CTCF_1_genomic_SCD_norm",
    y="mean_SCD_score_norm",
    scatter=False,
    color="red",
    label="Linear fit (all motifs)",
)

sns.despine(ax=ax, top=True, right=True)
ax.tick_params(axis="x", rotation=90)
ax.set_title("Distribution of mean SCD scores \n \n ")

# Set after regplot, which would otherwise leave the raw column names as axis labels.
ax.set_xlabel("Deletion SCD (Normalized)")
ax.set_ylabel("Insertion SCD (Normalized)")
ax.legend()
plt.show()



In [None]:
# [i for i in range(-1000,1050,50)]


In [None]:
# NOTE(review): this cell looks like an unfinished template. `df`, the column
# names 'column1' / 'column2', `some_value` and `some_other_value` are not
# defined in any visible cell, so it presumably fails on a fresh kernel --
# replace the placeholders with real names or remove the cell.

# Calculate correlation between two columns
corr = df['column1'].corr(df['column2'])

# Check if correlation coefficient is close to zero
if abs(corr) < 0.1:
    # Select corresponding points for further analysis
    # (rows where both columns equal the given placeholder values)
    selected_points = df.loc[(df['column1'] == some_value) & (df['column2'] == some_other_value)]