# BigWig Correlations

1) From the bigWig files generated in our IHEC pipeline tests, we computed pairwise Pearson correlations.
2) The correlations were computed with bigWigCorrelator on mammouth, which produced the output files.
3) These outputs are used here to generate the correlation matrices.


In [3]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns


# H3K4me1 test samples

In [5]:
# Cap DataFrame previews at 20 rows so long tables are displayed truncated.
pd.set_option('display.max_rows', 20)

# Correlation output for the H3K4me1 samples: tab-separated, no header row,
# three columns (first file, second file, correlation value).
h3k4me1_df = pd.read_csv(
    r'/Users/gfrosi/Documents/Frosi_lab_jacques/test_pipeline_encode/BigWig_test/h3k4me1_out.txt',
    sep="\t",
    header=None,
    names=['File1', 'File2', 'correlation'],
)


In [6]:
# Preview the table as loaded; the file names have not been shortened yet.
h3k4me1_df

Unnamed: 0,File1,File2,correlation
0,SRR9938505_1.nodup_x_ctl_for_rep1.pval.signal_...,SRR9938505_1.nodup_x_ctl_for_rep1.pval.signal_...,0.964376
1,SRR9938505_1.nodup_x_ctl_for_rep1.pval.signal_...,SRR9938506_1.nodup_x_ctl_for_rep2.pval.signal_...,0.448024
2,SRR9938505_1.nodup_x_ctl_for_rep1.pval.signal_...,SRR9938506_1.nodup_x_ctl_for_rep2.pval.signal_...,0.407401
3,SRR9938505_1.nodup_x_ctl_for_rep1.pval.signal_...,SRR9938507_1.nodup_x_ctl_for_rep3.pval.signal_...,0.652358
4,SRR9938505_1.nodup_x_ctl_for_rep1.pval.signal_...,SRR9938507_1.nodup_x_ctl_for_rep3.pval.signal_...,0.621528
...,...,...,...
23,SRR9938507_1.nodup_x_ctl_for_rep3.pval.signal_...,SRR9938508_1.nodup_x_ctl_for_rep4.pval.signal_...,0.723316
24,SRR9938507_1.nodup_x_ctl_for_rep3.pval.signal_...,SRR9938508_1.nodup_x_ctl_for_rep4.pval.signal_...,0.689804
25,SRR9938507_1.nodup_x_ctl_for_rep3.pval.signal_...,SRR9938508_1.nodup_x_ctl_for_rep4.pval.signal_...,0.696294
26,SRR9938507_1.nodup_x_ctl_for_rep3.pval.signal_...,SRR9938508_1.nodup_x_ctl_for_rep4.pval.signal_...,0.712867


In [7]:
# Function to replace part of strings

def replace_substring_bigwig(df, regex: list, columns: list):
    """Return a copy of ``df`` with regex substitutions applied to string columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is copied first and is never modified.
    regex : list of (pattern, replacement) pairs
        Substitutions applied in list order to every column in ``columns``,
        so a later pattern sees the text produced by the earlier ones.
    columns : list of str
        Names of the string columns to rewrite.

    Returns
    -------
    pandas.DataFrame
        The modified copy of ``df``.
    """
    df_local = df.copy()

    for col in columns:
        for pair in regex:
            pattern, replacement = pair[0], pair[1]
            # regex=True is passed explicitly: the default of Series.str.replace
            # became False in pandas 2.0, which would turn patterns such as
            # '^SRR\d+' into literal text that never matches.
            df_local[col] = df_local[col].str.replace(pattern, replacement, regex=True)

    return df_local




In [8]:
# Build a relabelled copy of h3k4me1_df with short, readable sample names.
# NOTE(review): the leading '.' in '.signal' and '.bigwig' is unescaped, so when
# these patterns are applied as regular expressions it matches any character.
h3k4me1_patterns = [
    (r'.signal', ''),
    (r'^SRR\d+\w+\.\w{15}', 'h3k4me1'),
    (r'.bigwig', ''),
]
h3k4me1_df1 = replace_substring_bigwig(
    h3k4me1_df,
    h3k4me1_patterns,
    columns=['File1', 'File2'],
)
h3k4me1_df1

Unnamed: 0,File1,File2,correlation
0,h3k4me1_rep1.pval_1ctrl,h3k4me1_rep1.pval_4ctrl,0.964376
1,h3k4me1_rep1.pval_1ctrl,h3k4me1_rep2.pval_1ctrl,0.448024
2,h3k4me1_rep1.pval_1ctrl,h3k4me1_rep2.pval_4ctrl,0.407401
3,h3k4me1_rep1.pval_1ctrl,h3k4me1_rep3.pval_1ctrl,0.652358
4,h3k4me1_rep1.pval_1ctrl,h3k4me1_rep3.pval_4ctrl,0.621528
...,...,...,...
23,h3k4me1_rep3.pval_1ctrl,h3k4me1_rep4.pval_1ctrl,0.723316
24,h3k4me1_rep3.pval_1ctrl,h3k4me1_rep4.pval_4ctrl,0.689804
25,h3k4me1_rep3.pval_4ctrl,h3k4me1_rep4.pval_1ctrl,0.696294
26,h3k4me1_rep3.pval_4ctrl,h3k4me1_rep4.pval_4ctrl,0.712867


In [20]:
#Heatmap code

%matplotlib

result = h3k4me1_df1.pivot_table(index='File1', columns='File2', values='correlation')
#mask = np.zeros_like(result)
f, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(result, annot=True, fmt=".2g", cmap='coolwarm', square=True, linewidths=.5, cbar_kws={'label': 'Colorbar'}, annot_kws={'size':16})

b, t = plt.ylim() # discover the values for bottom and top
b += 0.5 # Add 0.5 to the bottom
t -= 0.5 # Subtract 0.5 from the top
plt.ylim(b, t) # update the ylim(bottom, top) values


plt.title("H3K4me1", fontsize =20)
plt.xlabel('Replicates', fontsize = 15)
plt.ylabel('Replicates', fontsize = 15)




plt.show()

Using matplotlib backend: MacOSX


# Macrophage test

In [None]:
#running on mammouth

In [10]:
# Correlation output for the macrophage samples: tab-separated, no header row,
# three columns (first file, second file, correlation value).
macrophage_df = pd.read_csv(
    r'/Users/gfrosi/Documents/Frosi_lab_jacques/test_pipeline_encode/BigWig_test/macrophage_out.txt',
    sep="\t",
    header=None,
    names=['File1', 'File2', 'correlation'],
)


In [11]:
# Preview the table as loaded; the file names have not been shortened yet.
macrophage_df

Unnamed: 0,File1,File2,correlation
0,SRR1842723_1.merged.nodup_x_ctl_for_rep1.pval....,SRR1842723_1.merged.nodup_x_ctl_for_rep1.pval....,0.994736
1,SRR1842723_1.merged.nodup_x_ctl_for_rep1.pval....,SRR1842726_1.merged.nodup_x_ctl_for_rep2.pval....,0.810541
2,SRR1842723_1.merged.nodup_x_ctl_for_rep1.pval....,SRR1842726_1.merged.nodup_x_ctl_for_rep2.pval....,0.806681
3,SRR1842723_1.merged.nodup_x_ctl_for_rep1.pval....,SRR1842729_1.merged.nodup_x_ctl_for_rep3.pval....,0.780431
4,SRR1842723_1.merged.nodup_x_ctl_for_rep1.pval....,SRR1842729_1.merged.nodup_x_ctl_for_rep3.pval....,0.776120
...,...,...,...
40,SRR1842732_1.merged.nodup_x_ctl_for_rep4.pval....,SRR333637_1.merged.nodup_x_ctl_for_rep5.pval.s...,0.669112
41,SRR1842732_1.merged.nodup_x_ctl_for_rep4.pval....,SRR333637_1.merged.nodup_x_ctl_for_rep5.pval.s...,0.663132
42,SRR1842732_1.merged.nodup_x_ctl_for_rep4.pval....,SRR333637_1.merged.nodup_x_ctl_for_rep5.pval.s...,0.666463
43,SRR1842732_1.merged.nodup_x_ctl_for_rep4.pval....,SRR333637_1.merged.nodup_x_ctl_for_rep5.pval.s...,0.671882


In [12]:
# Shorten the macrophage file names.
# These names carry an extra 'merged' segment, so the SRR pattern needs one
# more '\w+\.' group than the one used for the H3K4me1 files.
# NOTE(review): the leading '.' in '.signal' and '.bigwig' is unescaped, so when
# these patterns are applied as regular expressions it matches any character.
macrophage_patterns = [
    (r'.signal', ''),
    (r'^SRR\d+\w+\.\w+\.\w{15}', 'macrophage'),
    (r'.bigwig', ''),
]
macrophage_df1 = replace_substring_bigwig(
    macrophage_df,
    macrophage_patterns,
    columns=['File1', 'File2'],
)
macrophage_df1

Unnamed: 0,File1,File2,correlation
0,macrophage_rep1.pval_1ctrl,macrophage_rep1.pval_4ctrl,0.994736
1,macrophage_rep1.pval_1ctrl,macrophage_rep2.pval_1ctrl,0.810541
2,macrophage_rep1.pval_1ctrl,macrophage_rep2.pval_4ctrl,0.806681
3,macrophage_rep1.pval_1ctrl,macrophage_rep3.pval_1ctrl,0.780431
4,macrophage_rep1.pval_1ctrl,macrophage_rep3.pval_4ctrl,0.776120
...,...,...,...
40,macrophage_rep4.pval_1ctrl,macrophage_rep5.pval_1ctrl,0.669112
41,macrophage_rep4.pval_1ctrl,macrophage_rep5.pval_4ctrl,0.663132
42,macrophage_rep4.pval_4ctrl,macrophage_rep5.pval_1ctrl,0.666463
43,macrophage_rep4.pval_4ctrl,macrophage_rep5.pval_4ctrl,0.671882


In [21]:
#heatmap macrophage 

%matplotlib

result_1 = macrophage_df1.pivot_table(index='File1', columns='File2', values='correlation')
#mask = np.zeros_like(result)
f, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(result_1, annot=True, fmt=".2g", square=True, linewidths=.5, cbar_kws={'label': 'Colorbar'}, annot_kws={'size':16})


#To avoid the cutted squares at the top and bottom is necessary to add a space

b, t = plt.ylim() # discover the values for bottom and top
b += 0.5 # Add 0.5 to the bottom
t -= 0.5 # Subtract 0.5 from the top
plt.ylim(b, t) # update the ylim(bottom, top) values



plt.title("Macrophage", fontsize =20)
plt.xlabel('Replicates', fontsize = 15)
plt.ylabel('Replicates', fontsize = 15)
plt.show() # ta-da!



Using matplotlib backend: MacOSX


# Correlation among all samples

In [23]:
# Correlation output for all samples together: tab-separated, no header row,
# three columns (first file, second file, correlation value).
all_df = pd.read_csv(
    r'/Users/gfrosi/Documents/Frosi_lab_jacques/test_pipeline_encode/BigWig_test/all_samples_out.txt',
    sep="\t",
    header=None,
    names=['File1', 'File2', 'correlation'],
)


In [24]:
# Preview the table as loaded; the file names have not been shortened yet.
all_df

Unnamed: 0,File1,File2,correlation
0,SRR1842723_1.merged.nodup_x_ctl_for_rep1.pval....,SRR1842723_1.merged.nodup_x_ctl_for_rep1.pval....,0.994736
1,SRR1842723_1.merged.nodup_x_ctl_for_rep1.pval....,SRR1842726_1.merged.nodup_x_ctl_for_rep2.pval....,0.810541
2,SRR1842723_1.merged.nodup_x_ctl_for_rep1.pval....,SRR1842726_1.merged.nodup_x_ctl_for_rep2.pval....,0.806681
3,SRR1842723_1.merged.nodup_x_ctl_for_rep1.pval....,SRR1842729_1.merged.nodup_x_ctl_for_rep3.pval....,0.780431
4,SRR1842723_1.merged.nodup_x_ctl_for_rep1.pval....,SRR1842729_1.merged.nodup_x_ctl_for_rep3.pval....,0.776120
...,...,...,...
148,SRR9938507_1.nodup_x_ctl_for_rep3.pval.signal_...,SRR9938508_1.nodup_x_ctl_for_rep4.pval.signal_...,0.723316
149,SRR9938507_1.nodup_x_ctl_for_rep3.pval.signal_...,SRR9938508_1.nodup_x_ctl_for_rep4.pval.signal_...,0.689804
150,SRR9938507_1.nodup_x_ctl_for_rep3.pval.signal_...,SRR9938508_1.nodup_x_ctl_for_rep4.pval.signal_...,0.696294
151,SRR9938507_1.nodup_x_ctl_for_rep3.pval.signal_...,SRR9938508_1.nodup_x_ctl_for_rep4.pval.signal_...,0.712867


In [28]:
# Shorten the file names of the combined table.
# Two SRR patterns are needed here: one for the macrophage files (with the
# extra 'merged' segment) and one for the H3K4me1 files.
# NOTE(review): the leading '.' in '.signal' and '.bigwig' is unescaped, so when
# these patterns are applied as regular expressions it matches any character.
all_patterns = [
    (r'^SRR\d+\w+\.\w+\.\w{15}', 'macrophage'),
    (r'^SRR\d+\w+\.\w{15}', 'h3k4me1'),
    (r'.signal', ''),
    (r'.bigwig', ''),
]
all_df1 = replace_substring_bigwig(
    all_df,
    all_patterns,
    columns=['File1', 'File2'],
)
all_df1


Unnamed: 0,File1,File2,correlation
0,macrophage_rep1.pval_1ctrl,macrophage_rep1.pval_4ctrl,0.994736
1,macrophage_rep1.pval_1ctrl,macrophage_rep2.pval_1ctrl,0.810541
2,macrophage_rep1.pval_1ctrl,macrophage_rep2.pval_4ctrl,0.806681
3,macrophage_rep1.pval_1ctrl,macrophage_rep3.pval_1ctrl,0.780431
4,macrophage_rep1.pval_1ctrl,macrophage_rep3.pval_4ctrl,0.776120
...,...,...,...
148,h3k4me1_rep3.pval_1ctrl,h3k4me1_rep4.pval_1ctrl,0.723316
149,h3k4me1_rep3.pval_1ctrl,h3k4me1_rep4.pval_4ctrl,0.689804
150,h3k4me1_rep3.pval_4ctrl,h3k4me1_rep4.pval_1ctrl,0.696294
151,h3k4me1_rep3.pval_4ctrl,h3k4me1_rep4.pval_4ctrl,0.712867


In [32]:
# No %matplotlib magic in this cell: the figure is drawn with whichever
# backend is already active.

# Pivot the long (File1, File2, correlation) table into a File1 x File2 matrix.
result_2 = all_df1.pivot_table(index='File1', columns='File2', values='correlation')

f, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(
    result_2,
    annot=True,
    fmt=".2g",
    cmap="BuPu",
    square=True,
    linewidths=.5,
    cbar_kws={'label': 'Colorbar'},
    annot_kws={'size': 8},
    ax=ax,
)

# Pin the y-limits to the full grid (one unit per row, first row at the top).
# This keeps the top and bottom rows from being drawn half-clipped (reported
# for some matplotlib releases, e.g. 3.1.1 -- TODO confirm the installed
# version). Shifting the current limits by +/-0.5 only compensates when that
# clipping is present; otherwise it adds half a row of empty space.
ax.set_ylim(result_2.shape[0], 0)

ax.set_title("H3K4me1 & Macrophage", fontsize=20)
ax.set_xlabel('Replicates', fontsize=15)
ax.set_ylabel('Replicates', fontsize=15)

plt.show()
