In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import seaborn as sns
import os
import re
from scipy import stats
import gc
np.random.seed(12)
import matplotlib.pyplot as plt
from statsmodels.stats.multitest import multipletests

In [2]:
def filterParquet(path):
	# Note, will keep the chrom column on the dataframe
	df = pd.read_parquet(path)

	# Shift methylation to between 0 and 100.
	df = df * 100

	assert df.isnull().sum().sum() == 0, "There are missing values in the dataframe"

	# Add a chromosome column
	df['chrom'] = df.index.str.split(':').str[0]

	# Drop the Sex chromosomes
	df = df[df['chrom'].str.match(pat=r'[0-9]{1,2}')]
	
	return df

In [3]:
angus_all = filterParquet('./aligned2Angus/All.samples.meth.10X.parquet')
brahman_all = filterParquet('./aligned2Brahman/All.samples.meth.10X.parquet')

In [31]:
# Get the methylation of every site in the parquet file and
# for each sample and compare, at the sample level if there is a
# significant difference in the methylation distribution

p_values = {sample: [] for sample in angus_all.columns[:-1]}

means = {'sampleID':[],'group':[],
		 'n_CpGs':[],
		 'angus_methylation_mean':[],
		 'brahman_methylation_mean':[],
		 'angus_methylation_std':[],
		 'brahman_methylation_std':[],
		 'difference':[],
		 'p_value':[],
		 'adjusted_p_value':[]}



for sample in angus_all.columns[:-1]:

	group = sample.split("_")[0]
	sampleID = sample.split("_")[1]

	# Create an index of each genome for any necessary
	# downsampling
	angus_all_idx = np.arange(angus_all.shape[0])
	brahman_all_idx = np.arange(brahman_all.shape[0])
	if len(angus_all_idx) > len(brahman_all_idx):
		angus_all_idx = np.random.choice(angus_all_idx, len(brahman_all_idx), replace=False)
	elif len(angus_all_idx) < len(brahman_all_idx):
		brahman_all_idx = np.random.choice(brahman_all_idx, len(angus_all_idx), replace=False)

	mapped_2_angus_methylation = angus_all[sample].values
	mapped_2_brahman_methylation = brahman_all[sample].values

	# Downsample the methylation arrays
	mapped_2_angus_methylation = mapped_2_angus_methylation[angus_all_idx]
	mapped_2_brahman_methylation = mapped_2_brahman_methylation[brahman_all_idx]

	# Compute the mean and median methylation for each sample
	# and genome
	means['sampleID'].append(sampleID)
	means['n_CpGs'].append(len(mapped_2_angus_methylation))
	means['angus_methylation_mean'].append(mapped_2_angus_methylation.mean())
	means['angus_methylation_std'].append(np.std(mapped_2_angus_methylation))
	#means['reference'].append('Angus')
	means['group'].append(group)
	
	#means['sampleID'].append(sampleID)
	means['brahman_methylation_mean'].append(mapped_2_brahman_methylation.mean())
	means['brahman_methylation_std'].append(np.std(mapped_2_brahman_methylation))
	#means['reference'].append('Brahman')
	#means['group'].append(group)


	# Compare the distributions of each sample
	_, pvalue = stats.wilcoxon(mapped_2_angus_methylation, mapped_2_brahman_methylation)
	p_values[sample].append(pvalue)
	means['p_value'].append(pvalue)

	# Get the percentage difference between the means
	# of the two distributions
	#percentage_diff = ((mapped_2_angus_methylation.mean() - mapped_2_brahman_methylation.mean()) / (mapped_2_angus_methylation.mean() + mapped_2_brahman_methylation.mean()/2)) * 100
	#difference = np.mean((np.abs(mapped_2_angus_methylation - mapped_2_brahman_methylation)))
	print(f"Sample: {sample}\nQuantification bias: {np.abs(mapped_2_angus_methylation.mean() - mapped_2_brahman_methylation.mean())}")
	difference = np.abs(mapped_2_angus_methylation.mean() - mapped_2_brahman_methylation.mean())
	#means['difference'].append(np.abs(percentage_diff))
	means['difference'].append(difference)
	#print(f"Sample: {sample}, p-value: {pvalue}")

Sample: AxA_F103
Quantification bias: 2.3603473613008297
Sample: AxA_F105
Quantification bias: 2.3595326171354696
Sample: AxA_F52
Quantification bias: 2.6271052784220776
Sample: AxA_F53
Quantification bias: 2.366704981203307
Sample: AxA_F60
Quantification bias: 2.3231867315550794
Sample: AxA_F7
Quantification bias: 2.3399440500307103
Sample: AxB_F100
Quantification bias: 1.4773883072076308
Sample: AxB_F104
Quantification bias: 1.1128804587393681
Sample: AxB_F106
Quantification bias: 1.0850219367152938
Sample: AxB_F61
Quantification bias: 1.389872680025008
Sample: AxB_F74
Quantification bias: 1.121484169765189
Sample: AxB_F97
Quantification bias: 0.01143455834836027
Sample: BxA_F13
Quantification bias: 0.8705831345916241
Sample: BxA_F62
Quantification bias: 0.834458390535048
Sample: BxA_F77
Quantification bias: 0.963108923837918
Sample: BxA_F80
Quantification bias: 0.9675014492105982
Sample: BxA_F8
Quantification bias: 0.93452920035478
Sample: BxA_F91
Quantification bias: 0.904043455861

In [32]:
# Test the p-values for multiple testing
multipletests(pd.DataFrame(p_values).values.flatten(), alpha=0.01, method='fdr_bh')
means['adjusted_p_value'].extend(multipletests(pd.DataFrame(p_values).values.flatten(), alpha=0.01, method='fdr_bh')[1])

In [33]:
pd.DataFrame(means).to_csv('/Users/callummacphillamy/Library/CloudStorage/OneDrive-UniversityofAdelaide/PhD/methylation_chapter/GigaScience_submission/supplementary_tables/all_CpGs_sample-wise.csv', index=False)

In [4]:
# S table re. Figure3A variance and what not
all_cpgs = {'AxA':{'angus_ref_methylation':[],
				   'brahman_ref_methylation':[]},
			'AxB':{'angus_ref_methylation':[],
				   'brahman_ref_methylation':[]},
			'BxA':{'angus_ref_methylation':[],
		           'brahman_ref_methylation':[]},
			'BxB':{'angus_ref_methylation':[],
		           'brahman_ref_methylation':[]}}

stable_dict = {'sampleID':[],
			   'n_CpGs':[],
			   'angus_methylation_mean':[],
			   'brahman_methylation_mean':[],
			   'angus_methylation_std_dev':[],
			   'brahman_methylation_std_dev':[],
			   'difference':[],
			   'p_value':[],
			   'group':[]}
for sample in tqdm(angus_all.columns[:-1]):

	group = sample.split("_")[0]
	sampleID = sample.split("_")[1]

	# Create an index of each genome for any necessary
	# downsampling
	angus_all_idx = np.arange(angus_all.shape[0])
	brahman_all_idx = np.arange(brahman_all.shape[0])
	if len(angus_all_idx) > len(brahman_all_idx):
		angus_all_idx = np.random.choice(angus_all_idx, len(brahman_all_idx), replace=False)
	elif len(angus_all_idx) < len(brahman_all_idx):
		brahman_all_idx = np.random.choice(brahman_all_idx, len(angus_all_idx), replace=False)

	mapped_2_angus_methylation = angus_all[sample].values
	mapped_2_brahman_methylation = brahman_all[sample].values

	# Downsample the methylation arrays
	mapped_2_angus_methylation = mapped_2_angus_methylation[angus_all_idx]
	mapped_2_brahman_methylation = mapped_2_brahman_methylation[brahman_all_idx]

	# Compute the mean and median methylation for each sample
	# and genome
	stable_dict['sampleID'].append(sampleID)
	assert len(mapped_2_angus_methylation) == len(mapped_2_brahman_methylation), "Downsampling failed"
	stable_dict['n_CpGs'].append(len(mapped_2_angus_methylation))
	stable_dict['angus_methylation_mean'].append(mapped_2_angus_methylation.mean())
	stable_dict['brahman_methylation_mean'].append(mapped_2_brahman_methylation.mean())
	stable_dict['angus_methylation_std_dev'].append(np.std(mapped_2_angus_methylation))
	stable_dict['brahman_methylation_std_dev'].append(np.std(mapped_2_brahman_methylation))
	stable_dict['difference'].append(np.abs(mapped_2_angus_methylation.mean() - mapped_2_brahman_methylation.mean()))
	stable_dict['p_value'].append(stats.wilcoxon(mapped_2_angus_methylation, mapped_2_brahman_methylation).pvalue)
	stable_dict['group'].append(group)
	
	# stable_dict['sampleID'].append(sampleID)
	# stable_dict['methylation_mean'].append(mapped_2_brahman_methylation.mean())
	# stable_dict['methylation_std_dev'].append(np.std(mapped_2_brahman_methylation))
	# stable_dict['reference'].append('Brahman')
	# stable_dict['group'].append(group)

	all_cpgs[group]['angus_ref_methylation'].extend(mapped_2_angus_methylation)
	all_cpgs[group]['brahman_ref_methylation'].extend(mapped_2_brahman_methylation)
#pd.DataFrame.from_dict(stable_dict).to_csv('/Users/callummacphillamy/Library/CloudStorage/OneDrive-UniversityofAdelaide/PhD/methylation_chapter/GigaScience_submission/supplementary_tables/all_CpGs_per_sample.csv', index=False)



100%|██████████| 24/24 [05:05<00:00, 12.72s/it]


In [37]:
for group in all_cpgs.keys():
	print(pd.DataFrame(all_cpgs[group]))
	break

KeyboardInterrupt: 

In [6]:
group_p_vals = {'group':[],
				'n_CpGs':[],
				'angus_methylation_mean':[],
				'brahman_methylation_mean':[],
				'angus_methylation_std_dev':[],
				'brahman_methylation_std_dev':[],
				'difference':[],
				'p_value':[],
				'adjusted_p_value':[]}
pvals = []

for group in all_cpgs.keys():
	#print(f'Calculating p-value for group: {group}')
	assert len(all_cpgs[group]['angus_ref_methylation']) == len(all_cpgs[group]['brahman_ref_methylation']), "Downsampling failed"
	_, pvalue = stats.wilcoxon(all_cpgs[group]['angus_ref_methylation'], all_cpgs[group]['brahman_ref_methylation'])
	#print(f"Group: {group}, p-value: {pvalue}")
	group_p_vals['group'].append(group)
	group_p_vals['n_CpGs'].append(len(all_cpgs[group]['angus_ref_methylation']))
	group_p_vals['angus_methylation_mean'].append(np.mean(all_cpgs[group]['angus_ref_methylation']))
	group_p_vals['brahman_methylation_mean'].append(np.mean(all_cpgs[group]['brahman_ref_methylation']))
	group_p_vals['angus_methylation_std_dev'].append(np.std(all_cpgs[group]['angus_ref_methylation']))
	group_p_vals['brahman_methylation_std_dev'].append(np.std(all_cpgs[group]['brahman_ref_methylation']))
	group_p_vals['difference'].append(np.abs(np.mean(all_cpgs[group]['angus_ref_methylation']) - np.mean(all_cpgs[group]['brahman_ref_methylation'])))
	group_p_vals['p_value'].append(pvalue)
	pvals.append(pvalue)

# Test the p-values for multiple testing
multipletests(pvals, alpha=0.01, method='fdr_bh')
group_p_vals['adjusted_p_value'] = multipletests(pvals, alpha=0.01, method='fdr_bh')[1]
pd.DataFrame(group_p_vals)

Unnamed: 0,group,n_CpGs,angus_methylation_mean,brahman_methylation_mean,angus_methylation_std_dev,brahman_methylation_std_dev,difference,p_value,adjusted_p_value
0,AxA,128592426,55.647847,53.669505,24.385631,25.980464,1.978342,0.0,0.0
1,AxB,128592426,52.18483,51.39367,23.500377,24.145755,0.79116,0.0,0.0
2,BxA,128592426,50.443728,49.756729,23.384975,23.916931,0.686999,0.0,0.0
3,BxB,128592426,52.726276,53.070719,25.1318,24.898284,0.344443,0.0,0.0


In [5]:
# descriptive_stats_dict = {}
# means = {'sampleID':[],'methylation':[],'reference':[], 'group':[]}
# for sample in tqdm(angus_all.columns[:-1]):
# 	#methylation_dict = {'sampleID':[],'methylation':[],'reference':[],
# 	#'group':[]}
	
# 	# get the methylation
# 	methylation = angus_all[sample].values
# 	meth_stats = angus_all[sample].describe()
# 	# get the reference
# 	reference = 'Angus'
# 	# get the sampleID
# 	sampleID = sample.split("_")[1]
# 	group = sample.split('_')[0]
# 	# append to the dictionary
# 	# methylation_dict['sampleID'].extend(sampleID)
# 	# methylation_dict['methylation'].extend(list(methylation))
# 	# methylation_dict['reference'].extend(list(reference))
# 	# methylation_dict['group'].extend(list(group))
# 	descriptive_stats_dict[f'{sampleID}_{reference}'] = meth_stats
# 	means['sampleID'].append(sampleID)
# 	means['methylation'].append(np.mean(methylation))
# 	means['reference'].append(reference)
# 	means['group'].append(group)
# 	# pd.DataFrame.from_dict(methylation_dict).to_csv(
# 	# 	plotting_file,
# 	# 	header=False,
# 	# 	index=False)

# del angus_all
# gc.collect()

# for sample in tqdm(brahman_all.columns[:-1]):
# 	#methylation_dict = {'sampleID':[],'methylation':[],'reference':[],
# 	#'group':[]}
# 	# get the methylation
# 	methylation = brahman_all[sample].values
# 	meth_stats = brahman_all[sample].describe()
# 	# get the reference
# 	reference = 'Brahman'
# 	# get the sampleID
# 	sampleID = sample.split("_")[1]
# 	group = sample.split('_')[0]
# 	# append to the dictionary
# 	# methylation_dict['sampleID'].extend(sampleID)
# 	# methylation_dict['methylation'].extend(list(methylation))
# 	# methylation_dict['reference'].extend(list(reference))
# 	# methylation_dict['group'].extend(list(group))
# 	descriptive_stats_dict[f'{sampleID}_{reference}'] = meth_stats
# 	means['sampleID'].append(sampleID)
# 	means['methylation'].append(np.mean(methylation))
# 	means['reference'].append(reference)
# 	means['group'].append(group)
# 	# pd.DataFrame.from_dict(methylation_dict).to_csv(
# 	# 	plotting_file,
# 	# 	header=False,
# 	# 	index=False)

# del brahman_all
# gc.collect()

# print('Done')

100%|██████████| 24/24 [00:16<00:00,  1.46it/s]
100%|██████████| 24/24 [00:16<00:00,  1.45it/s]


Done


In [7]:
df = pd.DataFrame.from_dict(means)

In [9]:
colour_maps = ['dark','muted','bright','pastel']
for n, group in enumerate(['AxA', 'AxB','BxA','BxB']):
	plt.close()
	sns.set_style('whitegrid')

	mosaic_str = '''
	A
	B
	'''

	fig = plt.figure(constrained_layout=True)
	fig, ax = plt.subplot_mosaic(mosaic_str, 
							 gridspec_kw={'height_ratios': [3, 0.1]})

	sns.swarmplot(data=df[df['group'] == group],
			x='reference', y='methylation_mean',
			hue='sampleID', ax=ax['A'],
			palette=colour_maps[n])
	sns.swarmplot(data=df[df['group'] == group],
			x='reference', y='methylation_mean',
			hue='sampleID', ax=ax['B'],
			palette=colour_maps[n])
	fig.set_size_inches(2,4)
	ax['A'].set_ylim(45, 70)
	ax['B'].set_ylim(0,20)
	ax['A'].get_xaxis().set_visible(False)
	ax['A'].set_ylabel("Methylation %")
	ax['B'].set_ylabel("")
	ax['B'].set_xlabel("")
	ax['A'].legend(loc=(1.01,0.48), title='Sample ID')
	ax['A'].xaxis.tick_top()
	ax['B'].xaxis.tick_bottom()
	ax['B'].get_legend().remove()
	ax['A'].set_title(f"Group = {group}")
	ax['B'].set_yticks([])
	ax['B'].set_xlabel("Reference")
	plt.savefig(f'/Users/callummacphillamy/Library/CloudStorage/OneDrive-UniversityofAdelaide/PhD/methylation_chapter/GigaScience_submission/figures/{group}_allCpGs_methylation.png', dpi=400, bbox_inches='tight')
	plt.close()

<Figure size 640x480 with 0 Axes>

In [12]:
group_df

Unnamed: 0,sampleID,methylation_mean,methylation_median,reference,group
36,F22,51.339325,56.4516,Angus,BxB
37,F22,51.677111,56.6667,Brahman,BxB
38,F46,60.019766,68.0851,Angus,BxB
39,F46,60.35379,68.2927,Brahman,BxB
40,F56,49.711692,54.2857,Angus,BxB
41,F56,50.00514,54.5455,Brahman,BxB
42,F65,49.966933,54.8387,Angus,BxB
43,F65,50.364694,55.0,Brahman,BxB
44,F78,52.664885,58.3333,Angus,BxB
45,F78,53.029043,58.6207,Brahman,BxB


In [11]:
for group in ['AxA', 'AxB','BxA','BxB']:
	group_df = df[df['group'] == group]
	angus = group_df[group_df['reference'] == 'Angus']
	brahman = group_df[group_df['reference'] == 'Brahman']
	angus_meth = angus['methylation_mean'].values
	brahman_meth = brahman['methylation_mean'].values
	test, p = stats.wilcoxon(angus_meth, brahman_meth)
	print('Group:', group)
	print('Test Statistic:', test)
	print('P-value:', p)
	print('')

Group: AxA
Test Statistic: 0.0
P-value: 0.03125

Group: AxB
Test Statistic: 1.0
P-value: 0.0625

Group: BxA
Test Statistic: 0.0
P-value: 0.03125

Group: BxB
Test Statistic: 0.0
P-value: 0.03125



## Shared CpGs

In [3]:
# load the data representing methylation when aligned to Angus
angus_all = filterParquet('./aligned2Angus/All.samples.Consensus.meth.10X.parquet')
brahman_all = filterParquet('./aligned2Brahman/All.samples.Consensus.meth.10X.parquet')

In [7]:
# Get the methylation of every site in the parquet file and
# for each sample and compare, at the sample level if there is a
# significant difference in the methylation distribution

# Get the methylation of every site in the parquet file and
# for each sample and compare, at the sample level if there is a
# significant difference in the methylation distribution

p_values = {sample: [] for sample in angus_all.columns[:-1]}

means = {'sampleID':[],'group':[],
		 'n_CpGs':[],
		 'angus_methylation_mean':[],
		 'brahman_methylation_mean':[],
		 'angus_methylation_std':[],
		 'brahman_methylation_std':[],
		 'difference':[],
		 'p_value':[],
		 'adjusted_p_value':[]}



for sample in angus_all.columns[:-1]:

	group = sample.split("_")[0]
	sampleID = sample.split("_")[1]

	# Create an index of each genome for any necessary
	# downsampling
	angus_all_idx = np.arange(angus_all.shape[0])
	brahman_all_idx = np.arange(brahman_all.shape[0])
	if len(angus_all_idx) > len(brahman_all_idx):
		angus_all_idx = np.random.choice(angus_all_idx, len(brahman_all_idx), replace=False)
	elif len(angus_all_idx) < len(brahman_all_idx):
		brahman_all_idx = np.random.choice(brahman_all_idx, len(angus_all_idx), replace=False)

	mapped_2_angus_methylation = angus_all[sample].values
	mapped_2_brahman_methylation = brahman_all[sample].values

	# Downsample the methylation arrays
	mapped_2_angus_methylation = mapped_2_angus_methylation[angus_all_idx]
	mapped_2_brahman_methylation = mapped_2_brahman_methylation[brahman_all_idx]

	# Compute the mean and median methylation for each sample
	# and genome
	means['sampleID'].append(sampleID)
	means['n_CpGs'].append(len(mapped_2_angus_methylation))
	means['angus_methylation_mean'].append(mapped_2_angus_methylation.mean())
	means['angus_methylation_std'].append(np.std(mapped_2_angus_methylation))
	#means['reference'].append('Angus')
	means['group'].append(group)
	
	#means['sampleID'].append(sampleID)
	means['brahman_methylation_mean'].append(mapped_2_brahman_methylation.mean())
	means['brahman_methylation_std'].append(np.std(mapped_2_brahman_methylation))
	#means['reference'].append('Brahman')
	#means['group'].append(group)


	# Compare the distributions of each sample
	_, pvalue = stats.wilcoxon(mapped_2_angus_methylation, mapped_2_brahman_methylation)
	p_values[sample].append(pvalue)
	means['p_value'].append(pvalue)

	# Get the percentage difference between the means
	# of the two distributions
	#percentage_diff = ((mapped_2_angus_methylation.mean() - mapped_2_brahman_methylation.mean()) / (mapped_2_angus_methylation.mean() + mapped_2_brahman_methylation.mean()/2)) * 100
	#difference = np.mean((np.abs(mapped_2_angus_methylation - mapped_2_brahman_methylation)))
	print(f"Sample: {sample}\nQuantification bias: {np.abs(mapped_2_angus_methylation.mean() - mapped_2_brahman_methylation.mean())}")
	difference = np.abs(mapped_2_angus_methylation.mean() - mapped_2_brahman_methylation.mean())
	#means['difference'].append(np.abs(percentage_diff))
	means['difference'].append(difference)
	#print(f"Sample: {sample}, p-value: {pvalue}")

Sample: AxA_F103
Quantification bias: 0.0564830957259872
Sample: AxA_F105
Quantification bias: 0.06455779129370853
Sample: AxA_F52
Quantification bias: 0.06274313680675192
Sample: AxA_F53
Quantification bias: 0.049638179956417616
Sample: AxA_F60
Quantification bias: 0.044076632715267294
Sample: AxA_F7
Quantification bias: 0.05164322907960184
Sample: AxB_F100
Quantification bias: 0.04284799230181591
Sample: AxB_F104
Quantification bias: 0.03288743230156399
Sample: AxB_F106
Quantification bias: 0.03582452031492522
Sample: AxB_F61
Quantification bias: 0.03815921084444085
Sample: AxB_F74
Quantification bias: 0.038663329894014
Sample: AxB_F97
Quantification bias: 0.025899768595458283
Sample: BxA_F13
Quantification bias: 0.029549184667231998
Sample: BxA_F62
Quantification bias: 0.028313696589876258
Sample: BxA_F77
Quantification bias: 0.03109646924883691
Sample: BxA_F80
Quantification bias: 0.03514137468794587
Sample: BxA_F8
Quantification bias: 0.03825622704746934
Sample: BxA_F91
Quantifica

In [8]:
# Test the p-values for multiple testing
multipletests(pd.DataFrame(p_values).values.flatten(), alpha=0.01, method='fdr_bh')
means['adjusted_p_value'].extend(multipletests(pd.DataFrame(p_values).values.flatten(), alpha=0.01, method='fdr_bh')[1])
pd.DataFrame(means).head()

Unnamed: 0,sampleID,group,n_CpGs,angus_methylation_mean,brahman_methylation_mean,angus_methylation_std,brahman_methylation_std,difference,p_value,adjusted_p_value
0,F103,AxA,16204834,57.057316,57.000833,24.437498,24.465181,0.056483,1.235371e-09,7.412226e-09
1,F105,AxA,16204834,62.081784,62.017226,25.810093,25.843298,0.064558,3.776387e-13,9.063328e-12
2,F52,AxA,16204834,57.873408,57.810665,24.353167,24.381333,0.062743,2.12218e-12,2.546616e-11
3,F53,AxA,16204834,53.734503,53.684865,23.452634,23.481419,0.049638,1.363609e-08,6.545324e-08
4,F60,AxA,16204834,50.491531,50.447455,23.002033,23.028587,0.044077,4.053988e-08,1.621595e-07


In [9]:
pd.DataFrame(means).to_csv('/Users/callummacphillamy/Library/CloudStorage/OneDrive-UniversityofAdelaide/PhD/methylation_chapter/GigaScience_submission/supplementary_tables/shared_CpGs_sample-wise.csv', index=False)

In [10]:
# S table re. Figure3A variance and what not
all_cpgs = {'AxA':{'angus_ref_methylation':[],
				   'brahman_ref_methylation':[]},
			'AxB':{'angus_ref_methylation':[],
				   'brahman_ref_methylation':[]},
			'BxA':{'angus_ref_methylation':[],
		           'brahman_ref_methylation':[]},
			'BxB':{'angus_ref_methylation':[],
		           'brahman_ref_methylation':[]}}

stable_dict = {'sampleID':[],
			   'n_CpGs':[],
			   'angus_methylation_mean':[],
			   'brahman_methylation_mean':[],
			   'angus_methylation_std_dev':[],
			   'brahman_methylation_std_dev':[],
			   'difference':[],
			   'p_value':[],
			   'group':[]}
for sample in tqdm(angus_all.columns[:-1]):

	group = sample.split("_")[0]
	sampleID = sample.split("_")[1]

	# Create an index of each genome for any necessary
	# downsampling
	angus_all_idx = np.arange(angus_all.shape[0])
	brahman_all_idx = np.arange(brahman_all.shape[0])
	if len(angus_all_idx) > len(brahman_all_idx):
		angus_all_idx = np.random.choice(angus_all_idx, len(brahman_all_idx), replace=False)
	elif len(angus_all_idx) < len(brahman_all_idx):
		brahman_all_idx = np.random.choice(brahman_all_idx, len(angus_all_idx), replace=False)

	mapped_2_angus_methylation = angus_all[sample].values
	mapped_2_brahman_methylation = brahman_all[sample].values

	# Downsample the methylation arrays
	mapped_2_angus_methylation = mapped_2_angus_methylation[angus_all_idx]
	mapped_2_brahman_methylation = mapped_2_brahman_methylation[brahman_all_idx]

	# Compute the mean and median methylation for each sample
	# and genome
	stable_dict['sampleID'].append(sampleID)
	assert len(mapped_2_angus_methylation) == len(mapped_2_brahman_methylation), "Downsampling failed"
	stable_dict['n_CpGs'].append(len(mapped_2_angus_methylation))
	stable_dict['angus_methylation_mean'].append(mapped_2_angus_methylation.mean())
	stable_dict['brahman_methylation_mean'].append(mapped_2_brahman_methylation.mean())
	stable_dict['angus_methylation_std_dev'].append(np.std(mapped_2_angus_methylation))
	stable_dict['brahman_methylation_std_dev'].append(np.std(mapped_2_brahman_methylation))
	stable_dict['difference'].append(np.abs(mapped_2_angus_methylation.mean() - mapped_2_brahman_methylation.mean()))
	stable_dict['p_value'].append(stats.wilcoxon(mapped_2_angus_methylation, mapped_2_brahman_methylation).pvalue)
	stable_dict['group'].append(group)
	
	# stable_dict['sampleID'].append(sampleID)
	# stable_dict['methylation_mean'].append(mapped_2_brahman_methylation.mean())
	# stable_dict['methylation_std_dev'].append(np.std(mapped_2_brahman_methylation))
	# stable_dict['reference'].append('Brahman')
	# stable_dict['group'].append(group)

	all_cpgs[group]['angus_ref_methylation'].extend(mapped_2_angus_methylation)
	all_cpgs[group]['brahman_ref_methylation'].extend(mapped_2_brahman_methylation)
#pd.DataFrame.from_dict(stable_dict).to_csv('/Users/callummacphillamy/Library/CloudStorage/OneDrive-UniversityofAdelaide/PhD/methylation_chapter/GigaScience_submission/supplementary_tables/shared_CpGs_per_sample.csv', index=False)


  0%|          | 0/24 [00:00<?, ?it/s]

100%|██████████| 24/24 [03:39<00:00,  9.14s/it]


In [11]:
group_p_vals = {'group':[],
				'n_CpGs':[],
				'angus_methylation_mean':[],
				'brahman_methylation_mean':[],
				'angus_methylation_std_dev':[],
				'brahman_methylation_std_dev':[],
				'difference':[],
				'p_value':[],
				'adjusted_p_value':[]}

p_values = []

for group in all_cpgs.keys():
	#print(f'Calculating p-value for group: {group}')
	assert len(all_cpgs[group]['angus_ref_methylation']) == len(all_cpgs[group]['brahman_ref_methylation']), "Downsampling failed"
	_, pvalue = stats.wilcoxon(all_cpgs[group]['angus_ref_methylation'], all_cpgs[group]['brahman_ref_methylation'])
	#print(f"Group: {group}, p-value: {pvalue}")
	group_p_vals['group'].append(group)
	group_p_vals['n_CpGs'].append(len(all_cpgs[group]['angus_ref_methylation']))
	group_p_vals['angus_methylation_mean'].append(np.mean(all_cpgs[group]['angus_ref_methylation']))
	group_p_vals['brahman_methylation_mean'].append(np.mean(all_cpgs[group]['brahman_ref_methylation']))
	group_p_vals['angus_methylation_std_dev'].append(np.std(all_cpgs[group]['angus_ref_methylation']))
	group_p_vals['brahman_methylation_std_dev'].append(np.std(all_cpgs[group]['brahman_ref_methylation']))
	group_p_vals['difference'].append(np.abs(np.mean(all_cpgs[group]['angus_ref_methylation']) - np.mean(all_cpgs[group]['brahman_ref_methylation'])))
	group_p_vals['p_value'].append(pvalue)
	p_values.append(pvalue)
group_p_vals['adjusted_p_value'].extend(multipletests(p_values, alpha=0.01, method='fdr_bh')[1])
pd.DataFrame(group_p_vals).head()

Unnamed: 0,group,n_CpGs,angus_methylation_mean,brahman_methylation_mean,angus_methylation_std_dev,brahman_methylation_std_dev,difference,p_value,adjusted_p_value
0,AxA,97229004,55.872572,55.817859,24.405071,24.432431,0.054713,3.5408160000000004e-54,1.416326e-53
1,AxB,97229004,52.909838,52.873993,23.446976,23.478456,0.035845,3.118312e-22,6.236625e-22
2,BxA,97229004,51.141253,51.109286,23.279071,23.308486,0.031966,2.4717150000000003e-17,3.2956200000000003e-17
3,BxB,97229004,53.942699,53.921943,24.545691,24.580179,0.020756,5.794265e-05,5.794265e-05


In [7]:
pd.DataFrame.from_dict(group_p_vals).to_csv('/Users/callummacphillamy/Library/CloudStorage/OneDrive-UniversityofAdelaide/PhD/methylation_chapter/GigaScience_submission/supplementary_tables/shared_CpGs_per_group.csv', index=False)

In [6]:
df = pd.DataFrame.from_dict(means)

colour_maps = ['dark','muted','bright','pastel']
for n, group in enumerate(['AxA', 'AxB','BxA','BxB']):
	plt.close()
	sns.set_style('whitegrid')

	mosaic_str = '''
	A
	B
	'''

	fig = plt.figure(constrained_layout=True)
	fig, ax = plt.subplot_mosaic(mosaic_str, 
							 gridspec_kw={'height_ratios': [3, 0.1]})

	sns.swarmplot(data=df[df['group'] == group],
			x='reference', y='methylation_mean',
			hue='sampleID', ax=ax['A'],
			palette=colour_maps[n])
	sns.swarmplot(data=df[df['group'] == group],
			x='reference', y='methylation_mean',
			hue='sampleID', ax=ax['B'],
			palette=colour_maps[n])
	fig.set_size_inches(2,4)
	ax['A'].set_ylim(45, 70)
	ax['B'].set_ylim(0,20)
	ax['A'].get_xaxis().set_visible(False)
	ax['A'].set_ylabel("Methylation %")
	ax['B'].set_ylabel("")
	ax['B'].set_xlabel("")
	ax['A'].legend(loc=(1.01,0.48), title='Sample ID')
	ax['A'].xaxis.tick_top()
	ax['B'].xaxis.tick_bottom()
	ax['B'].get_legend().remove()
	ax['A'].set_title(f"Group = {group}")
	ax['B'].set_yticks([])
	ax['B'].set_xlabel("Reference")
	plt.savefig(f'/Users/callummacphillamy/Library/CloudStorage/OneDrive-UniversityofAdelaide/PhD/methylation_chapter/GigaScience_submission/figures/{group}_SharedCpGs_methylation.png', dpi=400, bbox_inches='tight')
	plt.close()

<Figure size 640x480 with 0 Axes>

In [7]:
for group in ['AxA', 'AxB','BxA','BxB']:
	group_df = df[df['group'] == group]
	angus = group_df[group_df['reference'] == 'Angus']
	brahman = group_df[group_df['reference'] == 'Brahman']
	angus_meth = angus['methylation_mean'].values
	brahman_meth = brahman['methylation_mean'].values
	test, p = stats.wilcoxon(angus_meth, brahman_meth)
	print('Group:', group)
	print('Test Statistic:', test)
	print('P-value:', p)
	print('')

Group: AxA
Test Statistic: 0.0
P-value: 0.31731050786291415

Group: AxB
Test Statistic: 0.0
P-value: 0.31731050786291415

Group: BxA
Test Statistic: 0.0
P-value: 0.17971249487899976



  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)


ValueError: zero_method 'wilcox' and 'pratt' do not work if x - y is zero for all elements.