In [None]:
from scipy.stats import wilcoxon
import pandas as pd
import numpy as np
from tqdm import tqdm
from pyfaidx import Fasta
import re
from statsmodels.stats.multitest import multipletests

In [None]:
# This notebook handles one group per run: change `group` and re-run
# for each of the four groups.
group = 'BxB'

# Both BED files are headerless and tab-separated; the first three columns
# are forced to str / int / int (presumably chrom, start, end -- confirm
# against the upstream pipeline).
bed_dtypes = {0: str, 1: int, 2: int}

# CpG methylation mapped to the Angus reference.
angus_bed = f'{group}_cpg_methylation_mapped_to_angus.bed'
a = pd.read_csv(angus_bed, sep='\t', header=None, dtype=bed_dtypes)

# Window identifier of the form "<col0>:<col1>-<col2>", used later as the
# key that joins the Angus and Brahman records.
a['coord_id'] = (
	a[0].astype(str)
	+ ':' + a[1].astype(str)
	+ '-' + a[2].astype(str)
)

# CpG methylation mapped to the Brahman reference.
brahman_bed = f'{group}_cpg_methylation_mapped_to_brahman.bed'
b = pd.read_csv(brahman_bed, sep='\t', header=None, dtype=bed_dtypes)


In [None]:
# Sanity check: number of distinct window ids on each side. For the Brahman
# table the id is read from column 3 rather than a derived 'coord_id' column.
n_angus_ids = a['coord_id'].nunique()
print(n_angus_ids)
n_brahman_ids = b[3].nunique()
print(n_brahman_ids)

In [None]:
# Build `data`: window id (described upstream as the 1Mb window id) ->
# {'angus': [...], 'brahman': [...]}, where each list accumulates the
# methylation values found for that window in the corresponding table.

# Length of the NaN placeholder vector used when a methylation field cannot
# be parsed (presumably the number of samples per record -- confirm).
NAN_FILL_LEN = 6


def _parse_methylation(field):
	"""Turn a comma-separated field into a list of floats.

	If any token cannot be converted (ValueError), the whole field is
	replaced by NAN_FILL_LEN NaN values.
	"""
	try:
		return [float(tok) for tok in str(field).split(',')]
	except ValueError:
		return [np.nan] * NAN_FILL_LEN


data = {}

# Angus side: keyed by the derived 'coord_id'; methylation is read from the
# namedtuple field `_7` (positional field 7, counting the index as 0).
for rec in tqdm(a.itertuples(), total=len(a)):
	coord_id = rec.coord_id
	values = _parse_methylation(rec._7)
	if coord_id not in data:
		data[coord_id] = {'angus': [], 'brahman': []}
	data[coord_id]['angus'].extend(values)

# Brahman side: the window id is positional field 4 (column 3 of `b`) and
# methylation is read from field `_8`.
for rec in tqdm(b.itertuples(), total=len(b)):
	coord_id = rec[4]
	values = _parse_methylation(rec._8)
	if coord_id not in data:
		# Window never seen on the Angus side: report it and seed the Angus
		# vector with NaN placeholders.
		print(f'{coord_id} not in data')
		data[coord_id] = {'angus': [np.nan] * NAN_FILL_LEN, 'brahman': []}
	data[coord_id]['brahman'].extend(values)

In [None]:
# Per-window comparison of Angus vs Brahman methylation.
#
# `p_vals` collects windows that could be tested (mean of each side plus the
# Wilcoxon signed-rank p-value); `uncomparable` collects windows where either
# side contains at least one NaN (NaN-aware means only, no test).
p_vals = {'coord_id':[],
		  'mean_angus_methylation':[],
		  'mean_brahman_methylation':[],
		  'p_val':[]}
uncomparable = {'coord_id':[],
				'mean_angus_methylation':[],
				'mean_brahman_methylation':[]}

for k in tqdm(data.keys()):
	# Distinct local names: binding these to `a` / `b` would overwrite the
	# DataFrames loaded earlier and break re-running the cells that read them.
	angus_vals = np.asarray(data[k]['angus'], dtype=float)
	brahman_vals = np.asarray(data[k]['brahman'], dtype=float)

	# `np.nan in array` is an equality test and NaN != NaN, so it is always
	# False; np.isnan is required to actually detect missing values.
	if np.isnan(angus_vals).any() or np.isnan(brahman_vals).any():
		uncomparable['coord_id'].append(k)
		# nanmean yields NaN (with a RuntimeWarning) when a side is entirely NaN.
		uncomparable['mean_angus_methylation'].append(np.nanmean(angus_vals))
		uncomparable['mean_brahman_methylation'].append(np.nanmean(brahman_vals))
	else:
		p_vals['coord_id'].append(k)
		p_vals['mean_angus_methylation'].append(np.mean(angus_vals))
		p_vals['mean_brahman_methylation'].append(np.mean(brahman_vals))
		try:
			p_vals['p_val'].append(wilcoxon(angus_vals, brahman_vals).pvalue)
		except ValueError:
			# The test could not be computed; record "no evidence of a
			# difference".
			# NOTE(review): this presumably also covers windows whose two
			# vectors differ in length (e.g. a window present on only one
			# side) -- confirm that p = 1 is the intended treatment there.
			p_vals['p_val'].append(1.)

In [None]:
# Tabulate the per-window test results.
x = pd.DataFrame(p_vals)

# Keep only windows that have a p-value. The explicit copy makes `x_no_na`
# an independent frame, so adding a column below does not touch `x`.
x_no_na = x.dropna(subset=['p_val']).copy()

# Benjamini-Hochberg FDR adjustment; element [1] of the multipletests result
# is used as the adjusted p-values.
bh_result = multipletests(x_no_na['p_val'].values, method='fdr_bh')
x_no_na['padj'] = bh_result[1]

# Windows with adjusted p-value <= 0.05: report the table shape, then export.
significant = x_no_na[x_no_na['padj'] <= 0.05]
print(significant.shape)
significant.to_csv(f'{group}_ref_comparison_methylation_sig.csv', index=False)