In [None]:
from scipy.stats import wilcoxon
import pandas as pd
import numpy as np
from tqdm import tqdm
from pyfaidx import Fasta
import re

## PAF format
| col | type | description |
| --- | ---- | ----------- |
| 1   | str  | query sequence name |
| 2   | int  | query sequence length |
| 3   | int  | query start coordinate (0-based, inclusive) |
| 4   | int  | query end coordinate (0-based, exclusive) |
| 5   | str  | relative strand of the alignment: "+" (same strand) or "-" (query reverse-complemented) |
| 6   | str  | target sequence name |
| 7   | int  | target sequence length |
| 8   | int  | target start coordinate on the original strand (0-based, inclusive) |
| 9   | int  | target end coordinate on the original strand (0-based, exclusive) |
| 10  | int  | number of residue matches |
| 11  | int  | alignment block length |
| 12  | int  | mapping quality (0-255), 255 = missing |


In [None]:
# Load the first 12 columns of the PAF file, tag every row with the chromosome
# taken from the window name (the text before the first ':' in qname), and
# reduce the table to one alignment per query window:
#   1. keep rows whose query chromosome equals the target sequence name,
#   2. keep rows with mapping quality exactly 60,
#   3. per qname, keep the row with the largest alignment block length.
paf = (
    pd.read_csv(
        'Angus_100kb_windows_to_Brahman_ref.paf',
        sep='\t',
        header=None,
        usecols=list(range(12)),
        names=['qname', 'qlen', 'qstart', 'qend', 'strand', 'tname',
               'tlen', 'tstart', 'tend', 'matches', 'blockLen', 'mapQ'],
    )
    .assign(qchrom=lambda df: df['qname'].str.split(':').str[0])
    .loc[lambda df: (df['qchrom'] == df['tname']) & (df['mapQ'] == 60)]
    .pipe(lambda df: df.loc[df.groupby('qname')['blockLen'].idxmax()])
)

In [None]:
# Display (n_rows, n_columns) of `paf` as the cell output.
paf.shape

In [None]:
# Collect the coordinates of every alignment in `paf` into two dict-of-lists
# tables (one list per output column):
#   * angus_windows                -- tname, tstart, tend of the alignment
#   * angus_windows_brahman_coords -- chrom, start, end parsed from qname, plus
#                                     'tname:tstart-tend' as a fourth column
#
# NOTE(review): judging by the PAF file name, the query is presumably the Angus
# window and the target the Brahman reference. If so, `angus_windows` holds
# target (Brahman) coordinates and the 'angus_coords' key holds a Brahman span,
# i.e. the names read as swapped. Values are left exactly as before; confirm
# the intended naming against whatever consumes these tables.
angus_windows = {'chrom': [],
                 'start': [],
                 'end': []}
angus_windows_brahman_coords = {'chrom': [],
                                'start': [],
                                'end': [],
                                'angus_coords': []}

# itertuples() is a generator, so pass total= explicitly to give tqdm a
# progress fraction.
for window in tqdm(paf.itertuples(), total=paf.shape[0]):

    # qname is expected to look like '<chrom>:<start>-<end>'. Split on ':' and
    # '-' only; the previous class '[:|-]' also split on a literal '|'.
    # Unpacking raises ValueError if the name does not yield exactly 3 fields.
    # start/end are kept as strings, as parsed.
    q_chrom, q_start, q_end = re.split(r'[:-]', window.qname)

    # Only record alignments whose target name equals the query chromosome.
    if q_chrom == window.tname:
        angus_windows_brahman_coords['chrom'].append(q_chrom)
        angus_windows_brahman_coords['start'].append(q_start)
        angus_windows_brahman_coords['end'].append(q_end)
        angus_windows_brahman_coords['angus_coords'].append(f'{window.tname}:{window.tstart}-{window.tend}')

        angus_windows['chrom'].append(window.tname)
        angus_windows['start'].append(window.tstart)
        angus_windows['end'].append(window.tend)

In [None]:
# Write each window table as a tab-separated file with no header row and no
# index column.
for window_table, bed_path in (
    (angus_windows, 'filtered_Angus_100kb_windows.bed'),
    (angus_windows_brahman_coords, 'filtered_Angus_100kb_windows_w_Brahman_coords.bed'),
):
    pd.DataFrame(window_table).to_csv(bed_path, sep='\t', index=False, header=False)

In [None]:
# Export the CpG methylation calls of each cross group as BED-like tables,
# once from the Angus-aligned calls and once from the Brahman-aligned calls.
#
# Each output file is tab-separated with no header and no index:
#   chrom, start, stop (= start + 1), comma-joined values of the group's columns
#
# Each parquet file is read once (not once per group), and only one full table
# is held in memory at a time.

# NOTE(review): absolute, machine-specific paths -- consider deriving these
# from a configurable data directory.
ANGUS_ALIGNED_METH_PARQUET = '/Users/callummacphillamy/PhD/methylation_chapter/clean_gigascience_revisions/methylation_patterns/aligned2Angus/All.samples.meth.10X.parquet'
BRAHMAN_ALIGNED_METH_PARQUET = '/Users/callummacphillamy/PhD/methylation_chapter/clean_gigascience_revisions/methylation_patterns/aligned2Brahman/All.samples.meth.10X.parquet'
METH_GROUPS = ['AxA', 'AxB', 'BxA', 'BxB']


def _methylation_calls_to_bed_columns(methylation_calls):
    """Convert a methylation-call table into BED-like columns.

    Parameters
    ----------
    methylation_calls : pandas.DataFrame
        Table whose index labels are strings with at least two ':'-separated
        fields; the first two are used as chrom and start. Every column is
        treated as one value to report for that row.

    Returns
    -------
    dict of list
        Keys 'chrom', 'start', 'stop', 'methylation', one entry per input row.
        'start' is kept as the text parsed from the index, 'stop' is
        int(start) + 1, and 'methylation' is the row's values converted with
        str() and joined with ','.

    Raises
    ------
    ValueError
        If an index label has fewer than two ':'-separated fields or its
        second field is not an integer.
    """
    bed_columns = {'chrom': [],
                   'start': [],
                   'stop': [],
                   'methylation': []}

    for row in tqdm(methylation_calls.itertuples(), total=methylation_calls.shape[0]):
        # row[0] is the index label; row[1:] are the column values.
        chrom, start = row[0].split(':')[0:2]
        bed_columns['chrom'].append(chrom)
        bed_columns['start'].append(start)
        bed_columns['stop'].append(int(start) + 1)
        bed_columns['methylation'].append(','.join([str(i) for i in row[1:]]))

    return bed_columns


# --- Angus-aligned calls ---------------------------------------------------
methylation_all = pd.read_parquet(ANGUS_ALIGNED_METH_PARQUET)

# Columns belonging to each group are those whose name contains the group
# label. The lists are taken from the Angus-aligned table and reused for the
# Brahman-aligned table below, so both outputs list the values in the same
# column order (this matches the previous behaviour, where the Brahman table
# was subset with the Angus table's column names).
group_columns = {group: [i for i in methylation_all.columns if group in i]
                 for group in METH_GROUPS}

for group in METH_GROUPS:
    methylation_calls_angus = methylation_all[group_columns[group]]
    angus_angus_coord_methto_bed = _methylation_calls_to_bed_columns(methylation_calls_angus)
    pd.DataFrame(angus_angus_coord_methto_bed).to_csv(f'{group}_angus_angus_coord_methto_bed_100kb.bed', sep='\t', index=False, header=False)

# Release the Angus-aligned table before loading the Brahman-aligned one.
del methylation_all

# --- Brahman-aligned calls -------------------------------------------------
methylation_all = pd.read_parquet(BRAHMAN_ALIGNED_METH_PARQUET)

# Fail early, with the offending names, if the Brahman-aligned table lacks any
# column selected from the Angus-aligned table.
missing_columns = [column
                   for columns in group_columns.values()
                   for column in columns
                   if column not in methylation_all.columns]
if missing_columns:
    raise KeyError(f'Brahman-aligned methylation calls are missing columns: {missing_columns}')

for group in METH_GROUPS:
    methylation_calls_brahman = methylation_all[group_columns[group]]
    angus_brahman_coord_methto_bed = _methylation_calls_to_bed_columns(methylation_calls_brahman)
    pd.DataFrame(angus_brahman_coord_methto_bed).to_csv(f'{group}_angus_brahman_coord_methto_bed_100kb.bed', sep='\t', index=False, header=False)

del methylation_all