In [None]:
import pandas as pd
import numpy as np
np.random.seed(12)
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import glob
import os
import itertools
import re
from pyfaidx import Fasta
from pybedtools import BedTool
from scipy import stats
from Bio import SeqIO
import gc
from scipy.stats import mannwhitneyu

In [None]:
# Module-level k-mer setting.
# NOTE(review): the helper functions defined below take their own `kmer` / `k`
# arguments, so confirm whether this global is still read anywhere.
kmer=3
def count_specific_kmer(seq_dict, kmer):
    """Count how often ``kmer`` occurs across every sequence in ``seq_dict``.

    Each sequence is scanned with a window of ``len(kmer)`` characters that
    advances one position at a time, so overlapping occurrences are all
    counted.

    Parameters
    ----------
    seq_dict : dict
        Mapping whose values are sliceable sequences; the keys are ignored.
    kmer : str
        The exact motif to look for.

    Returns
    -------
    int
        Number of windows, summed over all sequences, that equal ``kmer``.
    """
    width = len(kmer)
    total = 0
    for sequence in seq_dict.values():
        n_windows = len(sequence) - width + 1
        total += sum(
            1
            for pos in range(n_windows)
            if str(sequence[pos:pos + width]) == kmer
        )
    return total

def expected_specific_kmer(seq_dict, kmer):
    """Expected count of ``kmer`` from the base composition of the sequences.

    All sequences are concatenated, every character of ``kmer`` is counted in
    the concatenation, and the product of those per-character counts is
    divided by the concatenated length ``N``. For a dinucleotide this is
    ``count(b1) * count(b2) / N``.

    NOTE(review): the divisor is ``N`` regardless of ``len(kmer)``; confirm
    that this is the intended expectation for motifs longer than two bases.

    Parameters
    ----------
    seq_dict : dict
        Mapping whose values are ``str`` sequences; the keys are ignored.
    kmer : str
        Motif whose individual characters are counted.

    Returns
    -------
    numpy.float64
        Product of the per-character counts divided by the total length.
    """
    big_seq = ''.join(seq_dict.values())
    base_counts = [big_seq.count(base) for base in kmer]
    # np.prod replaces the deprecated alias np.product (removed in NumPy 2.0).
    # Accumulating in float64 stops the product of large counts from silently
    # wrapping around the int64 range.
    return np.prod(base_counts, dtype=np.float64) / len(big_seq)

def makeNonCpGWindows(not_snps, window, step=10):
    """Tile every interval of ``not_snps`` with fixed-width windows.

    Each window is ``(window * 2) + 2`` positions wide. Window starts begin at
    the interval start and advance by ``step``; only windows that end at or
    before the interval end are produced.

    Parameters
    ----------
    not_snps : pandas.DataFrame
        Intervals with ``chrom``, ``start`` and ``end`` columns (read through
        ``itertuples`` attribute access).
    window : int
        Flank size; the produced windows span ``(window * 2) + 2`` positions.
    step : int, optional
        Distance between consecutive window starts. Defaults to 10, the value
        that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        Columns ``chrom``, ``start`` and ``stop``, one row per window.
    """
    width = (window * 2) + 2
    chroms, starts, stops = [], [], []

    # itertuples() is a generator with no length, so pass the row count to
    # tqdm explicitly to get a percentage / ETA.
    for interval in tqdm(not_snps.itertuples(), total=len(not_snps)):
        # Exclusive upper bound on the window start: the last start that still
        # fits inside the interval is ``end - width``.
        start_limit = interval.end - (width - 1)
        for start in range(interval.start, start_limit, step):
            chroms.append(interval.chrom)
            starts.append(start)
            stops.append(start + width)

    return pd.DataFrame.from_dict({'chrom': chroms,
                                   'start': starts,
                                   'stop': stops})

def count_kmer(seq, k):
    """Count every overlapping ``k``-mer over the A/C/G/T alphabet in ``seq``.

    The sequence is upper-cased first, so lower-case bases count as the same
    nucleotide. A window is counted only if it is one of the ``4 ** k`` words
    over A, C, G and T. Windows that contain anything else are skipped rather
    than raising ``KeyError``; this covers the space used to separate
    concatenated sequences as well as ``N`` and other ambiguity codes.

    Parameters
    ----------
    seq : str
        Sequence to scan; converted with ``str()`` before scanning.
    k : int
        Word length.

    Returns
    -------
    dict
        Maps each of the ``4 ** k`` possible words to its count; words that
        never occur keep a count of 0.
    """
    k_count = {''.join(word): 0 for word in itertools.product('ACGT', repeat=k)}

    seq = str(seq).upper()
    for pos in range(len(seq) - k + 1):
        word = seq[pos:pos + k]
        # Membership in the table rejects separators and non-ACGT symbols.
        if word in k_count:
            k_count[word] += 1
    return k_count

# normalise
def normalise(kmer_dict, seq_len):
    """Divide every value of ``kmer_dict`` by a shared denominator, in place.

    The denominator is ``len(kmer_dict) * (seq_len - 2)``. The dictionary that
    was passed in is modified and the same object is returned.

    NOTE(review): ``seq_len - 2`` equals the number of 3-mer windows in one
    sequence of length ``seq_len``; confirm the denominator is what is wanted
    for other word lengths or when the counts come from many sequences.

    Parameters
    ----------
    kmer_dict : dict
        Maps k-mers to numeric counts.
    seq_len : int
        Length of a single sequence.

    Returns
    -------
    dict
        ``kmer_dict`` itself, with every value divided by the denominator.
    """
    scale = (seq_len - 2) * len(kmer_dict)
    for key in list(kmer_dict):
        kmer_dict[key] /= scale
    return kmer_dict

def getFasta(df, fasta):
    """Fetch the sequence of every interval in ``df`` from ``fasta``.

    Rows are read positionally: the first three columns are taken as the
    chromosome name, start and stop. An interval is skipped when

    * it does not lie completely inside the chromosome (negative start, stop
      beyond the chromosome length) or is empty. Slicing such coordinates
      would not return the requested window.
      NOTE(review): the exact pyfaidx behaviour for out-of-range slices
      should be confirmed against its documentation;
    * its sequence contains ``N`` in either case.

    Parameters
    ----------
    df : pandas.DataFrame
        Intervals; columns 1-3 are chromosome, start, stop.
    fasta : mapping
        Indexable by chromosome name (as ``str``). Each entry must support
        ``len()`` and slicing, and a slice must expose ``.seq`` and
        ``.fancy_name``.

    Returns
    -------
    dict
        Maps each retained slice's ``fancy_name`` to its ``seq``. Rows that
        share a ``fancy_name`` overwrite one another.
    """
    fasta_dict = {}
    for row in df.itertuples():
        # row[0] is the DataFrame index; the data columns start at row[1].
        chrom, start, stop = str(row[1]), int(row[2]), int(row[3])
        chrom_record = fasta[chrom]
        if start < 0 or stop > len(chrom_record) or start >= stop:
            continue
        record = chrom_record[start:stop]
        sequence = record.seq
        if 'N' in sequence.upper():
            continue
        fasta_dict[record.fancy_name] = sequence
    return fasta_dict

In [None]:

big_df_dict = {}

# Reference assemblies, opened through pyfaidx.
brahman = Fasta('/path/to/ref/Bos_indicus_hybrid.UOA_Brahman_1.ARS-UCD1.2.orientation.dna.toplevel.fa') ## Change to your path
angus = Fasta('/path/to/ref/Angus.ARS-UCD1.2.orientation.fa') ## Change to your path

# Column names applied to the SNP BED files when they are read.
names = ['chrom','start','stop','query_coords']

# Record name and sequence length for every record of each assembly.
brahman_size = {'chrom': [], 'len': []}
angus_size = {'chrom': [], 'len': []}
for genome, size_table in ((brahman, brahman_size), (angus, angus_size)):
    for record_name in genome.records:
        size_table['chrom'].append(record_name)
        size_table['len'].append(len(genome[record_name]))

angus_size_df = pd.DataFrame.from_dict(angus_size)
brahman_size_df = pd.DataFrame.from_dict(brahman_size)

In [None]:
window = 500 # We used 25, 50 and 500

# Chromosome-size files handed to `bedtools complement`.
brahman_chrom_sizes = '/Users/callummacphillamy/PhD/Reference_Genomes/common_UOA-Brahman-Angus/Brahman_oriented2_ARS/Bos_indicus_hybrid.UOA_Brahman_1.ARS-UCD1.2.orientation.dna.toplevel.chrom.sizes' ## Change to your path
angus_chrom_sizes = '/Users/callummacphillamy/PhD/Reference_Genomes/common_UOA-Brahman-Angus/Angus/Angus.ARS-UCD1.2.orientation.chrom.sizes' ## Change to your path

# CpG sites with a SNP change: Angus CpGs in Brahman coordinates (a2b) and
# Brahman CpGs in Angus coordinates (b2a).
a2b_snps = pd.read_csv('./genome/Angus_CpG/shared_CpGs/Angus.CpGs.chrALL.to.Brahman_coords.SNP_change.bed',
                       header=None,
                       index_col=None,
                       sep='\t',
                       names=names)
b2a_snps = pd.read_csv('./genome/Brahman_CpG/shared_CpGs/Brahman.CpGs.chrALL.to.Angus_coords.SNP_change.bed',
                       header=None,
                       index_col=None,
                       sep='\t',
                       names=names)

# Extend every site by `window` positions on both sides.
a2b_snps['start'] = a2b_snps['start'] - window
a2b_snps['stop'] = a2b_snps['stop'] + window
b2a_snps['start'] = b2a_snps['start'] - window
b2a_snps['stop'] = b2a_snps['stop'] + window

a2b_snps_bed = BedTool.from_dataframe(a2b_snps)
b2a_snps_bed = BedTool.from_dataframe(b2a_snps)

# Everything NOT covered by an extended SNP window. `L=True` maps to
# `bedtools complement -L` (see the bedtools documentation for its effect).
a2b_not_snps_bed = a2b_snps_bed.complement(g=brahman_chrom_sizes, L=True)
b2a_not_snps_bed = b2a_snps_bed.complement(g=angus_chrom_sizes, L=True)

a2b_not_snps = a2b_not_snps_bed.to_dataframe()
b2a_not_snps = b2a_not_snps_bed.to_dataframe()

del a2b_not_snps_bed
del b2a_not_snps_bed
del a2b_snps_bed
del b2a_snps_bed
gc.collect()

# Keep chromosomes 1-29 only. Compare as strings so the filter works whether
# the chromosome column was parsed as integers or as text.
autosomes = [str(chrom) for chrom in range(1, 30)]
a2b_not_snps = a2b_not_snps[a2b_not_snps.iloc[:,0].astype(str).isin(autosomes)]
b2a_not_snps = b2a_not_snps[b2a_not_snps.iloc[:,0].astype(str).isin(autosomes)]

print('Generating non CpG windows')
a2b_not_snps_windows = makeNonCpGWindows(a2b_not_snps, window=window)
b2a_not_snps_windows = makeNonCpGWindows(b2a_not_snps, window=window)


In [None]:
kmer_size = 3
N_PERMUTATIONS = 100         # number of independent resampling rounds
N_WINDOWS_PER_GROUP = 1000   # windows drawn without replacement, per group and round

# permutation_dict_1000[round] maps each of the four group labels to that
# round's normalised k-mer table.
permutation_dict_1000 = {}
for n in tqdm(range(N_PERMUTATIONS)):
    permutation_dict_1000[n] = {}

    # The four draws below consume the global NumPy random state in a fixed
    # order (a2b SNP, b2a SNP, a2b non-SNP, b2a non-SNP); keep that order so
    # a seeded run reproduces the same samples.
    b_cpg_idx = np.arange(a2b_snps.shape[0])
    b_cpg_idx_samp = np.random.choice(b_cpg_idx, size=N_WINDOWS_PER_GROUP, replace=False)
    a2b_snps_small = a2b_snps.iloc[b_cpg_idx_samp, :]

    a_cpg_idx = np.arange(b2a_snps.shape[0])
    a_cpg_idx_samp = np.random.choice(a_cpg_idx, size=N_WINDOWS_PER_GROUP, replace=False)
    b2a_snps_small = b2a_snps.iloc[a_cpg_idx_samp, :]

    # Random sample of the non-CpG windows, same size as the SNP samples.
    b_non_cpg_idx = np.arange(a2b_not_snps_windows.shape[0])
    a_non_cpg_idx = np.arange(b2a_not_snps_windows.shape[0])

    b_non_cpg_idx_samp = np.random.choice(b_non_cpg_idx, size=N_WINDOWS_PER_GROUP, replace=False)
    a_non_cpg_idx_samp = np.random.choice(a_non_cpg_idx, size=N_WINDOWS_PER_GROUP, replace=False)

    a2b_non_small = a2b_not_snps_windows.iloc[b_non_cpg_idx_samp, :]
    b2a_non_small = b2a_not_snps_windows.iloc[a_non_cpg_idx_samp, :]

    # a2b intervals are fetched from the Brahman assembly, b2a from Angus.
    a2b_fa = getFasta(a2b_snps_small, brahman)
    b2a_fa = getFasta(b2a_snps_small, angus)

    a2b_not_fa = getFasta(a2b_non_small, brahman)
    b2a_not_fa = getFasta(b2a_non_small, angus)

    # Join with a space so that no k-mer is counted across the boundary
    # between two sequences (count_kmer skips windows containing a space).
    a2b_big_seq = ' '.join(a2b_fa.values())
    b2a_big_seq = ' '.join(b2a_fa.values())
    a2b_not_big_seq = ' '.join(a2b_not_fa.values())
    b2a_not_big_seq = ' '.join(b2a_not_fa.values())

    # Sanity check on the first sequence of each group only.
    assert len(list(a2b_fa.values())[0]) == (window*2) + 2
    assert len(list(b2a_fa.values())[0]) == (window*2) + 2
    assert len(list(a2b_not_fa.values())[0]) == (window*2) + 2
    assert len(list(b2a_not_fa.values())[0]) == (window*2) + 2

    kmer_dict_a2b = count_kmer(a2b_big_seq, k=kmer_size)
    kmer_dict_b2a = count_kmer(b2a_big_seq, k=kmer_size)

    kmer_dict_a2b_not = count_kmer(a2b_not_big_seq, k=kmer_size)
    kmer_dict_b2a_not = count_kmer(b2a_not_big_seq, k=kmer_size)

    # normalise() rescales the count tables in place and returns them.
    norm_a2b_fa = normalise(kmer_dict_a2b, seq_len=len(list(a2b_fa.values())[0]))
    norm_b2a_fa = normalise(kmer_dict_b2a, seq_len=len(list(b2a_fa.values())[0]))

    norm_a2b_not_fa = normalise(kmer_dict_a2b_not, seq_len=len(list(a2b_not_fa.values())[0]))
    norm_b2a_not_fa = normalise(kmer_dict_b2a_not, seq_len=len(list(b2a_not_fa.values())[0]))

    permutation_dict_1000[n]['A2B_SNP_kmers'] = norm_a2b_fa
    permutation_dict_1000[n]['B2A_SNP_kmers'] = norm_b2a_fa
    permutation_dict_1000[n]['A2B_notSNP_kmers'] = norm_a2b_not_fa
    permutation_dict_1000[n]['B2A_notSNP_kmers'] = norm_b2a_not_fa