In [None]:
import os
import gzip
from Bio import SeqIO
import pandas as pd

## Create the Reference

Initial analysis

In [None]:
# NOTE(review): genome_sample is not used by any visible cell; kept in case a
# later cell relies on it.
genome_sample = []
# Load the sample listing, one entry per line. The context manager closes the
# handle even if the read raises, which a manual open()/close() pair does not.
with open("mer50000_sample50.txt", "r") as f:
    kmer_list = f.read().split("\n")
# Alternative parse for files that hold blank-line-separated groups of entries:
#kmer_list = [mer_list.split("\n") for mer_list in f.read().split("\n\n")]

In [None]:
import collections
# Tally how many times each distinct entry of kmer_list occurs.
counter=collections.Counter(kmer_list)

In [None]:
# Display the 20 most frequent entries together with their counts.
counter.most_common(20)

In [None]:
# Total number of entries read from the sample file (duplicates included).
len(kmer_list)

Here we examine matches of different lengths in the sample to find a reference that optimizes compression (see figure).

In [None]:
import collections
# Same tally as before, restricted to entries at least 7 characters long.
counter = collections.Counter(
    entry for entry in kmer_list if len(entry) >= 7
)
counter.most_common(30)

In [None]:
import collections
# Same tally as before, restricted to entries at least 8 characters long.
counter = collections.Counter(
    entry for entry in kmer_list if len(entry) >= 8
)
counter.most_common(30)

In [None]:
# Split the sample in two: entries longer than 50 characters are set aside in
# `bigs`; every other entry is counted in `occurence_set` (a dict of
# entry -> number of occurrences, despite the name).
bigs = []
occurence_set = {}
for x in kmer_list:
    if len(x) > 50:
        bigs.append(x)
    else:
        occurence_set[x] = occurence_set.get(x, 0) + 1

In [None]:
# Entries longer than 50 characters, which were kept out of occurence_set.
bigs

In [None]:
# Number of distinct entries that are at most 50 characters long.
len(occurence_set)

In [None]:
# Very rough adjustment: scale each count by len(k-mer) / 6, on the reasoning
# that a longer k-mer in the reference gives greater compression for fewer
# bits in the compressed genome.
adj_o_set = {
    kmer: count * (len(kmer)/6)
    for kmer, count in occurence_set.items()
}

In [None]:
import collections
# Counter takes the mapping as-is, so most_common ranks the k-mers by their
# length-adjusted score rather than by raw count.
counter=collections.Counter(adj_o_set)
counter.most_common(30)

In [None]:
def get_reference(f_dir):
    """Build a k-mer -> fixed-width binary code table from a reference file.

    Parameters
    ----------
    f_dir : str
        Path of a text file holding one k-mer per line. Empty lines are
        ignored.

    Returns
    -------
    dict
        Maps every k-mer to a string of "0"/"1" characters. All codes have the
        same width, the smallest number of bits able to index every k-mer
        (``(n - 1).bit_length()`` for ``n`` non-empty lines), and are handed
        out in file order, counting up from all zeros. A file without any
        k-mer yields ``{}``. If a k-mer is listed more than once, the code of
        its last occurrence wins.

    Raises
    ------
    OSError
        If the file cannot be opened or read.
    """
    import itertools

    # Context manager so the handle is released even if reading fails.
    with open(f_dir, 'r') as f:
        mers = [k for k in f.read().split("\n") if k != ""]
    if not mers:
        return {}
    # Round the width UP. `len(mers).bit_length() - 1` rounds down, which
    # leaves too few codes (IndexError) whenever the count is not a power of
    # two; both formulas agree when it is one.
    bit_length = (len(mers) - 1).bit_length()
    # zip() stops at the last k-mer, so unused code words are never generated.
    codes = itertools.product("01", repeat=bit_length)
    return {m: "".join(code) for m, code in zip(mers, codes)}

## Benchmarking Compression with Reference

Observing test results (high occurrence of 6-mers)

In [None]:
# 3-bit code for every symbol the compressors can emit. "_ref" is not a
# sequence character: compress_kmer writes its code in front of a
# reference-table entry.
char_dict = {
    "A": "000",
    "C": "001",
    "T": "010",
    "G": "011",
    "N": "100",
    "-": "101",
    "_ref": "111",
}

In [None]:
# Hand-written table: four 6-mers, each given a 2-bit code.
# Note that `reference` is reassigned from a file in a later cell.
reference = {
    "GCGGCG": "00",
    "CGCCGC": "01",
    "GCGCCG": "10",
    "GGCGGC": "11",
}

In [None]:
# Collects one pct_compression result per reference size, keyed by `num`.
# Re-running this cell discards everything collected so far.
benchmark_dict = {}

In [None]:
# k-mer length
k = 10
# Number of k-mers
num = 8
# Build the file name from k as well as num, so the table that gets loaded
# always has the k-mer length that compress_kmer slices by. A hard-coded
# "10mer" would keep loading 10-mers after k is changed.
reference = get_reference(f'references/{k}mer_{num}.txt')

In [None]:
def binarize(char):
    """Return the bit string that ``char_dict`` assigns to ``char``.

    ``char`` has to be a key of ``char_dict``; any other value raises
    ``KeyError``.
    """
    bits = char_dict[char]
    return bits

In [None]:
def compress_regular(genome):
    """Encode ``genome`` one symbol at a time, without the reference table.

    Parameters
    ----------
    genome : iterable of str
        Symbols to encode; each must be accepted by ``binarize``.

    Returns
    -------
    str
        The per-symbol codes from ``binarize`` concatenated in input order
        (an empty string for an empty genome).

    Raises
    ------
    KeyError
        Propagated from ``binarize`` for a symbol it has no code for.
    """
    # One join instead of repeated `+=`: growing an immutable str step by step
    # may re-copy the accumulated text each time, which is costly for
    # genome-sized inputs.
    return "".join(binarize(g) for g in genome)

In [None]:
def compress_kmer(genome):
    """Encode ``genome``, replacing k-mers found in ``reference`` by their code.

    The genome is scanned left to right. Whenever the next ``k`` symbols form
    a key of ``reference``, the "_ref" marker code followed by that key's
    reference code is emitted and the scan jumps ahead by ``k``. Otherwise
    the single symbol at the current position is encoded with ``binarize``
    and the scan advances by one.

    Reads the module-level ``k`` and ``reference`` at call time.

    Parameters
    ----------
    genome : str
        Sequence to encode.

    Returns
    -------
    str
        Concatenation of the emitted codes (an empty string for an empty
        genome).

    Raises
    ------
    KeyError
        Propagated from ``binarize`` for a symbol it has no code for.
    """
    # Collect the pieces and join once: repeated `+=` on an immutable str may
    # re-copy the accumulated text on every step.
    pieces = []
    i = 0
    while i < len(genome):
        # Slice once and reuse it for both the membership test and the lookup.
        window = genome[i:i+k]
        if window in reference:
            pieces.append(binarize("_ref"))
            pieces.append(reference[window])
            i += k
        else:
            pieces.append(binarize(genome[i]))
            i += 1
    return "".join(pieces)

In [None]:
def pct_compression(df):
    """Percentage by which the k-mer encoding is shorter than the plain one.

    Reads the 'compress_regular_new_len' and 'compress_kmer_new_len' columns
    of ``df`` and returns a one-column DataFrame named 'pct_compression'
    holding ``(regular - kmer) / regular * 100`` for every row.
    """
    regular_len = df['compress_regular_new_len']
    kmer_len = df['compress_kmer_new_len']
    saved_fraction = (regular_len - kmer_len) / regular_len
    return (saved_fraction * 100).to_frame('pct_compression')

In [None]:
from data_process import *

In [None]:
# benchmark_functions is not defined in this notebook; presumably it comes from
# the `data_process` star-import -- confirm there what it runs and returns.
# Later cells read 'compress_regular_new_len' and 'compress_kmer_new_len'
# columns from the result.
df = benchmark_functions([compress_regular, compress_kmer], dataset="10bacteria", sample=None)

In [None]:
# File the result under the current reference size so that runs made with
# different `num` values can be compared.
benchmark_dict[num] = pct_compression(df)

In [None]:
# Mean compression per reference size, drawn as a bar chart with the smallest
# table first (the rows are collected largest-first and reversed for plotting).
to_plot = []
for n in [64,32,16,8,4]:
    # Plot only the sizes benchmarked in this kernel. Indexing benchmark_dict
    # unconditionally raises KeyError unless the cells above were re-run by
    # hand for every one of the five sizes.
    if n not in benchmark_dict:
        continue
    to_plot.append((str(n), benchmark_dict[n]['pct_compression'].mean()))
pd.DataFrame(to_plot, columns=['no. references', 'pct compression'])[::-1].plot(kind='bar', x='no. references', y='pct compression', title="10-mer avg. reference compression, n = 10")

In [None]:
# Preview the first rows of every column whose name contains 'new_len'.
df.loc[:, [col for col in df.columns if 'new_len' in col]].head()