# Data Exploration

Studying the dataset to look for compression opportunities.

1. Look at sampling k-mers for frequency analysis
2. Because the BWT gets a ton of nice repeats, does this mean that we might get a set of most frequent k-mers that occur even more frequently after running BWT? Let's explore!

In [None]:
import os
import gzip
from Bio import SeqIO
import pandas as pd

In [None]:
# Dataset name; interpolated into the directory name "dataset_<dataset>".
dataset = "100bacteria"
# Folder under the user's home directory that holds the "datasets" directory.
FOLDER_NAME = "genome_compression_datasets"
# Maximum number of genome files to load; None means no cap.
sample = None

In [None]:
genomes = []
benchmark_results = []
# Load every gzipped FASTA file of the dataset into `genomes`, one
# concatenated sequence string per file.
from os.path import expanduser
home = expanduser("~")
dataset_fp = os.path.join(home, FOLDER_NAME, "datasets", f"dataset_{str(dataset)}")
break_counter = 0  # number of files loaded successfully so far
for fasta in os.listdir(dataset_fp):
    # Stop early once `sample` files have been loaded (None means no cap).
    if sample is not None and break_counter >= sample:
        break
    # Only gzipped FASTA files are read; this also skips .DS_Store and
    # other hidden files.
    if not fasta.endswith('fna.gz'):
        continue
    fasta_fp = os.path.join(dataset_fp, fasta)
    try:
        with gzip.open(fasta_fp, "rt") as handle:
            # A genome is sometimes stored across several records, so all
            # records of the file are joined. str.join avoids re-copying a
            # growing sequence on every record.
            total_genome = "".join(
                str(record.seq) for record in SeqIO.parse(handle, "fasta")
            )
    except Exception as err:
        # Best effort: report the file and the reason, then carry on with
        # the next one. Unlike a bare `except:`, this does not swallow
        # KeyboardInterrupt or SystemExit.
        print(f"Couldn't do {fasta}: {err!r}")
    else:
        # To explore fixed-size chunks instead of whole genomes, append
        # k-length slices of total_genome here instead.
        genomes.append(total_genome)
        break_counter += 1


In [None]:
# Convert every loaded genome to a plain Python string.
genomes = list(map(str, genomes))

In [None]:
# Keep a random subset of at most 52 genomes.
# The `random` module is imported as a whole on purpose: `from random import
# sample` would rebind the notebook-level `sample` setting (the load cap) to a
# function and break the loading cell if it is run again.
import random
# Clamp the subset size so a smaller dataset does not raise ValueError.
genomes = [str(g) for g in random.sample(genomes, min(52, len(genomes)))]

In [None]:
# Number of genomes currently held in `genomes`.
len(genomes)

In [None]:
"""
- # matches with & without Burrows-Wheeler
- # matches as number of k-mers increases proportional to number of k-mers
"""

In [None]:
from data_process import substring_finder

In [None]:
import itertools
# Number of unordered pairs that can be formed from the first 50 genomes.
sum(1 for _pair in itertools.combinations(range(len(genomes[:50])), 2))

In [None]:
from multiprocessing import Pool
import itertools
# Run substring_finder on every unordered pair of the first 50 genomes using
# 10 worker processes. Each task is a (genome_a, genome_b) tuple.
# The context manager shuts the workers down once the blocking map call has
# returned, so re-running the cell does not leak processes.
with Pool(10) as pool:
    out = pool.map(substring_finder, itertools.combinations(genomes[:50], 2))

In [None]:
# Dump the matches to disk: one match per line, with a blank line after each
# per-pair result list in `out`. The `with` block closes the file even if a
# write fails part-way through.
with open("mer50000_sample50.txt", "w+") as f:
    for o in out:
        f.writelines(f"{match}\n" for match in o)
        f.write("\n")

In [None]:
# Flatten the per-pair result lists in `out` into one list of matches.
agg = []
for pair_matches in out:
    agg.extend(pair_matches)

In [None]:
# Total number of matches, duplicates included.
len(agg)

In [None]:
# Number of distinct matches.
len(set(agg))

In [None]:
import collections
# Tally how often each match occurs, then show the 20 most frequent ones.
counter = collections.Counter()
counter.update(agg)
counter.most_common(20)

In [None]:
# Show the 20 most frequent entries of `counter` again.
counter.most_common(20)

In [None]:
import itertools
# Imported here so this cell runs on its own; the notebook otherwise only
# imports SequenceMatcher in a later cell.
from difflib import SequenceMatcher
# Collect the text of every matching block longer than one character between
# each unordered pair of genomes.
# NOTE(review): SequenceMatcher's default autojunk heuristic treats items that
# make up more than 1% of a sequence of 200+ items as junk, which covers every
# base of a long DNA string -- confirm the default is what is wanted here.
match_list = []
for first, second in itertools.combinations(genomes, 2):
    s = SequenceMatcher(None, first, second)
    for m in s.get_matching_blocks():
        if m.size > 1:
            # A block covers first[m.a:m.a + m.size]. Use this block's own
            # size, not the size of the first block plus one.
            match_list.append(first[m.a:m.a + m.size])

In [None]:
# Display the collected matching blocks.
match_list

Begin looking at BWT. Found that it isn't really practical to run BWT on whole genomes due to their length; it is probably much better suited to individual protein-coding sequences, or something else shorter than a genome.

Conclusion: While BWT may produce some great results, the resources it requires to run at scale exceed our current computational resources. We shall pursue other methods.

In [None]:
from burrowswheeler import transform

In [None]:
# Holds the output of transform() for each genome processed below.
transformed_genomes = []

In [None]:
# Apply the imported transform() to the first 10 genomes and keep the results.
transformed_genomes.extend(transform(str(genome)) for genome in genomes[:10])

In [None]:
import itertools
from difflib import SequenceMatcher
# Collect the text of every matching block longer than one character between
# each unordered pair of transformed genomes.
# NOTE(review): SequenceMatcher's default autojunk heuristic treats items that
# make up more than 1% of a sequence of 200+ items as junk -- confirm the
# default is what is wanted for these long, small-alphabet strings.
match_list_trans = []
for first, second in itertools.combinations(transformed_genomes, 2):
    s = SequenceMatcher(None, first, second)
    for m in s.get_matching_blocks():
        if m.size > 1:
            # A block covers first[m.a:m.a + m.size]. Use this block's own
            # size, not the size of the first block plus one.
            match_list_trans.append(first[m.a:m.a + m.size])

In [None]:
# Display the matching blocks found between the transformed genomes.
match_list_trans

## Resampling k-mers by size

Decided to resample <=64 best 10-mers to use as reference

Resampled again based on k defined in data_process

In [None]:
from data_process import substring_counter

In [None]:
# Trial run of substring_counter (from data_process) on the first genome only.
# NOTE(review): the aggregation cell further down treats each result as a
# mapping from k-mer to count -- confirm against data_process.
out_set = substring_counter(genomes[0])

In [None]:
from multiprocessing import Pool
import itertools
# Run substring_counter on every genome using 10 worker processes.
# The context manager shuts the workers down once the blocking map call has
# returned, so re-running the cell does not leak processes.
with Pool(10) as pool:
    out = pool.map(substring_counter, genomes)

In [None]:
# Merge the per-genome k-mer counts in `out` into one dict of totals.
master_set = {}
for genome_counts in out:
    for kmer in genome_counts:
        master_set[kmer] = master_set.get(kmer, 0) + genome_counts[kmer]

In [None]:
import collections
# master_set already maps k-mer -> total count, so the Counter adopts those
# counts directly. Keep the 8192 most frequent k-mers.
counter = collections.Counter()
counter.update(master_set)
most_common = counter.most_common(8192)

In [None]:
# Write the `num` most frequent k-mers, one per line, to one reference file
# per value of `num`.
for num in [64, 32, 16, 8, 4]:
    # The `with` block closes the file even if a write fails.
    with open(f'references/14mer_{str(num)}.txt', 'w+') as f:
        # Slicing tolerates fewer than `num` entries in most_common, where
        # indexing each position would raise IndexError.
        f.writelines(f"{kmer}\n" for kmer, _count in most_common[:num])

In [None]:
# Peek at the three most frequent (k-mer, count) entries.
most_common[:3]

In [None]:
# 10: ~20k