In [1]:
import numpy as np
import matplotlib.pyplot as plt
import random
import sys
import os
import scipy
import math
from sklearn.metrics import r2_score
from pathlib import Path
import subprocess
import pandas as pd

from Bio.SeqIO import read, parse
import orthoani

from multiprocess import Pool

max_pool = 16
lim_file = 200

sys.path.append('../')
sys.path.append('../../')


In [2]:
from db_sketching.kmer_set import KMerSet, FracMinHash, TruncatedKMerSet, MeanFracMinHash

In [3]:
# analysis_type = "Single-Species"
# analysis_data_dir = "Escherichia coli" 
# analysis_data_dir = "Yersinia pestis" 

# analysis_type = "Single-Genus"
# analysis_data_dir = "Yersinia" 
# analysis_data_dir = "Salmonella"

In [18]:
data_dir = "../../data_temp"
genome_files = []

for collection_type in os.listdir(data_dir):
    collection_type_path = os.path.join(data_dir,collection_type)
    for collection in os.listdir(collection_type_path):
        collection_path = os.path.join(collection_type_path,collection)
        for filename in os.listdir(collection_path):
            full_filename = os.path.join(collection_path,filename)
            genome_files.append(full_filename)


print(genome_files)
print(len(genome_files))

['../../data_temp/Single-Species/Erwinia amylovora/64.fna', '../../data_temp/Single-Species/Erwinia amylovora/32.fna', '../../data_temp/Single-Species/Erwinia amylovora/99.fna', '../../data_temp/Single-Species/Erwinia amylovora/43.fna', '../../data_temp/Single-Species/Erwinia amylovora/7.fna', '../../data_temp/Single-Species/Erwinia amylovora/98.fna', '../../data_temp/Single-Species/Erwinia amylovora/58.fna', '../../data_temp/Single-Species/Erwinia amylovora/90.fna', '../../data_temp/Single-Species/Erwinia amylovora/14.fna', '../../data_temp/Single-Species/Erwinia amylovora/41.fna', '../../data_temp/Single-Species/Erwinia amylovora/67.fna', '../../data_temp/Single-Species/Erwinia amylovora/73.fna', '../../data_temp/Single-Species/Erwinia amylovora/17.fna', '../../data_temp/Single-Species/Erwinia amylovora/56.fna', '../../data_temp/Single-Species/Erwinia amylovora/50.fna', '../../data_temp/Single-Species/Erwinia amylovora/2.fna', '../../data_temp/Single-Species/Erwinia amylovora/46.fna'

In [19]:
checked_genome_files = []
for file in genome_files:
    try:
        parsed_file = parse(file,"fasta")
        assert(len([record for record in parsed_file]) > 0)
        checked_genome_files.append(file)
    except:
        print(f"File {file} is damaged / invalid")


File ../../data_temp/Single-Species/Neisseria gonorrhoeae/485 is damaged / invalid
File ../../data_temp/Single-Species/Corynebacterium pseudotuberculosis/25.fna is damaged / invalid
File ../../data_temp/Single-Species/Klebsiella quasipneumoniae/48.fna is damaged / invalid
File ../../data_temp/Single-Species/Xylella fastidiosa/73.zip is damaged / invalid
File ../../data_temp/Single-Species/Xylella fastidiosa/73.fna is damaged / invalid
File ../../data_temp/Single-Species/Xylella fastidiosa/README.md is damaged / invalid
File ../../data_temp/Single-Species/Xylella fastidiosa/ncbi_dataset is damaged / invalid
File ../../data_temp/Single-Species/Yersinia pestis/632 is damaged / invalid
File ../../data_temp/Single-Species/Klebsiella variicola/203.fna is damaged / invalid
File ../../data_temp/Single-Species/Escherichia coli/623 is damaged / invalid
File ../../data_temp/Single-Species/Bordetella pertussis/520 is damaged / invalid
File ../../data_temp/Single-Genus/Escherichia/29.fna is damaged

In [20]:
print(checked_genome_files)
print(len(checked_genome_files))

['../../data_temp/Single-Species/Erwinia amylovora/64.fna', '../../data_temp/Single-Species/Erwinia amylovora/32.fna', '../../data_temp/Single-Species/Erwinia amylovora/99.fna', '../../data_temp/Single-Species/Erwinia amylovora/43.fna', '../../data_temp/Single-Species/Erwinia amylovora/7.fna', '../../data_temp/Single-Species/Erwinia amylovora/98.fna', '../../data_temp/Single-Species/Erwinia amylovora/58.fna', '../../data_temp/Single-Species/Erwinia amylovora/90.fna', '../../data_temp/Single-Species/Erwinia amylovora/14.fna', '../../data_temp/Single-Species/Erwinia amylovora/41.fna', '../../data_temp/Single-Species/Erwinia amylovora/67.fna', '../../data_temp/Single-Species/Erwinia amylovora/73.fna', '../../data_temp/Single-Species/Erwinia amylovora/17.fna', '../../data_temp/Single-Species/Erwinia amylovora/56.fna', '../../data_temp/Single-Species/Erwinia amylovora/50.fna', '../../data_temp/Single-Species/Erwinia amylovora/2.fna', '../../data_temp/Single-Species/Erwinia amylovora/46.fna'

In [21]:

def get_genome_length(genome_file):
    genome_kmer = KMerSet(20)
    genome_kmer.insert_file(genome_file)
    return genome_kmer.length

def compute_length_parallel(genome_files):
    with Pool(max_pool) as p:
        return p.starmap(get_genome_length,([(g_file,) for g_file in genome_files]))

    
checked_genome_lengths = compute_length_parallel(checked_genome_files)
print(checked_genome_lengths)


[3759962, 3829893, 3908088, 3808806, 3848776, 3877594, 3766970, 3763017, 3764235, 3761459, 3789097, 3860165, 3800710, 3832131, 3879641, 3762133, 3765325, 3766561, 3763163, 3904074, 3820617, 3954918, 3803641, 3895498, 3842113, 3852410, 3759916, 3760416, 3770616, 3829237, 3767466, 3764113, 3833832, 3808694, 3810400, 3805381, 3951821, 3831803, 3763841, 3763583, 3950144, 3881762, 3811100, 3832260, 3765206, 3765658, 3867640, 3832333, 3809288, 3846490, 3765966, 3763875, 3755139, 3805200, 3847053, 3806422, 3910597, 3796731, 3776817, 3765359, 3816202, 3795525, 3908030, 3760046, 3903046, 3815262, 3807206, 3873716, 3758622, 3903530, 3820850, 3792204, 3903890, 3830580, 3812516, 3833161, 3832333, 3875317, 3811308, 3905604, 3832332, 3795813, 3951347, 3873287, 3825260, 3777657, 3764644, 3830788, 3766431, 3819515, 3803867, 3898330, 3812388, 3811504, 3898094, 3841051, 3761272, 3835589, 3849188, 3763804, 2145098, 2151065, 2165438, 2153514, 2148713, 2151628, 2220847, 2143916, 2168997, 2137404, 2060575, 

In [38]:
def get_num_kmers(genome_file, kmer_length, canonical = False):
    genome_kmer = KMerSet(kmer_length, canonical = canonical)
    genome_kmer.insert_file(genome_file)
    return len(genome_kmer.set)

def compute_num_kmers_parallel(genome_files,kmer_length, canonical=False):
    with Pool(max_pool) as p:
        return p.starmap(get_num_kmers,([(g_file,kmer_length,canonical) for g_file in genome_files]))


In [39]:
kmer_counts = [0] * 40

In [43]:
canonical = True

In [45]:
for kmer_length in range(6,40):
    print(f"Processing kmers of length {kmer_length}")
    kmer_counts[kmer_length] = compute_num_kmers_parallel(checked_genome_files,kmer_length,canonical)
    fig = plt.figure(figsize=(5,5),dpi=600)
    ax = fig.add_subplot()
    ax.scatter(checked_genome_lengths,kmer_counts[kmer_length],marker="^",s=[2 for _ in checked_genome_lengths])
    ax.scatter(checked_genome_lengths, [min(4 ** kmer_length,genome_length) for genome_length in checked_genome_lengths],marker=".",s=[2 for _ in checked_genome_lengths])
    plt.plot()
    plt.savefig(f"../../plots/kmer_counting/{len(checked_genome_lengths)}genomes-{canonical=}-{kmer_length}mer-counts.png")
    plt.close()

Processing kmers of length 6
Processing kmers of length 7


KeyboardInterrupt: 

In [30]:
Path(f"../../plots/kmer_counting/").mkdir(parents=True, exist_ok=True)

for kmer_length in range(6,23+1):
    fig = plt.figure(figsize=(5,5),dpi=600)
    ax = fig.add_subplot()
    ax.scatter(checked_genome_lengths,kmer_counts[kmer_length],marker="^",s=[2 for _ in checked_genome_lengths])
    ax.scatter(checked_genome_lengths, [min(4 ** kmer_length,genome_length) for genome_length in checked_genome_lengths],marker=".",s=[2 for _ in checked_genome_lengths])
    plt.plot()
    plt.savefig(f"../../plots/kmer_counting/{len(checked_genome_lengths)}genomes-{canonical=}-{kmer_length}mer-counts.png")
    plt.close()

    



In [31]:
import pandas as pd

In [32]:
kmer_counts_dict = {}
kmer_counts_dict['filename'] = checked_genome_files
for i in range(6,39+1):
    kmer_counts_dict[f"{i}-mer counts"] = kmer_counts[i]

In [33]:
kmer_counts_df = pd.DataFrame(kmer_counts_dict)

In [37]:
kmer_counts_df.to_csv(f"../../data_temp/kmer_counts-{canonical=}.csv")