In [1]:
import numpy as np
import matplotlib.pyplot as plt
import random
import sys
import os
import scipy
import math
from sklearn.metrics import r2_score
from pathlib import Path
import subprocess

from Bio.SeqIO import read, parse
import orthoani

from multiprocess import Pool
import pandas as pd

# Number of worker processes handed to every Pool(...) in this notebook.
max_pool = 16
# File-count limit; no code visible in this notebook reads it.
lim_file = 200

# Extend the import search path with the parent and grandparent directories
# (presumably so the project package `db_sketching` can be imported -- confirm).
sys.path.extend(['../', '../../'])


In [2]:
from db_sketching.kmer_set import KMerSet, FracMinHash, TruncatedKMerSet, MeanFracMinHash

In [3]:
def cond(kmer_hash):
    """Fixed sketching condition based on an affine remix of the k-mer hash.

    The hash is mapped to ``(976369 * kmer_hash + 1982627) % 10000`` and the
    k-mer is selected when that bucket is below 50.

    Args:
        kmer_hash: integer hash of a k-mer.

    Returns:
        bool: True when the k-mer is selected, False otherwise.
    """
    bucket = (976369 * kmer_hash + 1982627) % 10000
    return bool(bucket < 50)
    

def generate_polycond(degree, mod, c, seed_val):
    """Build a seeded random-polynomial sketching condition.

    A polynomial with ``degree + 1`` random coefficients in ``[0, mod)`` is
    drawn once; the returned predicate evaluates it (Horner's rule, modulo
    ``mod``) at ``kmer_hash % mod`` and selects the k-mer when the value is
    below ``mod / c``.

    Args:
        degree: degree of the random polynomial.
        mod: modulus for the coefficients and all arithmetic.
        c: selection divisor; the threshold is ``mod / c``.
        seed_val: seed for the coefficient draw. The same seed always yields
            the same polynomial.

    Returns:
        A function ``poly_cond(kmer_hash) -> bool``.
    """
    rng = np.random.default_rng(seed_val)
    # Draw from the seeded generator rather than the global np.random state,
    # otherwise seed_val has no effect and sketches are not reproducible.
    # Plain Python ints keep the modular arithmetic below exact for any `mod`.
    coeffs = [
        int(coeff)
        for coeff in rng.integers(low=0, high=mod, size=degree + 1)
    ]

    def poly_cond(kmer_hash):
        hash_mod = kmer_hash % mod
        end_val = 0
        # Horner evaluation from the highest-order coefficient down.
        for coeff in reversed(coeffs):
            end_val = (end_val * hash_mod + coeff) % mod

        return end_val < (mod / c)

    return poly_cond

def all_cond(kmer_hash):
    """Sketching condition that selects every k-mer; the hash is ignored."""
    return True

In [4]:
# Root directory that holds one sub-directory per analysis type.
data_home_dir = "../../data_temp"

# Each *_analysis_data_dirs dict maps a data directory name to the
# (min_kmer_length, max_kmer_length) pair swept for that directory.

species_analysis_type = "Single-Species"
species_analysis_data_dirs = dict.fromkeys(
    [
        "Escherichia coli",
        "Lactobacillus helveticus",
        "Staphylococcus hominis",
        "Mycoplasmoides pneumoniae",
        "Brucella melitensis",
        "Xanthomonas oryzae",
    ],
    (10, 100),
)

genus_analysis_type = "Single-Genus"
genus_analysis_data_dirs = dict.fromkeys(
    ["Pectobacterium", "Morganella", "Xylella"],
    (10, 35),
)

family_analysis_type = "Single-Family"
family_analysis_data_dirs = dict.fromkeys(
    ["Enterobacteriaceae", "Cyanobiaceae", "Rhizobiaceae"],
    (10, 35),
)

family_genus_analysis_type = "Single-Family-Multi-Genus"
family_genus_analysis_data_dirs = dict.fromkeys(
    ["31989", "49546", "186803", "186817"],
    (10, 35),
)

genus_species_analysis_type = "Single-Genus-Multi-Species"
genus_species_analysis_data_dirs = dict.fromkeys(
    ["Psychrobacter", "Aeromonas", "Rathayibacter"],
    (10, 100),
)

In [5]:
# Analysis groups that the loops in this notebook iterate over, keyed by
# analysis type; each value maps a data directory to its k-mer length pair.
analysis_types = {}
analysis_types[species_analysis_type] = species_analysis_data_dirs
analysis_types[genus_analysis_type] = genus_analysis_data_dirs
analysis_types[family_analysis_type] = family_analysis_data_dirs

In [6]:
# c values to sweep; each one is passed as `c` to generate_polycond, whose
# selection threshold is mod / c. Currently a single value.
c_val_range = (200,)

In [7]:
# Helper function to obtain genome files
def get_genome_files(data_home_dir, analysis_type, analysis_data_dir):
    """List every entry of one genome collection directory as a joined path.

    Args:
        data_home_dir: root data directory.
        analysis_type: sub-directory of the root naming the analysis type.
        analysis_data_dir: sub-directory of the analysis type to list.

    Returns:
        list[str]: one path per directory entry, in ``os.listdir`` order.
    """
    collection_path = os.path.join(data_home_dir, analysis_type, analysis_data_dir)
    return [
        os.path.join(collection_path, entry)
        for entry in os.listdir(collection_path)
    ]
    

In [8]:
def check_genome_files(genome_files):
    """Keep only the genome files that parse as FASTA with at least one record.

    Files that fail to parse, or that contain no records, are reported on
    stdout and left out of the result.

    Args:
        genome_files: iterable of paths to candidate FASTA files.

    Returns:
        list: the accepted paths, in input order.
    """
    checked_genome_files = []
    for file in genome_files:
        try:
            # Consume the whole iterator so a parse error anywhere in the
            # file surfaces here rather than later in the pipeline.
            record_count = sum(1 for _ in parse(file, "fasta"))
        except Exception:
            # Narrower than a bare `except:` so Ctrl-C / SystemExit still
            # interrupt a long validation run.
            record_count = 0

        # Explicit check instead of `assert`, which `python -O` strips.
        if record_count > 0:
            checked_genome_files.append(file)
        else:
            print(f"File {file} is damaged / invalid")

    return checked_genome_files


In [9]:
def get_genome_length(genome_file):
    """Return the ``length`` attribute of a ``KMerSet(3)`` filled from one file.

    Args:
        genome_file: path of the genome file to insert.

    Returns:
        Whatever ``KMerSet.length`` holds after ``insert_file``
        (presumably the genome length -- confirm against KMerSet).
    """
    length_probe = KMerSet(3)
    length_probe.insert_file(genome_file)
    return length_probe.length

def compute_length_parallel(genome_files):
    """Run ``get_genome_length`` on every file using a pool of ``max_pool`` workers.

    Args:
        genome_files: iterable of genome file paths.

    Returns:
        list: one result per input file, in input order.
    """
    single_arg_tuples = [(path,) for path in genome_files]
    with Pool(max_pool) as workers:
        lengths = workers.starmap(get_genome_length, single_arg_tuples)
    return lengths

In [10]:
def compute_ortho_ani(genome_file_1, genome_file_2):
    """Compute the OrthoANI value between two FASTA genome files.

    This is best-effort: any failure while parsing or running OrthoANI is
    reported on stderr and mapped to the sentinel value 0.

    Args:
        genome_file_1: path of the first FASTA file.
        genome_file_2: path of the second FASTA file.

    Returns:
        The value returned by ``orthoani.orthoani``, or 0 on failure.
    """
    try:
        genome_1_read = parse(genome_file_1,"fasta")
        genome_2_read = parse(genome_file_2,"fasta")
        return orthoani.orthoani(genome_1_read,genome_2_read)
    except Exception as exc:
        # `Exception` rather than a bare `except:` so KeyboardInterrupt and
        # SystemExit still propagate. The cause is logged instead of being
        # discarded, so a 0 in the results can be traced to its failure.
        print(
            f"OrthoANI failed for {genome_file_1} vs {genome_file_2}: {exc!r}",
            file=sys.stderr,
        )
        return 0

def compute_ortho_ani_parallel(genome_files_1,genome_files_2):
    """Compute OrthoANI for position-wise pairs of two file sequences in parallel.

    The sequences are paired with ``zip``, so pairing stops at the shorter one.

    Args:
        genome_files_1: first genome file of each pair.
        genome_files_2: second genome file of each pair.

    Returns:
        list: one ``compute_ortho_ani`` result per pair, in pair order.
    """
    file_pairs = list(zip(genome_files_1, genome_files_2))
    with Pool(max_pool) as workers:
        ani_values = workers.starmap(compute_ortho_ani, file_pairs)
    return ani_values

def compute_pairwise_ortho(genome_files):
    """Compute OrthoANI for every ordered pair of the given genome files.

    Pairs are produced in row-major order (``g1`` outer, ``g2`` inner) and
    include each file paired with itself.

    Args:
        genome_files: iterable of genome file paths.

    Returns:
        list: ``len(genome_files) ** 2`` values, or ``[]`` for empty input.
    """
    # Materialise once so a one-shot iterator can be traversed twice.
    files = list(genome_files)
    if not files:
        # Unpacking `zip(*[])` into two names raises ValueError; an empty
        # collection simply has no pairs.
        return []

    genome_files_1 = [g1 for g1 in files for _ in files]
    genome_files_2 = [g2 for _ in files for g2 in files]
    return compute_ortho_ani_parallel(genome_files_1,genome_files_2)

In [11]:
def compute_kmer_sketches(genome_file, kmer_class, sketching_condition, kmer_length, canonical):
    """Build one k-mer sketch object and fill it from a genome file.

    Args:
        genome_file: path passed to the sketch's ``insert_file``.
        kmer_class: sketch class, called as
            ``kmer_class(sketching_condition, kmer_length, canonical)``.
        sketching_condition: first constructor argument of ``kmer_class``.
        kmer_length: second constructor argument of ``kmer_class``.
        canonical: third constructor argument of ``kmer_class``.

    Returns:
        The sketch instance after ``insert_file(genome_file)``.
    """
    sketch = kmer_class(sketching_condition, kmer_length, canonical)
    sketch.insert_file(genome_file)
    return sketch

def compute_kmer_sketches_parallel(genome_files, kmer_length, kmer_class, sketching_cond, canonical):
    """Sketch every genome file with the same settings using a worker pool.

    Args:
        genome_files: iterable of genome file paths.
        kmer_length: k-mer length forwarded to ``compute_kmer_sketches``.
        kmer_class: sketch class forwarded to ``compute_kmer_sketches``.
        sketching_cond: sketching condition forwarded to ``compute_kmer_sketches``.
        canonical: canonical flag forwarded to ``compute_kmer_sketches``.

    Returns:
        list: one sketch per input file, in input order.
    """
    # Argument order here follows compute_kmer_sketches, not this signature.
    jobs = []
    for genome_file in genome_files:
        jobs.append((genome_file, kmer_class, sketching_cond, kmer_length, canonical))

    with Pool(max_pool) as workers:
        sketches = workers.starmap(compute_kmer_sketches, jobs)
    return sketches


def compute_kmer_ani(genome_1_kmer, genome_2_kmer):
    """Return ``genome_1_kmer.ANI_estimation(genome_2_kmer)``.

    Args:
        genome_1_kmer: sketch whose ``ANI_estimation`` method is called.
        genome_2_kmer: sketch passed as that method's argument.

    Returns:
        The value produced by ``ANI_estimation``.
    """
    return genome_1_kmer.ANI_estimation(genome_2_kmer)


def compute_kmer_ani_parallel(kmer_sketches_1,kmer_sketches_2):
    """Estimate ANI for position-wise pairs of two sketch sequences in parallel.

    The sequences are paired with ``zip``, so pairing stops at the shorter one.

    Args:
        kmer_sketches_1: first sketch of each pair.
        kmer_sketches_2: second sketch of each pair.

    Returns:
        list: one ``compute_kmer_ani`` result per pair, in pair order.
    """
    sketch_pairs = list(zip(kmer_sketches_1, kmer_sketches_2))
    with Pool(max_pool) as workers:
        ani_estimates = workers.starmap(compute_kmer_ani, sketch_pairs)
    return ani_estimates

In [12]:
# Load the OrthoANI table saved by an earlier run; the first CSV column is
# used as the index. Later cells reuse its rows and rebind this name.
ortho_ani_filename = "../../data_temp/ortho_ani_values.csv"
ortho_ani_dataframe = pd.read_csv(ortho_ani_filename,index_col=0)
print(ortho_ani_dataframe)

                                         genome_file_1  \
0       ../../data_temp/Single-Genus/Salmonella/64.fna   
1       ../../data_temp/Single-Genus/Salmonella/32.fna   
2       ../../data_temp/Single-Genus/Salmonella/99.fna   
3       ../../data_temp/Single-Genus/Salmonella/43.fna   
4        ../../data_temp/Single-Genus/Salmonella/7.fna   
..                                                 ...   
494  ../../data_temp/Single-Family/Rhizobiaceae/441...   
495  ../../data_temp/Single-Family/Rhizobiaceae/292...   
496  ../../data_temp/Single-Family/Rhizobiaceae/83.fna   
497  ../../data_temp/Single-Family/Rhizobiaceae/129...   
498  ../../data_temp/Single-Family/Rhizobiaceae/291...   

                                         genome_file_2  orthoani_val  \
0       ../../data_temp/Single-Genus/Salmonella/32.fna      0.985312   
1       ../../data_temp/Single-Genus/Salmonella/99.fna      0.984921   
2       ../../data_temp/Single-Genus/Salmonella/43.fna      0.983598   
3        ../../

In [13]:
# Load the k-mer sketch ANI table saved by an earlier run; the first CSV
# column is used as the index. Later cells reuse its rows and rebind this name.
kmer_sketch_ani_filename = "../../data_temp/kmer_sketch_ani_values.csv"
kmer_sketch_dataframe = pd.read_csv(kmer_sketch_ani_filename,index_col=0)
print(kmer_sketch_dataframe)

                                         genome_file_1  \
0    ../../data_temp/Single-Species/Escherichia col...   
1    ../../data_temp/Single-Species/Escherichia col...   
2    ../../data_temp/Single-Species/Escherichia col...   
3    ../../data_temp/Single-Species/Escherichia col...   
4    ../../data_temp/Single-Species/Escherichia col...   
..                                                 ...   
494  ../../data_temp/Single-Family/Rhizobiaceae/441...   
495  ../../data_temp/Single-Family/Rhizobiaceae/292...   
496  ../../data_temp/Single-Family/Rhizobiaceae/83.fna   
497  ../../data_temp/Single-Family/Rhizobiaceae/129...   
498  ../../data_temp/Single-Family/Rhizobiaceae/291...   

                                         genome_file_2  kmer_length  c_val  \
0    ../../data_temp/Single-Species/Escherichia col...           10     20   
1    ../../data_temp/Single-Species/Escherichia col...           10     20   
2    ../../data_temp/Single-Species/Escherichia col...           10  

In [14]:
# Load the correlation table saved by an earlier run; the first CSV column
# is used as the index.
# NOTE: `corrleation_filename` is misspelled, but the cell that writes the CSVs
# back uses the same name, so it must be renamed in both places together.
corrleation_filename = "../../data_temp/correlation_values.csv"
correlation_dataframe = pd.read_csv(corrleation_filename,index_col=0)
print(correlation_dataframe)

    kmer_length  c_val  canonical  pearson_coeff  pearson_coeff_pval  \
0            10     20       True       0.907790        2.218581e-38   
0            10     20      False       0.421223        1.413929e-05   
0            10     50       True       0.911519        3.280371e-39   
0            10     50      False       0.425088        1.155857e-05   
0            10    100       True       0.916422        2.327374e-40   
..          ...    ...        ...            ...                 ...   
0            14    200       True       0.990251        0.000000e+00   
0            15    200       True       0.994699        0.000000e+00   
0            16    200       True       0.996540        0.000000e+00   
0            17    200       True       0.997151        0.000000e+00   
0            18    200       True       0.997380        0.000000e+00   

    spearman_coeff  spearman_coeff_pval   analysis_type analysis_data_dir  \
0         0.866259         5.424778e-31  Single-Species  E

In [15]:
# Sketch class used for every sketch built below; its __name__ also appears in
# plot titles and output filenames.
kmer_class = FracMinHash

In [16]:
# Main computation cell. For every (analysis type, data directory) it obtains
# OrthoANI values for consecutive genome-file pairs, then for every k-mer
# length and c value obtains sketch-based ANI estimates and their correlation
# with OrthoANI. Rows already present in the loaded tables are reused; the
# three global dataframes are rebound on each pass.

# Counts processed (kmer_length, c_val) combinations; used in the progress
# message and as the seed passed to generate_polycond.
iteration = 0

for analysis_type in analysis_types:
    for analysis_data_dir in analysis_types[analysis_type]:
        genome_files = get_genome_files(data_home_dir,analysis_type,analysis_data_dir)
        checked_genome_files = check_genome_files(genome_files)

        print(f"{analysis_type=}, {analysis_data_dir=}")
        print(f"Using genome files {checked_genome_files}")
        
        # The file list shifted by one: zipping it with the full list pairs
        # each file with its successor, giving len - 1 consecutive pairs.
        checked_genome_files_off1 = checked_genome_files[1:]

        # Cached rows are identified only by genome_file_1 being one of the
        # validated files of this directory.
        ortho_ani_condition = (
            ortho_ani_dataframe["genome_file_1"].isin(checked_genome_files)
        )

        filtered_ortho_ani_dataframe = ortho_ani_dataframe[ortho_ani_condition]
        complement_ortho_ani_dataframe = ortho_ani_dataframe[~ortho_ani_condition]

        if len(filtered_ortho_ani_dataframe) == 0:
            print("Computing OrthoANI values ...")
            ortho_ani_vals = compute_ortho_ani_parallel(checked_genome_files,checked_genome_files_off1)
            print("Computed OrthoANI values")
        else:
            print("Using previously computed OrthoANI values")
            # NOTE(review): cached values are re-attached to the file pairs
            # below by position, which assumes the cache holds exactly one row
            # per consecutive pair in the current listing order -- confirm.
            ortho_ani_vals = filtered_ortho_ani_dataframe["orthoani_val"]

        computed_ortho_ani_dataframe = pd.DataFrame(
            {
                "analysis_type": analysis_type,
                "analysis_data_dir": analysis_data_dir,
                "genome_file_1": checked_genome_files[:-1],
                "genome_file_2": checked_genome_files_off1,
                "orthoani_val": ortho_ani_vals
            }
        )
        # Replace this directory's rows in the global table with the rebuilt ones.
        ortho_ani_dataframe = pd.concat([complement_ortho_ani_dataframe,computed_ortho_ani_dataframe])

        min_kmer_length, max_kmer_length = analysis_types[analysis_type][analysis_data_dir]
        
        # The k-mer length sweep includes both endpoints.
        for kmer_length in range(min_kmer_length,max_kmer_length+1):
            for c_val in c_val_range:
                # Only canonical sketches are computed in this notebook.
                canon = True

                iteration += 1

                print(f"ITERATION {iteration} : {kmer_length=}, {c_val=}, {canon=}")

                checked_genome_files_off1 = checked_genome_files[1:]

                # Cached sketch-ANI rows for this directory and parameter combination.
                kmer_sketch_ani_condition = (
                    (kmer_sketch_dataframe["genome_file_1"].isin(checked_genome_files)) &
                    (kmer_sketch_dataframe["kmer_length"] == kmer_length) &
                    (kmer_sketch_dataframe["c_val"] == c_val) &
                    (kmer_sketch_dataframe["canonical"] == canon)
                )

                filtered_kmer_sketch_ani_dataframe = kmer_sketch_dataframe[kmer_sketch_ani_condition]
                complement_kmer_sketch_ani_dataframe = kmer_sketch_dataframe[~kmer_sketch_ani_condition]

                if len(filtered_kmer_sketch_ani_dataframe) == 0:
                    # Degree-3 polynomial condition modulo 1000000007, seeded
                    # with the running iteration counter.
                    polycond = generate_polycond(3,1000000007,c_val,iteration)
                    print("Computing sketches ...")
                    kmer_sketches = compute_kmer_sketches_parallel(
                        checked_genome_files,
                        kmer_length = kmer_length,
                        kmer_class = kmer_class,
                        sketching_cond = polycond,
                        canonical = canon
                    )
                    kmer_sketches_off1 = kmer_sketches[1:]
                    print("Sketching complete, computing ani ...")
                    kmer_ani_vals = compute_kmer_ani_parallel(kmer_sketches,kmer_sketches_off1)
                    print("ANI computation complete")
                else:
                    print("Using previously computed ANI values")
                    # NOTE(review): same positional-reuse assumption as for the
                    # cached OrthoANI values above -- confirm.
                    kmer_ani_vals = filtered_kmer_sketch_ani_dataframe["kmer_ani_val"]


                num_entries = len(kmer_ani_vals)
                
                computed_kmer_sketch_ani_dataframe = pd.DataFrame(
                    {
                        "analysis_type": analysis_type,
                        "analysis_data_dir": analysis_data_dir,
                        "genome_file_1": checked_genome_files[:-1],
                        "genome_file_2": checked_genome_files_off1,
                        "kmer_length": [kmer_length for _ in range(num_entries)],
                        "c_val": [c_val for _ in range(num_entries)],
                        "canonical": [canon for _ in range(num_entries)],
                        "kmer_ani_val": kmer_ani_vals
                    }
                )


                # Replace the rows of this parameter combination in the global table.
                kmer_sketch_dataframe = pd.concat(
                    [complement_kmer_sketch_ani_dataframe,computed_kmer_sketch_ani_dataframe]
                )

                correlation_condition = (
                    (correlation_dataframe["analysis_type"] == analysis_type) &
                    (correlation_dataframe["analysis_data_dir"] == analysis_data_dir) &
                    (correlation_dataframe["kmer_length"] == kmer_length) &
                    (correlation_dataframe["c_val"] == c_val) & 
                    (correlation_dataframe["canonical"] == canon)
                )

                filtered_correlation_dataframe = correlation_dataframe[~correlation_condition] # remove to recompute

                print("Computing correlations...")


                # filtered_ortho_ani_dataframe = ortho_ani_dataframe[ortho_ani_condition]
                # filtered_kmer_sketch_ani_dataframe = kmer_sketch_dataframe[kmer_sketch_ani_condition]

                # combined_ani_dataframe = filtered_ortho_ani_dataframe.merge(filtered_kmer_sketch_ani_dataframe)


                # ortho_ani_vals = combined_ani_dataframe["orthoani_val"]
                # kmer_ani_vals = combined_ani_dataframe["kmer_ani_val"]

                
                # Pair the two value sequences by position and keep only pairs
                # where both are > 0 (compute_ortho_ani returns 0 on failure).
                # NOTE(review): if no pair survives, the unpacking below raises
                # ValueError.
                filter_o, filter_k = zip(*[(o,k) for o,k in zip(ortho_ani_vals,kmer_ani_vals) if (k > 0 and o > 0)])

                dropped_zeros = len(ortho_ani_vals) - len(filter_o)
                pearson_coeff = scipy.stats.pearsonr(filter_o,filter_k)
                spearman_coeff = scipy.stats.spearmanr(filter_o,filter_k)

                # Degree-1 least-squares fit predicting the k-mer estimate from OrthoANI.
                lin_fit = np.poly1d(np.polyfit(filter_o, filter_k, 1))
                r2_val_raw = r2_score(filter_o,filter_k)
                # NOTE(review): lin_fit(o) predicts the k-mer estimate, yet it is
                # scored against the OrthoANI values here; confirm whether
                # r2_score(filter_k, lin_fit(filter_o)) was intended.
                r2_val_lin = r2_score(filter_o,[lin_fit(o) for o in filter_o])



                # Append the single recomputed row for this combination.
                correlation_dataframe = pd.concat([filtered_correlation_dataframe,
                    pd.DataFrame({
                        "analysis_type": analysis_type,
                        "analysis_data_dir": analysis_data_dir,
                        "kmer_length": [kmer_length],
                        "c_val": [c_val],
                        "canonical": [canon],
                        "pearson_coeff" : [pearson_coeff.statistic],
                        "pearson_coeff_pval" : [pearson_coeff.pvalue],
                        "spearman_coeff" : [spearman_coeff.statistic],
                        "spearman_coeff_pval" : [spearman_coeff.pvalue],
                        "r2_val_raw": [r2_val_raw],
                        "r2_val_lin": [r2_val_lin],
                        "dropped_zeros": dropped_zeros
                    })],
                )

analysis_type='Single-Species', analysis_data_dir='Escherichia coli'
Using genome files ['../../data_temp/Single-Species/Escherichia coli/64.fna', '../../data_temp/Single-Species/Escherichia coli/32.fna', '../../data_temp/Single-Species/Escherichia coli/99.fna', '../../data_temp/Single-Species/Escherichia coli/43.fna', '../../data_temp/Single-Species/Escherichia coli/7.fna', '../../data_temp/Single-Species/Escherichia coli/98.fna', '../../data_temp/Single-Species/Escherichia coli/58.fna', '../../data_temp/Single-Species/Escherichia coli/90.fna', '../../data_temp/Single-Species/Escherichia coli/14.fna', '../../data_temp/Single-Species/Escherichia coli/41.fna', '../../data_temp/Single-Species/Escherichia coli/67.fna', '../../data_temp/Single-Species/Escherichia coli/73.fna', '../../data_temp/Single-Species/Escherichia coli/17.fna', '../../data_temp/Single-Species/Escherichia coli/56.fna', '../../data_temp/Single-Species/Escherichia coli/50.fna', '../../data_temp/Single-Species/Escherichi

In [None]:
# Inspect the OrthoANI table after the computation cell has rebuilt it.
print(ortho_ani_dataframe)

In [None]:
# Inspect the k-mer sketch ANI table after the computation cell has rebuilt it.
print(kmer_sketch_dataframe)

In [None]:
# Inspect the correlation table after the computation cell has rebuilt it.
print(correlation_dataframe)

In [None]:
# Write the three tables back to the same CSV files they were loaded from,
# overwriting the previous contents, so later runs can reuse the results.
ortho_ani_dataframe.to_csv(ortho_ani_filename)
kmer_sketch_dataframe.to_csv(kmer_sketch_ani_filename)
correlation_dataframe.to_csv(corrleation_filename)

In [None]:
def box_plot_OrthoANI(
    analysis_type, 
    ortho_ani_dataframe
):
    """Save a box plot of OrthoANI values, one box per data directory.

    The data directories are taken from the module-level ``analysis_types``
    mapping for the given analysis type.

    Args:
        analysis_type: analysis type whose rows are plotted.
        ortho_ani_dataframe: table with ``analysis_type``,
            ``analysis_data_dir`` and ``orthoani_val`` columns.

    Returns:
        None. The figure is written under ``../../plots/correlations/``.
    """
    plt.figure(figsize=(20,10),dpi=300)
    plt.title(f"Box Plot of OrthoANI values for {analysis_type}")

    rows_for_type = ortho_ani_dataframe[ortho_ani_dataframe["analysis_type"] == analysis_type]

    # Collect one series of values and one tick label per data directory.
    box_values = []
    box_labels = []
    for dir_name in analysis_types[analysis_type]:
        dir_rows = rows_for_type[rows_for_type["analysis_data_dir"] == dir_name]
        box_values.append(dir_rows["orthoani_val"])
        box_labels.append(dir_name)

    plt.boxplot(box_values, labels=box_labels)
    plt.ylabel("OrthoAni Value")

    plt.savefig(f"../../plots/correlations/{analysis_type}-OrthoANI-boxplot.png")
    plt.close()
    

In [None]:
def plot_correlation(
        analysis_type, 
        correlation_dataframe,
        c_val_range,
        correlation_string,
    ):
    """Plot one correlation metric against k-mer length, one line per data directory.

    Data directories and their k-mer length ranges come from the module-level
    ``analysis_types`` mapping. Only rows with ``canonical == True`` are used.
    Each line is drawn from the rows actually present in the table, sorted by
    k-mer length, so a missing or unordered row cannot shift points onto the
    wrong x value or cause a length-mismatch error.

    Args:
        analysis_type: analysis type to plot.
        correlation_dataframe: table with ``analysis_type``,
            ``analysis_data_dir``, ``kmer_length``, ``c_val``, ``canonical``
            and the metric column.
        c_val_range: iterable of c values; one line per value and directory.
        correlation_string: name of the metric column, also the y-axis label.

    Returns:
        None. The figure is written under ``../../plots/correlations/``.
    """
    # Bound before the loop so the output filename is defined even when the
    # analysis type has no data directories.
    canonical = True

    plt.figure(figsize=(10,10),dpi=300)
    plt.xlabel("K-mer Length")
    plt.ylabel(correlation_string)
    plt.title(f"{correlation_string} of {kmer_class.__name__} ANI Estimation against K-mer Length on {analysis_type}") 
    for analysis_data_dir, (min_kmer_length, max_kmer_length) in analysis_types[analysis_type].items():
        kmer_lengths = list(range(min_kmer_length, max_kmer_length + 1))

        rows_for_dir = correlation_dataframe[
            (correlation_dataframe["analysis_type"] == analysis_type)
            & (correlation_dataframe["analysis_data_dir"] == analysis_data_dir)
            & (correlation_dataframe["canonical"] == canonical)
            & (correlation_dataframe["kmer_length"].isin(kmer_lengths))
        ]

        for c_val in c_val_range:
            # Take x and y from the same sorted rows so they stay aligned.
            rows_for_c = rows_for_dir[rows_for_dir["c_val"] == c_val].sort_values("kmer_length")
            plt.plot(
                rows_for_c["kmer_length"],
                rows_for_c[correlation_string],
                label=f"{analysis_data_dir}",
            )

    plt.legend()
    plt.savefig(f"../../plots/correlations/{analysis_type}-{kmer_class.__name__}-{canonical=}-{correlation_string}.png")
    plt.close()


In [None]:
# For every analysis type, save one line plot per correlation metric and one
# OrthoANI box plot. The loop variable `analysis_type` is a notebook global and
# stays bound to the last analysis type after this cell finishes.
for analysis_type in analysis_types:
    # Each string names a column of correlation_dataframe.
    for correlation_string in ["pearson_coeff","spearman_coeff","r2_val_raw","r2_val_lin","dropped_zeros"]:
        plot_correlation(
            analysis_type=analysis_type,
            correlation_dataframe=correlation_dataframe,
            c_val_range=c_val_range,
            correlation_string=correlation_string
        )
    
    box_plot_OrthoANI(
        analysis_type=analysis_type,
        ortho_ani_dataframe=ortho_ani_dataframe
    )

    # analysis_correlation_dataframe = correlation_dataframe[
    #     (correlation_dataframe["analysis_type"] == analysis_type) &
    #     (correlation_dataframe["analysis_data_dir"] == analysis_data_dir)
    # ]

    # for canonical in (True,):
    #     filtered_correlation_dataframe = analysis_correlation_dataframe[
    #         analysis_correlation_dataframe["canonical"] == canonical
    #     ]
    #     plt.figure(figsize=(10,10),dpi=300)
    #     plt.xlabel("K-mer Length")
    #     plt.ylabel("Pearson Correlation")
    #     plt.title("Pearson Correlation of FracMinHash ANI Esimtation against K-mer Length at different c-value thresholds")

    #     for c_val in c_val_range:
    #         pearson_correlation = list(
    #             filtered_correlation_dataframe[
    #                 (filtered_correlation_dataframe["c_val"] == c_val) &
    #                 (filtered_correlation_dataframe["kmer_length"].isin(set(range(min_kmer_length,max_kmer_length+1))))
    #             ]["pearson_coeff"]
    #         )
    #         print(len(kmer_lengths))
    #         print(len(pearson_correlation))
    #         plt.scatter(kmer_lengths,pearson_correlation,label=f"c = {c_val}")

    #     plt.legend()
    #     plt.plot()
    #     plt.savefig(f"../../plots/correlations/{analysis_type}-{analysis_data_dir}-{kmer_class.__name__}-{canonical=}-pearson.png")
    #     plt.close()




    #     plt.figure(figsize=(10,10),dpi=300)
    #     plt.xlabel("K-mer Length")
    #     plt.ylabel("Spearman Correlation")
    #     plt.title("Spearman Correlation of FracMinHash ANI Esimtation against K-mer Length at different c-value thresholds")

    #     for c_val in c_val_range:
    #         spearman_correlation = [ 
    #             filtered_correlation_dataframe[
    #                 (filtered_correlation_dataframe["c_val"] == c_val) &
    #                 (filtered_correlation_dataframe["kmer_length"] == kmer_length)
    #             ]["spearman_coeff"]
    #             for kmer_length in range(min_kmer_length,max_kmer_length+1)
    #         ]
    #         print(kmer_lengths)
    #         print(spearman_correlation)
    #         plt.scatter(kmer_lengths,spearman_correlation,label=f"c = {c_val}")

    #     plt.legend()
    #     plt.plot()
    #     plt.savefig(f"../../plots/correlations/{analysis_type}-{analysis_data_dir}-{kmer_class.__name__}-{canonical=}-spearman.png")
    #     plt.close()

            

        

In [None]:
import imageio

def plot_single_kmer(
        ortho_vals_df,
        kmer_vals_df,
        kmer_length,
        analysis_type=None,
        analysis_data_dir=None,
):
    """Save a scatter plot of k-mer ANI estimates against OrthoANI for one k.

    The two tables are merged on their shared columns. Pairs where either
    value is not > 0 are dropped. The plot shows the identity line, the
    remaining points and a degree-1 least-squares fit, annotated with the
    number of dropped pairs and the Pearson correlation.

    Args:
        ortho_vals_df: table containing an ``orthoani_val`` column.
        kmer_vals_df: table containing a ``kmer_ani_val`` column.
        kmer_length: k-mer length, used in the legend and the filename.
        analysis_type: analysis type used in the output path. When omitted,
            the module-level variable of the same name is used, as before.
        analysis_data_dir: data directory used in the output path. When
            omitted, the module-level variable of the same name is used.

    Returns:
        str: path of the saved PNG.

    Raises:
        NameError: a label was omitted and the matching global is not defined.
        ValueError: no pair has both values > 0.
    """
    # Explicit arguments win; otherwise fall back to the notebook globals the
    # function used to read implicitly, so existing callers keep working.
    try:
        if analysis_type is None:
            analysis_type = globals()["analysis_type"]
        if analysis_data_dir is None:
            analysis_data_dir = globals()["analysis_data_dir"]
    except KeyError as missing:
        raise NameError(f"name {missing} is not defined") from None

    Path(f"../../plots/temp/{analysis_type}-{analysis_data_dir}/").mkdir(parents=True, exist_ok=True)
    combined_df = pd.merge(ortho_vals_df,kmer_vals_df)
    ortho_vals = combined_df["orthoani_val"]
    kmer_vals = combined_df["kmer_ani_val"]

    kept_pairs = [(o,k) for o,k in zip(ortho_vals,kmer_vals) if (k > 0 and o > 0)]
    if not kept_pairs:
        # Same exception type as the bare unpacking failure, but with the cause spelled out.
        raise ValueError(
            f"No ANI pairs with both values > 0 for {analysis_type}-{analysis_data_dir} at k={kmer_length}"
        )
    filter_o, filter_k = zip(*kept_pairs)

    lin_fit = np.poly1d(np.polyfit(filter_o, filter_k, 1))
    pearson_coeff = scipy.stats.pearsonr(filter_o,filter_k)

    fig = plt.figure(figsize=(10,10),dpi=300)
    ax = fig.add_subplot()
    # Each artist carries its own label, so the legend cannot pair a label
    # with the wrong artist.
    ax.plot(filter_o,filter_o,"r-",label="OrthoANI")
    ax.scatter(filter_o,filter_k,marker="^",s=2,label=f"{kmer_length}-mer dots")
    unique_o = np.unique(filter_o)
    ax.plot(unique_o, lin_fit(unique_o),label=f"{kmer_length}-mer line : {lin_fit}")

    plt.figtext(0.7,0.15,f"Zeros dropped : {len(ortho_vals) - len(filter_o)}")
    plt.figtext(0.7,0.17,f"Pearson Corr. Coeff.: {pearson_coeff.statistic:.3f}")
    plt.figtext(0.7,0.19,f"Pearson Corr. p.: {pearson_coeff.pvalue:.3e}")

    ax.legend()

    plotname = f"../../plots/temp/{analysis_type}-{analysis_data_dir}/{analysis_type}-{analysis_data_dir}-{kmer_length}mer-estimated-ANI.png"
    plt.savefig(plotname)
    plt.close()
    return plotname

In [None]:
# For every (analysis type, data directory), render one scatter plot per k-mer
# length and c value with plot_single_kmer, then assemble the frames into a GIF.
# The loop variables `analysis_type` and `analysis_data_dir` are notebook
# globals; plot_single_kmer is not passed them and reads them from module
# scope, so they must keep these exact names.
for analysis_type in analysis_types:
    for analysis_data_dir in analysis_types[analysis_type]:
        Path(f"../../plots/{analysis_type}-{analysis_data_dir}/").mkdir(parents=True, exist_ok=True)

        # Frames in sweep order (k-mer length outer, c value inner).
        analysis_images = []
        min_kmer_length, max_kmer_length = analysis_types[analysis_type][analysis_data_dir]

        for kmer_length in range(min_kmer_length,max_kmer_length+1):
            for c_val in c_val_range:
                canon = True
                # plot_single_kmer saves a PNG and returns its path; the image
                # is read straight back in as the next frame.
                analysis_images.append(
                    imageio.v3.imread(
                        plot_single_kmer(
                            ortho_vals_df = ortho_ani_dataframe[
                                (ortho_ani_dataframe["analysis_type"] == analysis_type)
                                & (ortho_ani_dataframe["analysis_data_dir"] == analysis_data_dir)
                            ],
                            kmer_vals_df = kmer_sketch_dataframe[
                                (kmer_sketch_dataframe["analysis_type"] == analysis_type)
                                & (kmer_sketch_dataframe["analysis_data_dir"] == analysis_data_dir)
                                & (kmer_sketch_dataframe["kmer_length"] == kmer_length)
                                & (kmer_sketch_dataframe["c_val"] == c_val)
                                & (kmer_sketch_dataframe["canonical"] == canon)
                            ],
                            kmer_length= kmer_length,
                        )
                    )
                )
        
        print(len(analysis_images))
        plot_gif_filename = f"../../plots/{analysis_type}-{analysis_data_dir}/{analysis_type}-{analysis_data_dir}-estimated-ANI.gif"
        # NOTE(review): the unit of `duration` (seconds vs milliseconds) may
        # depend on the installed imageio version -- confirm the frame timing.
        imageio.mimsave(plot_gif_filename,analysis_images,duration=0.5)