# Data preprocessing example code

### 1. Use Cogent3 to generate both a Newick-format phylogenetic tree file and a pairwise distance matrix in CSV format

In [None]:
from cogent3 import load_unaligned_seqs
from cogent3.align.progressive import tree_align
import os

# Input FASTA of unaligned consensus sequences and the two files written below.
consensus_fasta_file = "merged.consensus.fa"  # Replace with your file path
tree_newick_file = "samples.tn93.tree"  # Replace with your file path
dists_csv_file = "samples.dist.csv"  # Replace with your file path

# Load the sequences as DNA, then let tree_align produce the alignment and
# the tree together using the TN93 model.
seqs = load_unaligned_seqs(consensus_fasta_file, moltype="dna")
aln, tree = tree_align("TN93", seqs, show_progress=False)
tree.write(tree_newick_file)

# Pairwise TN93 distances from the alignment, converted step by step to a
# pandas DataFrame and written with to_csv's default comma separator.
dists = aln.distance_matrix(calc="tn93", show_progress=False)
dists_table = dists.to_table()
dists_df = dists_table.to_dataframe()
dists_df.to_csv(dists_csv_file)

### 2. Convert MASH distances to a pairwise distance matrix in TSV format and generate a phylogenetic tree in Newick format

In [None]:
import pandas as pd
import os
import numpy as np

# MASH distance --> pairwise distance matrix in tsv
# `mash dist` writes tab-separated rows WITHOUT a header line, so the column
# names are supplied here via header=None/names. Letting pandas infer a header
# would consume the first comparison as column labels and drop it from the
# matrix. (If your file does carry a header row, remove it first.)
distances = pd.read_csv(
    "mash_distances.txt",  # Replace with your file path
    sep="\t",
    header=None,
    names=["sample1", "sample2", "distance", 'p-value', 'Matching Hashes / Total Sketch Size'],
)
# remove ".fasta" from the sample names in first and second columns
# regex=False makes "." a literal dot on every pandas version; with the old
# regex default, ".fasta" would also match e.g. "_fasta" or "xfasta".
distances["sample1"] = distances["sample1"].str.replace(".fasta", "", regex=False)
distances["sample2"] = distances["sample2"].str.replace(".fasta", "", regex=False)
# One row per sample1, one column per sample2, cells holding the distance.
df = distances.pivot(index="sample1", columns="sample2", values="distance")
df.to_csv("mash_distances.tsv", sep="\t")

# pairwise distance matrix in tsv --> phylogenetic tree in Newick
from Bio import Phylo
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor, DistanceMatrix
import pandas as pd

def load_distance_matrix(file_path):
    """Load a square pairwise distance matrix from a TSV file.

    The file is read with its first column as row labels and its header row
    as column labels. Rows are re-ordered to follow the column order before
    the lower triangle (diagonal included) is extracted, so a file whose rows
    and columns list the same samples in a different order still yields a
    correctly labelled matrix.

    Args:
        file_path: Path to the tab-separated matrix file.

    Returns:
        A ``DistanceMatrix`` built from the column labels and the
        lower-triangular values.

    Raises:
        ValueError: If the row labels and column labels are not the same set
            of sample names, or a cell cannot be converted to ``float``.
    """
    df = pd.read_csv(file_path, sep='\t', index_col=0)
    names = [str(label) for label in df.columns]
    row_labels = [str(label) for label in df.index]

    # Positional slicing below is only meaningful when row i and column i
    # refer to the same sample, so check that before touching the values.
    if sorted(row_labels) != sorted(names):
        raise ValueError(
            "Distance matrix must be square with identical row and column "
            f"labels: {file_path}"
        )

    df.index = row_labels
    df.columns = names
    df = df.loc[names]  # put the rows in column order

    # Row i holds columns 0..i, i.e. the lower triangle including the
    # diagonal; values are converted to plain Python floats.
    lower_triangle_matrix = [
        [float(value) for value in df.iloc[i, :i + 1]]
        for i in range(len(names))
    ]
    return DistanceMatrix(names, lower_triangle_matrix)

def construct_tree(distance_matrix):
    """Build a phylogenetic tree from a distance matrix with UPGMA.

    Args:
        distance_matrix: Matrix handed unchanged to
            ``DistanceTreeConstructor.upgma``.

    Returns:
        The tree returned by ``upgma``.
    """
    return DistanceTreeConstructor().upgma(distance_matrix)

def write_tree_to_newick(tree, output_file):
    """Write ``tree`` to ``output_file`` in Newick format using ``Phylo.write``."""
    newick_format = 'newick'
    Phylo.write(tree, output_file, newick_format)

# Matrix file to read and Newick file to write.
input_file, output_file = (
    'mash_distances.tsv',  # Replace with your file path
    'mash_tree.newick',    # Replace with your file path
)

# TSV matrix -> UPGMA tree -> Newick file.
distance_matrix = load_distance_matrix(input_file)
tree = construct_tree(distance_matrix)
write_tree_to_newick(tree, output_file)

### 3. 