### Taxonomy dataframe wrangling
- Get the taxonomy for NCBI Refseq RdRp sequences
- Compare Refseq to genomad taxonomy
- Manually curate the comparison file
- Write one file per viral family, listing the names of the sequences assigned to that family
- Use these files for family taxonomy trees


In [None]:
# create a df with for each NCBI sequence the taxonomy
# Taxonomy comes from xml file

xml_file = 'phage.xml'
df = extract_xml_data(xml_file)
df.to_csv('taxonomy.csv')

# extract taxonomy from NCBI xml
def extract_xml_data(xml_file):
    # Parse the XML file
    tree = ET.parse(xml_file)
    root = tree.getroot()

    # Initialize lists to store data
    accession_versions = []
    organisms = []
    taxonomies = []

    # Find all INSDSeq records
    for record in root.findall('.//INSDSeq'):
        # Extract required fields
        accession_version = record.find('INSDSeq_accession-version').text
        organism = record.find('INSDSeq_organism').text
        taxonomy = record.find('INSDSeq_taxonomy').text

        # Append data to lists
        accession_versions.append(accession_version)
        organisms.append(organism)
        taxonomies.append(taxonomy)

    # Create DataFrame
    df = pd.DataFrame({
        'INSDSeq_accession-version': accession_versions,
        'INSDSeq_organism': organisms,
        'INSDSeq_taxonomy': taxonomies
    })

    return df


In [None]:
# merge genomad with the blast output, and only keep the deduplicated sequeces
# open just created taxonomy df
df_tax = pd.read_csv('./taxonomy.csv')

# genomad out
df_genomad = pd.read_csv('./genomad_out.csv')

# list of deduplicated seqs
dedup = pd.read_csv('./trees/headers_dedup.txt')
# merge with genomad output
df = df_genomad.merge(dedup, on='gene', how='inner')

# blastP out
blast = pd.read_csv('./trees/refseq.genomad.blastp.tsv', sep='\t')

# merge blast results with NCBI taxonomy
blast = blast.merge(df_tax, on='accession', how='outer')
blast = blast[['gene', 'accession', 'Expected value', 'INSDSeq_taxonomy']]
blast.rename(columns={'INSDSeq_taxonomy': 'blast_taxonomy'}, inplace=True)

# merge genomad and blast output
df = df.merge(blast, on='gene', how='outer')

# manually curate the output to only retain family level matches, 
#and keeping either blast result or genomad result for taxonomy, depending on e-value and taxonomic rank 
# i.e. only keeping ranks at family or lower

In [None]:
# Load in the manually curated file, and write lists of sequences that are in 
# specific viral families to create trees

df = pd.read_csv('./trees/mancur_family_tax.csv')

# Iterate through each unique taxname in df_own
for taxname in df_own['taxname'].unique():
    # Get the rows in df_own corresponding to the current taxname
    own_rows = df_own[df_own['taxname'] == taxname]
    # Check if taxname is a substring of any INSDSeq_taxonomy in df_tax
    match = df_tax[df_tax['INSDSeq_taxonomy'].str.contains(taxname, na=False)]
    # If there is a match, write the corresponding INSDSeq_accession-version and gene to a text file
    if not match.empty:
        matching_accession_versions = match['accession'].tolist()
        matching_genes = own_rows['gene'].tolist()
        # Create a filename based on the taxname
        filename = f"trees/family_trees/subfams/{taxname}.txt"
        # Write the matching accession versions and genes to the text file
        with open(filename, 'w') as file:
            for accession in matching_accession_versions:
                file.write(accession + "\n")
            for gene in matching_genes:
                file.write(gene + "\n")