In [34]:
import pandas as pd
import h5py
from pyfaidx import Fasta

In [1]:
# Input files for this notebook; all three share the "3and6_10char" stem.
# Metadata table, loaded with pd.read_csv in a later cell.
csv = "3and6_10char.csv"
# Sequence file, opened with pyfaidx.Fasta in a later cell.
fasta = "3and6_10char.fasta"
# HDF5 file, opened read-only with h5py.File in a later cell.
h5 = "3and6_10char.h5"

In [51]:
from typing import Generator
from math import ceil, log
from itertools import product

def get_new_seq_ids(nr_seqs: int) -> Generator[str, None, None]:
    """Return a generator of fixed-width, upper-case letter identifiers.

    The width is the smallest number of letters (at least one) for which the
    26-letter alphabet A-Z offers at least ``nr_seqs`` distinct combinations.
    Identifiers are produced in lexicographic order, e.g. ``AA, AB, AC, ...``.

    Args:
        nr_seqs: Number of sequences that need a unique identifier. ``0`` and
            ``1`` both result in single-letter identifiers.

    Returns:
        A lazy generator over *all* identifiers of the computed width. This
        may be more than ``nr_seqs``; callers take as many as they need.

    Raises:
        ValueError: If ``nr_seqs`` is negative.
    """
    if nr_seqs < 0:
        raise ValueError(f"nr_seqs must be non-negative, got {nr_seqs}")

    alphabet = "".join(map(chr, range(ord("A"), ord("Z") + 1)))

    # Find the width with exact integer arithmetic. A floating-point
    # ceil(log(n, base)) yields 0 letters for n == 1 (a single empty ID) and
    # can be off by one when n is an exact power of the base.
    nr_letters_needed = 1
    capacity = len(alphabet)
    while capacity < nr_seqs:
        nr_letters_needed += 1
        capacity *= len(alphabet)

    # new identifier generator (e.g. AA, AB, AC, ...)
    new_seq_ids = (
        "".join(combi)
        for combi in product(alphabet, repeat=nr_letters_needed)
    )
    return new_seq_ids

In [81]:
# Load the metadata table and order its rows by the existing "new_10_Char"
# identifier so that the short IDs below are handed out in that order.
df = pd.read_csv(csv).sort_values("new_10_Char")

# Generate one short identifier per row and place it in the first column.
n_rows = len(df)
mapped_id = list(get_new_seq_ids(nr_seqs=n_rows))[:n_rows]
df.insert(loc=0, column="mapped_id", value=mapped_id)

# Preview the first two rows.
df.head(2)

Unnamed: 0,mapped_id,new_10_Char,10 char .phy format name,Original fasta header,Name (existing or suggested),Major group,Species,Major taxon for the purposes of this study
292,AAA,AAAAAAAAAA,Cbivi_3FTx_000,Cbivi_3FTx_000,-,3FTx,Calliophis bivirgatus,Elapidae
386,AAB,AAAAAAAAAB,Erythrolamprus_poecilogyrus_A7X3M9,Erythrolamprus_poecilogyrus_A7X3M9,-,3FTx,Erythrolamprus poecilogyrus,Colubridae


In [82]:
new_fasta = "3FTx.fasta"

# Build the old-header -> short-ID lookup once instead of scanning the whole
# table for every record. If a "new_10_Char" value occurs more than once, the
# first row wins, matching the previous `.values[0]` behaviour.
header_to_id = (
    df.drop_duplicates(subset="new_10_Char")
    .set_index("new_10_Char")["mapped_id"]
    .to_dict()
)

# Re-write every record of the input FASTA under its short identifier.
fasta_handler = Fasta(fasta)
with open(new_fasta, "w") as f_out:
    for header, seq in fasta_handler.items():
        if header not in header_to_id:
            # Fail with a message naming the offending record rather than an
            # opaque IndexError from an empty selection.
            raise KeyError(
                f"FASTA header {header!r} has no entry in column "
                f"'new_10_Char' of {csv!r}"
            )
        new_header = header_to_id[header]
        f_out.write(f">{new_header}\n")
        # `seq` is formatted via str(); presumably this renders the record's
        # full sequence -- confirm against pyfaidx.
        f_out.write(f"{seq}\n")

In [83]:
new_h5 = "3FTx.h5"

# Build the old-header -> short-ID lookup once instead of scanning the whole
# table for every dataset. If a "new_10_Char" value occurs more than once, the
# first row wins, matching the previous `.values[0]` behaviour.
header_to_id = (
    df.drop_duplicates(subset="new_10_Char")
    .set_index("new_10_Char")["mapped_id"]
    .to_dict()
)

# Copy every top-level entry of the input HDF5 file into a new file, keyed by
# its short identifier.
with h5py.File(h5, "r") as in_hdf, h5py.File(new_h5, "w") as out_hdf:
    for header, emb in in_hdf.items():
        if header not in header_to_id:
            # Fail with a message naming the offending key rather than an
            # opaque IndexError from an empty selection.
            raise KeyError(
                f"HDF5 key {header!r} has no entry in column "
                f"'new_10_Char' of {csv!r}"
            )
        new_header = header_to_id[header]
        try:
            out_hdf.create_dataset(name=new_header, data=emb)
        except ValueError:
            # Best-effort: report the pair that could not be written and keep
            # going (presumably a name collision in the output -- confirm).
            print(new_header, header)

In [84]:
# Produce the final mapping table: remove the two legacy identifier columns,
# give the original header column a code-friendly name, and save it without
# the row index.
df_new = (
    df
    .drop(columns=["new_10_Char", "10 char .phy format name"])
    .rename(columns={"Original fasta header": "original_id"})
)
df_new.to_csv("3FTx_mapped.csv", index=False)