In [34]:
import pandas as pd
import h5py
from pyfaidx import Fasta

In [1]:
# Input files that share the "3and6_10char" prefix; each is consumed by a
# different reader further down in this notebook.
h5 = "3and6_10char.h5"        # opened with h5py
fasta = "3and6_10char.fasta"  # opened with pyfaidx
csv = "3and6_10char.csv"      # read with pandas

In [51]:
from typing import Generator
from math import ceil, log
from itertools import product

def get_new_seq_ids(nr_seqs: int) -> Generator[str, None, None]:
    """Generate fixed-width uppercase identifiers (e.g. AA, AB, AC, ...).

    The width is the smallest number of letters (at least one) for which
    ``26 ** width >= nr_seqs``, so every sequence can get a unique,
    non-empty identifier.

    Args:
        nr_seqs: Number of sequences that need an identifier.

    Returns:
        A lazy generator over *all* identifiers of the chosen width, in
        lexicographic order. It may yield more than ``nr_seqs`` items;
        callers are expected to take only as many as they need.

    Raises:
        ValueError: If ``nr_seqs`` is negative.
    """
    if nr_seqs < 0:
        raise ValueError(f"nr_seqs must be non-negative, got {nr_seqs}")

    alphabet = "".join(map(chr, range(ord("A"), ord("Z") + 1)))

    # Determine the width with exact integer arithmetic. A floating-point
    # log can land just above or below an integer at exact powers of the
    # alphabet size, and log(1) == 0 would give zero-width (empty) ids.
    nr_letters_needed = 1
    capacity = len(alphabet)
    while capacity < nr_seqs:
        capacity *= len(alphabet)
        nr_letters_needed += 1

    # new identifier generator (e.g. AA, AB, AC, ...)
    new_seq_ids = (
        "".join(combi)
        for combi in product(alphabet, repeat=nr_letters_needed)
    )
    return new_seq_ids

In [81]:
# Load the metadata table ordered by its 10-character code, so the short
# ids are handed out in that same order.
df = pd.read_csv(csv).sort_values("new_10_Char")

# The generator yields every id of the required width; keep one per row.
all_ids = list(get_new_seq_ids(nr_seqs=len(df)))
mapped_id = all_ids[:len(df)]

# Put the new id in the first column.
df.insert(0, "mapped_id", mapped_id)
df.head(2)

Unnamed: 0,mapped_id,new_10_Char,10 char .phy format name,Original fasta header,Name (existing or suggested),Major group,Species,Major taxon for the purposes of this study
292,AAA,AAAAAAAAAA,Cbivi_3FTx_000,Cbivi_3FTx_000,-,3FTx,Calliophis bivirgatus,Elapidae
386,AAB,AAAAAAAAAB,Erythrolamprus_poecilogyrus_A7X3M9,Erythrolamprus_poecilogyrus_A7X3M9,-,3FTx,Erythrolamprus poecilogyrus,Colubridae


In [82]:
new_fasta = "3FTx.fasta"

# Build the old-id -> new-id lookup once instead of scanning the whole frame
# for every record. If a 10-character code occurs more than once, the first
# row wins, which is what the previous `.values[0]` lookup returned.
id_lookup = (
    df.drop_duplicates(subset="new_10_Char", keep="first")
    .set_index("new_10_Char")["mapped_id"]
    .to_dict()
)

# Copy every record to the new FASTA file, replacing its header by the
# mapped id and writing the sequence on a single line.
fasta_handler = Fasta(fasta)
with open(new_fasta, "w") as f_out:
    for header, seq in fasta_handler.items():
        if header not in id_lookup:
            # Fail with the offending header rather than an opaque IndexError.
            raise KeyError(
                f"FASTA header {header!r} not found in column 'new_10_Char'"
            )
        new_header = id_lookup[header]
        f_out.write(f">{new_header}\n")
        f_out.write(f"{seq}\n")

In [83]:
new_h5 = "3FTx.h5"

# Build the old-id -> new-id lookup once instead of scanning the whole frame
# for every dataset. If a 10-character code occurs more than once, the first
# row wins, which is what the previous `.values[0]` lookup returned.
id_lookup = (
    df.drop_duplicates(subset="new_10_Char", keep="first")
    .set_index("new_10_Char")["mapped_id"]
    .to_dict()
)

# Copy every top-level entry of the input HDF5 file into a new file, stored
# under its mapped id.
with h5py.File(h5, "r") as in_hdf, h5py.File(new_h5, "w") as out_hdf:
    for header, emb in in_hdf.items():
        if header not in id_lookup:
            # Fail with the offending key rather than an opaque IndexError.
            raise KeyError(
                f"HDF5 key {header!r} not found in column 'new_10_Char'"
            )
        new_header = id_lookup[header]
        try:
            out_hdf.create_dataset(name=new_header, data=emb)
        except ValueError:
            # Best effort: report the entry that could not be written and
            # carry on with the remaining ones.
            print(new_header, header)

In [84]:
# Save the table without the two 10-character helper columns, exposing the
# original FASTA header as "original_id".
df_new = (
    df
    .drop(columns=["new_10_Char", "10 char .phy format name"])
    .rename(columns={"Original fasta header": "original_id"})
)
df_new.to_csv("3FTx_mapped.csv", index=False)

# Up-to-date data

In [12]:
import pandas as pd

path = "new/SM_table.csv"

# The table is semicolon-separated. The first character of every
# "Alphabeticode" value is dropped on load.
# NOTE(review): presumably a prefix character in the raw file - confirm.
df = pd.read_csv(path, sep=";").assign(
    Alphabeticode=lambda table: table["Alphabeticode"].str[1:]
)
df

Unnamed: 0,Alphabeticode,TM prediction,Cell localisation prediction,Original fasta header,Name in fasta,Name (existing or suggested),Major group,Genomic toxins,Preliminary cysteine group,Evolutionary order,...,Dimeric Toxins A subunit (pos 93),Dimeric Toxins B subunit (pos 53),Long chain cysteine 1 (pos 85),Long chain cysteine 2 (pos 89),Basal,Dimeric,Derived,Short-chain,Long-chain,canonical cysteine weirdness
0,AAAAAAAAAA,Soluble,Extracellular,Cbivi_3FTx_000,Cbivi_3FTx_000,-,3FTx,,Secretory,Basal,...,T,-,N,T,True,False,False,False,False,False
1,AAAAAAAAAB,Soluble,Extracellular,Erythrolamprus_poecilogyrus_A7X3M9,Erythrolamprus_poecilogyrus_A7X3M9,-,3FTx,,Secretory,Basal,...,M,-,T,T,True,False,False,False,False,False
2,AAAAAAAAAC,Soluble,Extracellular,Boiga_irregularis_Brisbane_GGUD01000006,Boiga_irregularis_Brisbane_GGUD01000006,-,3FTx,,Secretory,Weird,...,L,-,S,L,False,False,False,False,False,True
3,AAAAAAAAAD,Soluble,Extracellular,Boiga_nigriceps_GGUF01000001,Boiga_nigriceps_GGUF01000001,-,3FTx,,Secretory,Weird,...,L,-,S,L,False,False,False,False,False,True
4,AAAAAAAAAE,Soluble,Extracellular,Telescopus_dhara_A7X3N6,Telescopus_dhara_A7X3N6,-,3FTx,,Secretory,Weird,...,L,-,S,M,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
986,AAAAAAABMZ,Soluble,Extracellular,sp|Q7T2I5|3S1EC_LATSE Erabutoxin c OS=Laticaud...,995sp|Q7T2,Erabutoxin c,3FTx,,Secretory,Short-chain,...,F,-,S,-,False,False,True,True,False,False
987,AAAAAAABNA,Soluble,Extracellular,sp|Q90VW1|3S1EB_LATSE Erabutoxin b OS=Laticaud...,997sp|Q90V,Erabutoxin b,3FTx,,Secretory,Short-chain,...,F,-,S,-,False,False,True,True,False,False
988,AAAAAAABNB,Soluble,Extracellular,sp|Q9YGC7|3S15_LATCO Short neurotoxin NCA02/NC...,999sp|Q9YG,Short neurotoxin NCA02/NCA05/UER05,3FTx,,Secretory,Short-chain,...,H,-,R,-,False,False,True,True,False,False
989,AAAAAAABNC,Membrane,Cell_membrane,Chmy_7 Chelonia mydas unplaced genomic scaffol...,99Chmy_7_C,-,Ly-6,,Non-secretory,Ly-6,...,H,S,G,G,True,False,False,False,False,False


In [18]:
# Flag rows whose original FASTA header already appeared in an earlier row
# (the first occurrence is not flagged) and list them ordered by header.
dup_mask = df.duplicated(subset="Original fasta header", keep="first")
df.loc[dup_mask].sort_values(by="Original fasta header")

Unnamed: 0,Alphabeticode,TM prediction,Cell localisation prediction,Original fasta header,Name in fasta,Name (existing or suggested),Major group,Genomic toxins,Preliminary cysteine group,Evolutionary order,...,Dimeric Toxins A subunit (pos 93),Dimeric Toxins B subunit (pos 53),Long chain cysteine 1 (pos 85),Long chain cysteine 2 (pos 89),Basal,Dimeric,Derived,Short-chain,Long-chain,canonical cysteine weirdness
348,AAAAAAAAOG,Soluble,Extracellular,Boiga_cynodon_A0A193CHL1,1333Boiga_,-,3FTx,,Secretory,Basal,...,G,S,P,-,True,False,False,False,False,False
327,AAAAAAAANL,Soluble,Extracellular,Boiga_irregularis_A0A0B8RVB7,1293Boiga_,-,3FTx,,Secretory,Basal,...,G,C,P,-,True,False,False,False,False,False
324,AAAAAAAANI,Soluble,Extracellular,Boiga_irregularis_A0A0B8RZX5,1287Boiga_,-,3FTx,,Secretory,Weird,...,G,C,P,-,False,False,False,False,False,True
352,AAAAAAAAOK,Soluble,Extracellular,Boiga_irregularis_A0A0B8RZX6,1341Boiga_,-,3FTx,,Secretory,Basal,...,C,S,P,-,True,True,False,False,False,False
379,AAAAAAAAPL,Soluble,Extracellular,Boiga_nigriceps_A0A193CHL8,1395Boiga_,-,3FTx,,Secretory,Basal,...,C,P,P,-,True,True,False,False,False,False
332,AAAAAAAANQ,Soluble,Extracellular,Oxybelis_fulgidus_A0A193CHK9,1301Oxybel,-,3FTx,,Secretory,Basal,...,G,S,S,-,True,False,False,False,False,False
296,AAAAAAAALN,Soluble,Extracellular,Telescopus_dhara_A7X3S5,1201Telesc,-,3FTx,,Secretory,Weird,...,G,-,P,-,False,False,False,False,False,True
297,AAAAAAAALO,Soluble,Extracellular,Telescopus_dhara_A7X3S8,1203Telesc,-,3FTx,,Secretory,Basal,...,G,-,P,-,True,False,False,False,False,False
333,AAAAAAAANR,Soluble,Extracellular,Trimorphodon_lambda_A0A193CHM1,1303Trimor,-,3FTx,,Secretory,Basal,...,G,C,P,-,True,False,False,False,False,False
383,AAAAAAAAPP,Soluble,Extracellular,Trimorphodon_lambda_A0A193CHM4,1403Trimor,-,3FTx,,Secretory,Basal,...,G,S,P,-,True,False,False,False,False,False
