In [1]:
import polars as pl

from mrnarchitect.utils.fasta import parse_fasta_file

In [2]:
_FILES = [
    "antibody_monoclonal_aa_seqs.fasta",
    "ena_cancer_vaccine_seq.fa",
    "ena_car-t_nt_seq.fa",
    "ensembl_ccds_tx_nt_seq.fa",
    "hrt_hk_nt_seq_pulled_from_ensembl.fa",
    "iedb_antigen_aa_seqs.fa",
]

dataframes =[]
for file in _FILES:
    sequences = []
    for name, raw_sequence, sequence, error in parse_fasta_file(file):
        sequences.append({
            "source": file,
            "name": name,
            "raw_input_sequence": raw_sequence,
            "input_sequence": str(sequence),
            "error": error,
        })
    dataframe = pl.DataFrame(
        sequences,
        schema_overrides={"error": pl.String | None}
    ).with_columns(
        error=pl.when(
            pl.col("error").is_null(),
            pl.col("input_sequence").str.len_bytes() % 3 != 0
        ).then(
            pl.lit("Sequence is not a valid amino acid sequence (length % 3 != 0).")
        ).otherwise(
            pl.col("error")
        )
    )
    dataframes.append(dataframe)
df = pl.concat(dataframes).with_row_index()
df.write_csv("input-sequences-full.csv")
df

ValueError: not enough values to unpack (expected 4, got 2)

In [24]:
_SAMPLE_SIZE = 3000
_MAX_NT_LENGTH = 7000

dataframes2 = []
sources = set(df["source"].to_list())
print(sources)
for source in sources:
    df_ = df.filter(
        pl.col("source") == source,
        pl.col("error").is_null(),
        pl.col("input_sequence").str.len_bytes() < _MAX_NT_LENGTH
    )
    if df_.select(pl.len()).item() > _SAMPLE_SIZE:
        df_ = df_.sample(_SAMPLE_SIZE)
    dataframes2.append(df_)
df2 = pl.concat(dataframes2)
df2.write_csv("input-sequences.csv")
df2

{'ensembl_ccds_tx_nt_seq.fa', 'antibody_monoclonal_aa_seqs.fasta', 'hrt_hk_nt_seq_pulled_from_ensembl.fa', 'ena_car-t_nt_seq.fa', 'ena_cancer_vaccine_seq.fa', 'iedb_antigen_aa_seqs.fa'}


index,source,name,raw_input_sequence,input_sequence,error
u32,str,str,str,str,str
4721,"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000115947|ENSG0000011594…","""ATGAGCAGTCGTAAATCAAAGAGTAACAGC…","""ATGAGCAGTCGTAAATCAAAGAGTAACAGC…",
39774,"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000160953|ENSG0000016095…","""ATGGCGGATGCCAAGTATGTCCTCTGCCGA…","""ATGGCGGATGCCAAGTATGTCCTCTGCCGA…",
27306,"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000112787|ENSG0000011278…","""ATGGAGGCCAAGGTCCGCCCGAGCCGGCGC…","""ATGGAGGCCAAGGTCCGCCCGAGCCGGCGC…",
27148,"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000198885|ENSG0000019888…","""ATGAATGTTGATGCAGAGGCCTCCATGGCT…","""ATGAATGTTGATGCAGAGGCCTCCATGGCT…",
20254,"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000204410|ENSG0000020441…","""ATGGCCTCCTTAGGAGCGAACCCAAGGAGG…","""ATGGCCTCCTTAGGAGCGAACCCAAGGAGG…",
…,…,…,…,…,…
50708,"""iedb_antigen_aa_seqs.fa""","""sp|Q8IX19|MCEM1_HUMAN Mast cel…","""MEVEEIYKHQEVKMQAPAFRDKKQGVSAKN…","""ATGGAGGTGGAGGAGATCTACAAGCACCAG…",
50709,"""iedb_antigen_aa_seqs.fa""","""sp|Q9D287|SPF27_MOUSE Pre-mRNA…","""MAGTGLVAGEVVVDALPYFDQGYEAPGVRE…","""ATGGCCGGCACCGGCCTGGTGGCCGGCGAG…",
50710,"""iedb_antigen_aa_seqs.fa""","""tr|Q4CN05|Q4CN05_TRYCC Trans-s…","""MSRHLFYSAVLLLLVVMCCGTAAVNAEELS…","""ATGAGCAGACACCTGTTCTACAGCGCCGTG…",
50711,"""iedb_antigen_aa_seqs.fa""","""tr|Q5NG75|Q5NG75_FRATT Amino-a…","""MDNNQDKLKRDILSRHIVMISLGGTISASF…","""ATGGACAACAACCAGGACAAGCTGAAGAGA…",
