In [4]:
import polars as pl

from mrnarchitect.utils.fasta import parse_fasta_file

In [5]:
_FILES = [
    "antibody_monoclonal_aa_seqs.fasta",
    "ena_cancer_vaccine_seq.fa",
    "ena_car-t_nt_seq.fa",
    "ensembl_ccds_tx_nt_seq.fa",
    "hrt_hk_nt_seq_pulled_from_ensembl.fa",
    "iedb_antigen_aa_seqs.fa",
]

dataframes =[]
for file in _FILES:
    sequences = []
    for name, raw_sequence, sequence, error in parse_fasta_file(file):
        sequences.append({
            "source": file,
            "name": name,
            "raw_input_sequence": raw_sequence,
            "input_sequence": str(sequence) if sequence else None,
            "error": error,
        })
    dataframe = pl.DataFrame(
        sequences,
        schema_overrides={
            "input_sequence": pl.String | None,
            "error": pl.String | None
        }
    ).with_columns(
        error=pl.when(
            pl.col("error").is_null(),
            pl.col("input_sequence").str.len_bytes() % 3 != 0
        ).then(
            pl.lit("Sequence is not a valid amino acid sequence (length % 3 != 0).")
        ).otherwise(
            pl.col("error")
        )
    )
    dataframes.append(dataframe)
df = pl.concat(dataframes).with_row_index()
df.write_csv("input-sequences-full.csv")
df

index,source,name,raw_input_sequence,input_sequence,error
u32,str,str,str,str,str
0,"""antibody_monoclonal_aa_seqs.fa…","""RituximabTargetAnti-CD20v_heav…","""QVQLQQPGAELVKPGASVKMSCKASGYTFT…","""CAGGTGCAGCTGCAGCAGCCCGGCGCCGAG…",
1,"""antibody_monoclonal_aa_seqs.fa…","""RituximabTargetAnti-CD20v_ligh…","""QIVLSQSPAILSASPGEKVTMTCRASSSVS…","""CAGATCGTGCTGAGCCAGAGCCCCGCCATC…",
2,"""antibody_monoclonal_aa_seqs.fa…","""TrastuzumabTargetAnti-HER2_Lig…","""DIQMTQSPSSLSASVGDRVTITCRASQDVN…","""GACATCCAGATGACCCAGAGCCCCAGCAGC…",
3,"""antibody_monoclonal_aa_seqs.fa…","""TrastuzumabTargetAnti-HER2_Hea…","""EVQLVESGGGLVQPGGSLRLSCAASGFNIK…","""GAGGTGCAGCTGGTGGAGAGCGGCGGCGGC…",
4,"""antibody_monoclonal_aa_seqs.fa…","""Bevacizumab_light_chain""","""DIQMTQSPSSLSASVGDRVTITCSASQDIS…","""GACATCCAGATGACCCAGAGCCCCAGCAGC…",
…,…,…,…,…,…
50708,"""iedb_antigen_aa_seqs.fa""","""sp|Q8IX19|MCEM1_HUMAN Mast cel…","""MEVEEIYKHQEVKMQAPAFRDKKQGVSAKN…","""ATGGAGGTGGAGGAGATCTACAAGCACCAG…",
50709,"""iedb_antigen_aa_seqs.fa""","""sp|Q9D287|SPF27_MOUSE Pre-mRNA…","""MAGTGLVAGEVVVDALPYFDQGYEAPGVRE…","""ATGGCCGGCACCGGCCTGGTGGCCGGCGAG…",
50710,"""iedb_antigen_aa_seqs.fa""","""tr|Q4CN05|Q4CN05_TRYCC Trans-s…","""MSRHLFYSAVLLLLVVMCCGTAAVNAEELS…","""ATGAGCAGACACCTGTTCTACAGCGCCGTG…",
50711,"""iedb_antigen_aa_seqs.fa""","""tr|Q5NG75|Q5NG75_FRATT Amino-a…","""MDNNQDKLKRDILSRHIVMISLGGTISASF…","""ATGGACAACAACCAGGACAAGCTGAAGAGA…",


In [7]:
_SAMPLE_SIZE = 100_000
_MAX_NT_LENGTH = 10000

dataframes2 = []
sources = set(df["source"].to_list())
print(sources)
for source in sources:
    df_ = df.filter(
        pl.col("source") == source,
        pl.col("error").is_null(),
        pl.col("input_sequence").str.len_bytes() < _MAX_NT_LENGTH
    )
    if df_.select(pl.len()).item() > _SAMPLE_SIZE:
        df_ = df_.sample(_SAMPLE_SIZE)
    dataframes2.append(df_)
df2 = pl.concat(dataframes2)
df2.write_csv("input-sequences.csv")
df2

{'antibody_monoclonal_aa_seqs.fasta', 'ena_car-t_nt_seq.fa', 'hrt_hk_nt_seq_pulled_from_ensembl.fa', 'iedb_antigen_aa_seqs.fa', 'ena_cancer_vaccine_seq.fa', 'ensembl_ccds_tx_nt_seq.fa'}


index,source,name,raw_input_sequence,input_sequence,error
u32,str,str,str,str,str
0,"""antibody_monoclonal_aa_seqs.fa…","""RituximabTargetAnti-CD20v_heav…","""QVQLQQPGAELVKPGASVKMSCKASGYTFT…","""CAGGTGCAGCTGCAGCAGCCCGGCGCCGAG…",
1,"""antibody_monoclonal_aa_seqs.fa…","""RituximabTargetAnti-CD20v_ligh…","""QIVLSQSPAILSASPGEKVTMTCRASSSVS…","""CAGATCGTGCTGAGCCAGAGCCCCGCCATC…",
2,"""antibody_monoclonal_aa_seqs.fa…","""TrastuzumabTargetAnti-HER2_Lig…","""DIQMTQSPSSLSASVGDRVTITCRASQDVN…","""GACATCCAGATGACCCAGAGCCCCAGCAGC…",
3,"""antibody_monoclonal_aa_seqs.fa…","""TrastuzumabTargetAnti-HER2_Hea…","""EVQLVESGGGLVQPGGSLRLSCAASGFNIK…","""GAGGTGCAGCTGGTGGAGAGCGGCGGCGGC…",
4,"""antibody_monoclonal_aa_seqs.fa…","""Bevacizumab_light_chain""","""DIQMTQSPSSLSASVGDRVTITCSASQDIS…","""GACATCCAGATGACCCAGAGCCCCAGCAGC…",
…,…,…,…,…,…
47568,"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000173846|ENSG0000017384…","""ATGGAGCCTGCCGCCGGTTTCCTGTCTCCG…","""ATGGAGCCTGCCGCCGGTTTCCTGTCTCCG…",
47569,"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000142166|ENSG0000014216…","""ATGGATAATTGGATAAAATTGTCTGGGTGT…","""ATGGATAATTGGATAAAATTGTCTGGGTGT…",
47570,"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000180488|ENSG0000018048…","""ATGTCAGACTGCTGCTCAGCGCCAGGCATC…","""ATGTCAGACTGCTGCTCAGCGCCAGGCATC…",
47571,"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000145216|ENSG0000014521…","""ATGTCGGCCGGCGAGGTCGAGCGCCTAGTG…","""ATGTCGGCCGGCGAGGTCGAGCGCCTAGTG…",
