In [13]:
import polars as pl

In [14]:
# Load the sequence table. Later cells read its "index", "input_sequence"
# and "optimized_sequence" columns.
df = pl.read_csv("optimized-sequences.csv")
# Bare trailing expression so the notebook renders a preview of the frame.
df

index,source,name,raw_input_sequence,input_sequence,error,optimized_sequence,optimization_error
i64,str,str,str,str,str,str,str
4721,"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000115947|ENSG0000011594…","""ATGAGCAGTCGTAAATCAAAGAGTAACAGC…","""ATGAGCAGTCGTAAATCAAAGAGTAACAGC…",,"""ATGAGCAGCAGAAAGAGCAAGAGCAACAGC…",
39774,"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000160953|ENSG0000016095…","""ATGGCGGATGCCAAGTATGTCCTCTGCCGA…","""ATGGCGGATGCCAAGTATGTCCTCTGCCGA…",,"""ATGGCCGACGCCAAGTACGTGCTGTGCAGA…",
27306,"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000112787|ENSG0000011278…","""ATGGAGGCCAAGGTCCGCCCGAGCCGGCGC…","""ATGGAGGCCAAGGTCCGCCCGAGCCGGCGC…",,"""ATGGAGGCCAAGGTGAGACCCAGCAGAAGA…",
27148,"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000198885|ENSG0000019888…","""ATGAATGTTGATGCAGAGGCCTCCATGGCT…","""ATGAATGTTGATGCAGAGGCCTCCATGGCT…",,"""ATGAACGTGGACGCCGAGGCCAGCATGGCC…",
20254,"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000204410|ENSG0000020441…","""ATGGCCTCCTTAGGAGCGAACCCAAGGAGG…","""ATGGCCTCCTTAGGAGCGAACCCAAGGAGG…",,"""ATGGCCAGCCTCGGAGCCAACCCTAGAAGA…",
…,…,…,…,…,…,…,…
50708,"""iedb_antigen_aa_seqs.fa""","""sp|Q8IX19|MCEM1_HUMAN Mast cel…","""MEVEEIYKHQEVKMQAPAFRDKKQGVSAKN…","""ATGGAGGTGGAGGAGATCTACAAGCACCAG…",,"""ATGGAGGTGGAGGAGATCTACAAGCACCAG…",
50709,"""iedb_antigen_aa_seqs.fa""","""sp|Q9D287|SPF27_MOUSE Pre-mRNA…","""MAGTGLVAGEVVVDALPYFDQGYEAPGVRE…","""ATGGCCGGCACCGGCCTGGTGGCCGGCGAG…",,"""ATGGCCGGCACAGGCCTGGTGGCTGGCGAG…",
50710,"""iedb_antigen_aa_seqs.fa""","""tr|Q4CN05|Q4CN05_TRYCC Trans-s…","""MSRHLFYSAVLLLLVVMCCGTAAVNAEELS…","""ATGAGCAGACACCTGTTCTACAGCGCCGTG…",,"""ATGAGCAGACACCTGTTCTACAGCGCCGTG…",
50711,"""iedb_antigen_aa_seqs.fa""","""tr|Q5NG75|Q5NG75_FRATT Amino-a…","""MDNNQDKLKRDILSRHIVMISLGGTISASF…","""ATGGACAACAACCAGGACAAGCTGAAGAGA…",,"""ATGGACAACAACCAGGACAAGCTGAAGAGA…",


In [21]:
# Codons that terminate translation; scoring stops at the first one found.
_STOP_CODONS = ["TAG", "TAA", "TGA"]

def compute_mean_csc(seq: str, ref_csc: dict) -> "float | None":
    """Return the mean per-codon CSC score of a nucleotide sequence.

    CSC presumably stands for "codon stability coefficient"; the function
    itself only needs ``ref_csc`` to map codon strings to numbers.

    The sequence is read in-frame from position 0, three characters at a
    time. Scoring stops at the first stop codon (see ``_STOP_CODONS``); the
    stop codon and anything after it are excluded from both the sum and the
    codon count, so the mean is always taken over exactly the codons that
    were scored.

    Args:
        seq: Nucleotide sequence whose length is a multiple of 3. Codons are
            looked up verbatim, so the case must match the keys of
            ``ref_csc``.
        ref_csc: Mapping from codon string to its CSC value.

    Returns:
        The arithmetic mean of the CSC values of the scored codons, or
        ``None`` if ``seq`` is empty/``None`` or contains no codon before the
        first stop codon.

    Raises:
        ValueError: If the sequence length is not a multiple of 3.
        KeyError: If a non-stop codon is missing from ``ref_csc``.
    """
    if not seq:
        return None
    n = len(seq)
    if n % 3 != 0:
        raise ValueError(f"Sequence length {n} is not divisible by 3!")
    csc_total = 0
    scored_codons = 0
    for i in range(0, n, 3):
        codon = seq[i:i+3]
        # Stop scoring at the first stop codon. Counting only the codons
        # actually summed keeps the denominator right even when the stop
        # codon is not the final codon of the sequence.
        if codon in _STOP_CODONS:
            break
        csc_total += ref_csc[codon]
        scored_codons += 1
    if scored_codons == 0:
        # The sequence starts with a stop codon: there is nothing to average.
        return None
    return csc_total / scored_codons


# Build the codon -> CSC lookup used by compute_mean_csc.
infile = "elife-45396-fig1-data2.csv"
_ref_csc = pl.read_csv(infile)
# Of the columns in the reference table, 293T_ORFome is likely the relevant one.
# `outfile` is not used in this cell; it names the target of a possible export
# of the (codon, 293T_ORFome) columns.
outfile = "csc_reference_wu_et_al.csv"
# Each row of the two-column selection is a (codon, value) tuple, which dict()
# turns directly into the lookup table.
ref_csc = dict(_ref_csc.select(["codon", "293T_ORFome"]).rows())

In [23]:
# Score the input and the optimized sequence of every row against the
# reference table, keeping the row's "index" so results can be joined back.
csc = [
    dict(
        index=row_index,
        input_csc=compute_mean_csc(input_seq, ref_csc),
        optimized_csc=compute_mean_csc(optimized_seq, ref_csc),
    )
    for row_index, input_seq, optimized_seq in df.select(
        ["index", "input_sequence", "optimized_sequence"]
    ).rows()
]
output = pl.DataFrame(csc)
# Persist the scores, then show them as the cell result.
output.write_csv("output-csc.csv")
output

index,input_csc,optimized_csc
i64,f64,f64
4721,-0.023769,0.033117
39774,0.006562,0.023682
27306,0.026442,0.014703
27148,0.01258,0.034312
20254,0.007905,0.0336
…,…,…
50708,0.032597,0.031179
50709,0.031202,0.027479
50710,0.038305,0.034318
50711,0.050564,0.047255
