In [4]:
from typing import Optional

import polars as pl

In [5]:
# Read the sequence table from CSV into a polars DataFrame.
df = pl.read_csv("optimized-sequences.csv")
# Bare last expression: the notebook renders the frame as the cell output.
df

index,source,name,raw_input_sequence,input_sequence,error,optimized_sequence,optimization_error
i64,str,str,str,str,str,str,str
0,"""antibody_monoclonal_aa_seqs.fa…","""RituximabTargetAnti-CD20v_heav…","""QVQLQQPGAELVKPGASVKMSCKASGYTFT…","""CAGGTGCAGCTGCAGCAGCCCGGCGCCGAG…",,"""CAGGTGCAGCTCCAGCAGCCAGGCGCCGAA…",
1,"""antibody_monoclonal_aa_seqs.fa…","""RituximabTargetAnti-CD20v_ligh…","""QIVLSQSPAILSASPGEKVTMTCRASSSVS…","""CAGATCGTGCTGAGCCAGAGCCCCGCCATC…",,"""CAGATCGTGCTGAGCCAGAGCCCCGCCATC…",
2,"""antibody_monoclonal_aa_seqs.fa…","""TrastuzumabTargetAnti-HER2_Lig…","""DIQMTQSPSSLSASVGDRVTITCRASQDVN…","""GACATCCAGATGACCCAGAGCCCCAGCAGC…",,"""GACATCCAGATGACCCAGAGCCCCAGCTCC…",
3,"""antibody_monoclonal_aa_seqs.fa…","""TrastuzumabTargetAnti-HER2_Hea…","""EVQLVESGGGLVQPGGSLRLSCAASGFNIK…","""GAGGTGCAGCTGGTGGAGAGCGGCGGCGGC…",,"""GAGGTGCAGCTGGTGGAGAGCGGCGGCGGC…",
4,"""antibody_monoclonal_aa_seqs.fa…","""Bevacizumab_light_chain""","""DIQMTQSPSSLSASVGDRVTITCSASQDIS…","""GACATCCAGATGACCCAGAGCCCCAGCAGC…",,"""GACATCCAGATGACCCAGAGCCCCAGCTCC…",
…,…,…,…,…,…,…,…
47568,"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000173846|ENSG0000017384…","""ATGGAGCCTGCCGCCGGTTTCCTGTCTCCG…","""ATGGAGCCTGCCGCCGGTTTCCTGTCTCCG…",,"""ATGGAGCCTGCCGCTGGCTTTCTGTCTCCT…",
47569,"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000142166|ENSG0000014216…","""ATGGATAATTGGATAAAATTGTCTGGGTGT…","""ATGGATAATTGGATAAAATTGTCTGGGTGT…",,"""ATGGACAACTGGATCAAGCTGAGCGGCTGC…",
47570,"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000180488|ENSG0000018048…","""ATGTCAGACTGCTGCTCAGCGCCAGGCATC…","""ATGTCAGACTGCTGCTCAGCGCCAGGCATC…",,"""ATGAGCGACTGCTGCAGCGCCCCCGGCATC…",
47571,"""ensembl_ccds_tx_nt_seq.fa""","""ENSG00000145216|ENSG0000014521…","""ATGTCGGCCGGCGAGGTCGAGCGCCTAGTG…","""ATGTCGGCCGGCGAGGTCGAGCGCCTAGTG…",,"""ATGAGCGCCGGCGAGGTCGAGAGACTGGTG…",


In [6]:
# Codons that terminate scoring; they are never looked up in the reference.
_STOP_CODONS = ["TAG", "TAA", "TGA"]


def compute_mean_csc(seq: str, ref_csc: dict) -> Optional[float]:
    """Return the mean codon stability coefficient (CSC) of a sequence.

    The sequence is read in frame, three characters at a time, and each
    codon's score is looked up in ``ref_csc``. Scoring stops at the first
    stop codon (see ``_STOP_CODONS``), which is itself not scored. The mean
    is taken over the codons that were actually scored, so a stop codon that
    appears before the end of the sequence does not dilute the result with
    unscored downstream codons.

    Args:
        seq: Nucleotide sequence whose length is a multiple of 3. Codons are
            matched against ``ref_csc`` keys exactly (no case normalization).
        ref_csc: Mapping from codon string to its CSC value.

    Returns:
        The mean CSC over the scored codons, or ``None`` when ``seq`` is
        empty/``None`` or when no codon precedes the first stop codon.

    Raises:
        ValueError: If ``len(seq)`` is not divisible by 3.
        KeyError: If a non-stop codon is missing from ``ref_csc``.
    """
    if not seq:
        return None
    n = len(seq)
    if n % 3 != 0:
        raise ValueError(f"Sequence length {n} is not divisible by 3!")
    csc_total = 0
    scored_codons = 0
    for i in range(0, n, 3):
        codon = seq[i:i + 3]
        # Stop scoring at the first stop codon; it contributes neither to
        # the total nor to the denominator.
        if codon in _STOP_CODONS:
            break
        csc_total += ref_csc[codon]
        scored_codons += 1
    # A sequence that begins with a stop codon has nothing to average.
    if scored_codons == 0:
        return None
    return csc_total / scored_codons


# Load the per-codon CSC reference table and turn it into a lookup dict.
infile = "elife-45396-fig1-data2.csv"
_ref_csc = pl.read_csv(infile)
# The 293T_ORFome column is likely the relevant reference.
# Disabled export of the two-column reference (kept from the original cell):
# outfile = "csc_reference_wu_et_al.csv"
# _ref_csc[['codon', '293T_ORFome']].to_csv(open(outfile, 'w'), sep=";", header=False, index=False)
# Pair each codon with its 293T_ORFome value, in table order.
ref_csc = dict(
    zip(
        _ref_csc["codon"].to_list(),
        _ref_csc["293T_ORFome"].to_list(),
    )
)

In [7]:
# Score the input and optimized sequence of every row, keyed by the row's
# "index" value. A generator is used so no loop variable leaks into the
# notebook namespace.
csc = list(
    {
        "index": record["index"],
        "input_csc": compute_mean_csc(record["input_sequence"], ref_csc),
        "optimized_csc": compute_mean_csc(record["optimized_sequence"], ref_csc),
    }
    for record in df.rows(named=True)
)

# Collect the per-row scores into a frame, save it as CSV, then display it.
output = pl.DataFrame(csc)
output.write_csv("output-csc.csv")
output

index,input_csc,optimized_csc
i64,f64,f64
0,0.037988,0.033325
1,0.036312,0.035658
2,0.0358,0.034948
3,0.037603,0.034451
4,0.036155,0.035877
…,…,…
47568,0.016918,0.028832
47569,-0.026666,0.034426
47570,-0.018039,0.032813
47571,-0.023399,0.019154
