# convert cleavage scores

Converts a TSV with one position per row into a TSV with one row per isotype (one column per position).

In [1]:
import numpy as np

import polars as pl
from polars import col

In [2]:
# Path of the input file; it is read as tab-separated text in the next cell.
input_fn = "cleavage_scores_bo_seq.tsv"

In [3]:
# Load only three columns of the tab-separated input, selected by position
# (0, 1 and 6).
# NOTE(review): later cells access this frame by the names "isotype", "start"
# and "cleavage_score", so these indices presumably correspond to those
# headers — confirm against the input file if its column layout changes.
df = pl.read_csv(input_fn, columns=[0,1,6], separator="\t")

In [4]:
# Distinct values of the "isotype" column, in sorted order.
unique_isotype_rows = df.select("isotype").unique().sort("isotype").to_dicts()
isotypes = [row["isotype"] for row in unique_isotype_rows]

print(isotypes)

['tRNA-Ala-AGC', 'tRNA-Ala-CGC', 'tRNA-Ala-TGC', 'tRNA-Arg-CCG', 'tRNA-Arg-CCT', 'tRNA-Arg-TCG', 'tRNA-Arg-TCT', 'tRNA-Asn-GTT', 'tRNA-Asp-GTC', 'tRNA-Cys-GCA', 'tRNA-Gln-CTG', 'tRNA-Glu-CTC', 'tRNA-Glu-TTC', 'tRNA-Gly-CCC', 'tRNA-Gly-GCC', 'tRNA-Gly-TCC', 'tRNA-His-GTG', 'tRNA-Ile-AAT', 'tRNA-Ile-TAT', 'tRNA-Leu-AAG', 'tRNA-Leu-CAA', 'tRNA-Leu-CAG', 'tRNA-Leu-TAG', 'tRNA-Lys-CTT', 'tRNA-Lys-TTT', 'tRNA-Met-CAT', 'tRNA-Phe-GAA', 'tRNA-Pro-TGG', 'tRNA-Ser-GCT', 'tRNA-Thr-AGT', 'tRNA-Thr-TGT', 'tRNA-Trp-CCA', 'tRNA-Tyr-GTA', 'tRNA-Val-CAC', 'tRNA-Val-TAC', 'tRNA-iMet-CAT']


In [5]:


# Reshape the long table (one row per isotype/position) into a wide table
# (one row per isotype, one column per position) and write it out as TSV.

# Positions are 1-based. Every isotype gets a score slot for positions
# 1..n_positions, but only first_kept_position..n_positions are written out.
n_positions = 60
first_kept_position = 10

# Positions that never appear in the input keep a score of 0.0.
cleavages_dict = {
    isotype: np.zeros(n_positions, dtype=np.float64) for isotype in isotypes
}

for item in df.to_dicts():
    start = item["start"]
    # Guard the 1-based -> 0-based conversion: without this check a start of 0
    # or below would wrap around as a negative index and silently overwrite
    # the score of a position near the end of the array.
    if not 1 <= start <= n_positions:
        raise ValueError(
            f"start={start!r} for isotype {item['isotype']!r} is outside "
            f"the supported range 1..{n_positions}"
        )
    cleavages_dict[item["isotype"]][start - 1] = item["cleavage_score"]

# Keep the per-isotype scores as plain Python lists, keyed by isotype.
cleavages_dict = {
    isotype: scores.tolist() for isotype, scores in cleavages_dict.items()
}

# Build the wide frame directly with its final column names: the isotype label
# first, then one "base_<position>" column per kept position. Rows follow the
# order of `isotypes`.
new_df = pl.DataFrame(
    {
        "trna_isotype": list(cleavages_dict),
        **{
            f"base_{position}": [
                scores[position - 1] for scores in cleavages_dict.values()
            ]
            for position in range(first_kept_position, n_positions + 1)
        },
    }
)
new_df.write_csv("cleavage_scores_10_60.tsv", separator="\t")