In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
# Sample labels: each group prefix (BG, BR, WH) combined with each replicate
# suffix (a, b), in group-major order.
SPs = [prefix + suffix for prefix in ('BG', 'BR', 'WH') for suffix in 'ab']

In [3]:
# Combine the per-sample codon-count tables into one wide table with a
# `<sample>_n` column per sample, keyed on (transcript_id, reltostart-asite).
codon_count_dir = '/Data_2/Daehwa/Adipocyte/Other_papers/Martinez_etal.2023.Cell_Metabolism/Analysis/Ribosome_stalling/v20230827'
codon_key_cols = ['transcript_id','reltostart-asite']

# `None` marks "nothing merged yet", so the loop does not depend on which
# sample name happens to come first in SPs.
total_codon_count = None
for SP in SPs:
    codon_count = pd.read_csv(f'{codon_count_dir}/Martinez_{SP}.codon-count.tsv', sep='\t', usecols=codon_key_cols+['count'])
    codon_count = codon_count.rename(columns={'count':f'{SP}_n'})

    if total_codon_count is None:
        total_codon_count = codon_count
    else:
        # Outer merge keeps positions seen in any sample; samples without a
        # row at that position get NaN here and 0 after the fill below.
        total_codon_count = pd.merge(total_codon_count, codon_count, on=codon_key_cols, how='outer')

if total_codon_count is None:
    raise ValueError('SPs is empty; no codon-count tables were loaded')

total_codon_count = total_codon_count.sort_values(codon_key_cols).reset_index(drop=True).fillna(0)
display(total_codon_count)

Unnamed: 0,reltostart-asite,transcript_id,BGa_n,BGb_n,BRa_n,BRb_n,WHa_n,WHb_n
0,33,ENSMUST00000000001.5,0.0,0.0,1.0,0.0,0.0,0.0
1,39,ENSMUST00000000001.5,0.0,0.0,3.0,0.0,0.0,0.0
2,42,ENSMUST00000000001.5,0.0,0.0,0.0,0.0,0.0,1.0
3,45,ENSMUST00000000001.5,0.0,0.0,1.0,0.0,0.0,0.0
4,48,ENSMUST00000000001.5,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...
2960511,1509,ENSMUST00000239563.2,0.0,0.0,0.0,0.0,0.0,1.0
2960512,1512,ENSMUST00000239563.2,0.0,0.0,2.0,0.0,0.0,1.0
2960513,1521,ENSMUST00000239563.2,0.0,0.0,10.0,4.0,7.0,1.0
2960514,1524,ENSMUST00000239563.2,0.0,0.0,1.0,2.0,1.0,1.0


In [4]:
# Select transcripts with enough coverage in every group and build a dense
# per-codon count table for them, written to 'Martinez_codon-count.tsv'.
#
# Output columns: transcript_id, reltostart-asite, codon_num (number of rows
# in the transcript's dense table), `<sample>_n` (count at that position) and
# `<sample>_N` (sum of `<sample>_n` over the transcript's dense table).
min_avg_count_per_codon = 0.5    # replicate-averaged count per position, per group
min_avg_count_per_transcript = 100  # replicate-averaged total count, per group

slct_columns = ['transcript_id','reltostart-asite','codon_num']+[SP+'_n' for SP in SPs]+[SP+'_N' for SP in SPs]

# Collect the kept per-transcript tables and concatenate once at the end;
# concatenating inside the loop re-copies the accumulated frame every time.
kept_tables = []
for T_ID, table in total_codon_count.groupby('transcript_id'):
    # Fill the gaps between the first and last observed position with
    # zero-count rows on a 3-nt grid.
    # NOTE(review): positions that are not on the grid starting at the
    # transcript's minimum position are dropped by this reindex - confirm all
    # positions within a transcript share the same frame.
    table = table.set_index('reltostart-asite')
    table = table.reindex(range(table.index.min(), table.index.max()+1, 3), fill_value=0).reset_index()
    # fill_value=0 also overwrote transcript_id on the inserted rows.
    table['transcript_id'] = T_ID

    # A transcript is kept only if every group passes both thresholds.
    passes_filter = True
    for day in ['BG','BR','WH']:
        avg_sum = table[[f'{day}{rep}_n' for rep in ['a','b']]].sum().sum()/2
        if avg_sum / len(table) < min_avg_count_per_codon or avg_sum < min_avg_count_per_transcript:
            passes_filter = False
            break
    if not passes_filter:
        continue

    for SP in SPs:
        table[f'{SP}_N'] = table[f'{SP}_n'].sum()
    table['codon_num'] = len(table)

    kept_tables.append(table)

if kept_tables:
    # ignore_index gives the combined table a unique row index instead of a
    # repeated 0..len-1 range per transcript.
    slct_codon_table = pd.concat(kept_tables, ignore_index=True)[slct_columns]
else:
    # No transcript passed the filter: keep an empty table with the same columns.
    slct_codon_table = pd.DataFrame(columns=slct_columns)

slct_codon_table.to_csv('Martinez_codon-count.tsv', sep='\t', index=False)
display(slct_codon_table)

Unnamed: 0,transcript_id,reltostart-asite,codon_num,BGa_n,BGb_n,BRa_n,BRb_n,WHa_n,WHb_n,BGa_N,BGb_N,BRa_N,BRb_N,WHa_N,WHb_N
0,ENSMUST00000000001.5,33,334,0.0,0.0,1.0,0.0,0.0,0.0,214.0,816.0,2298.0,421.0,2892.0,613.0
1,ENSMUST00000000001.5,36,334,0.0,0.0,0.0,0.0,0.0,0.0,214.0,816.0,2298.0,421.0,2892.0,613.0
2,ENSMUST00000000001.5,39,334,0.0,0.0,3.0,0.0,0.0,0.0,214.0,816.0,2298.0,421.0,2892.0,613.0
3,ENSMUST00000000001.5,42,334,0.0,0.0,0.0,0.0,0.0,1.0,214.0,816.0,2298.0,421.0,2892.0,613.0
4,ENSMUST00000000001.5,45,334,0.0,0.0,1.0,0.0,0.0,0.0,214.0,816.0,2298.0,421.0,2892.0,613.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3476,ENSMUST00000239525.1,10461,3481,0.0,2.0,0.0,0.0,0.0,0.0,672.0,3131.0,4406.0,0.0,4789.0,527.0
3477,ENSMUST00000239525.1,10464,3481,0.0,0.0,1.0,0.0,3.0,0.0,672.0,3131.0,4406.0,0.0,4789.0,527.0
3478,ENSMUST00000239525.1,10467,3481,1.0,3.0,0.0,0.0,0.0,1.0,672.0,3131.0,4406.0,0.0,4789.0,527.0
3479,ENSMUST00000239525.1,10470,3481,0.0,1.0,0.0,0.0,0.0,0.0,672.0,3131.0,4406.0,0.0,4789.0,527.0


# Stalling score

In [5]:
# Compute a per-position score for every sample and attach codon / amino-acid
# annotation, then save the result as a pickle and a TSV.
#
# Optional alternative input (left disabled, as in the original cell):
# slct_codon_table = pd.read_csv('/Data_2/Daehwa/Adipocyte/Analysis/Ribosome_stalling/v20230730/adi_codon-count.tsv', sep='\t')

stall_samples = [f'{day}{rep}' for day in ['BG','BR','WH'] for rep in ['a','b']]
codon_annot_cols = ['transcript_id','asite','reltostart-asite','codon-asite','aa-asite']

# score = count at the position / (transcript total / number of positions),
# i.e. the count relative to the transcript's mean count per position.
stall_score = slct_codon_table[['transcript_id','reltostart-asite']].copy()
for SP in stall_samples:
    # A zero transcript total becomes NaN so the division yields NaN rather
    # than inf; those scores are set to 0 below.
    mean_count_per_codon = slct_codon_table[SP+'_N'].replace(0, np.nan) / slct_codon_table['codon_num']
    stall_score[SP] = slct_codon_table[SP+'_n'] / mean_count_per_codon
stall_score = stall_score.fillna(0)

# Union of the annotation rows found in any sample. Stacking the files once
# and dropping exact duplicates avoids repeated outer merges on every column,
# which multiply rows whenever a key repeats within a file. Rows stay in
# first-appearance order (first sample's file order, then new rows from later
# samples).
codon_frames = []
for SP in stall_samples:
    codon_data = pd.read_csv(f'/Data_2/Daehwa/Adipocyte/Other_papers/Martinez_etal.2023.Cell_Metabolism/Alignment/rpf/RPF/Martinez_{SP}.rep.codons.data.txt', sep='\t')
    codon_frames.append(codon_data[codon_annot_cols])
codon_table = pd.concat(codon_frames, ignore_index=True).drop_duplicates().reset_index(drop=True)

# Inner merge: only positions that have an annotation row are kept.
stall_score = pd.merge(codon_table, stall_score, on=['transcript_id','reltostart-asite'])

with open("Martinez_stall-score.df.pickle","wb") as fw:
    pickle.dump(stall_score, fw)
stall_score.to_csv('Martinez_stall-score.tsv', sep='\t', index=False)
display(stall_score)

Unnamed: 0,transcript_id,asite,reltostart-asite,codon-asite,aa-asite,BGa,BGb,BRa,BRb,WHa,WHb
0,ENSMUST00000000001.5,192,51,AUG,M,3.121495,0.000000,0.145344,0.000000,0.000000,0.000000
1,ENSMUST00000000001.5,198,57,GAC,D,3.121495,0.409314,0.000000,0.000000,0.000000,0.544861
2,ENSMUST00000000001.5,201,60,CGC,R,1.560748,2.455882,0.145344,0.000000,0.000000,1.634584
3,ENSMUST00000000001.5,207,66,UUG,L,9.364486,7.367647,5.087032,2.380048,7.275934,4.358891
4,ENSMUST00000000001.5,210,69,CGG,R,4.682243,1.227941,0.581375,0.793349,0.000000,1.634584
...,...,...,...,...,...,...,...,...,...,...,...
891064,ENSMUST00000239525.1,9683,9552,GCA,A,0.000000,0.000000,0.000000,0.000000,0.000000,6.605313
891065,ENSMUST00000239525.1,9779,9648,GAC,D,0.000000,0.000000,0.000000,0.000000,0.000000,6.605313
891066,ENSMUST00000239525.1,9836,9705,GCU,A,0.000000,0.000000,0.000000,0.000000,0.000000,6.605313
891067,ENSMUST00000239525.1,10160,10029,GAU,D,0.000000,0.000000,0.000000,0.000000,0.000000,6.605313
