In [1]:
import ribopy
from ribopy import Ribo
from functions import get_sequence, get_cds_range_lookup
import numpy as np
import pandas as pd
import pickle
from collections import defaultdict

In [2]:
### Initialize variables
# Configuration for one sample: which ribo file to open, which coverage pickle
# to read, and where the reference annotation / sequence files live.

## Adjust
target_name = 'WT_8-cell_3'   # sample label; forms the name of the coverage pickle
ribo_file = 'Rep_8'           # base name (without extension) of the .ribo file
exp = "WT_8cell_B8_1_ITP"     # not referenced by the cells visible here; kept for later use
alias = False                 # not referenced by the cells visible here; kept for later use

## Files
# Single root for every input/output path below, so moving the analysis to
# another machine or directory only requires editing this one line.
analysis_dir = '/home/reiko/ribopy_analysis/files_analysis_c_elegans'
reference_dir = f'{analysis_dir}/celegans_reference'

ribo_path = f'{analysis_dir}/ribo_files/{ribo_file}.ribo'
pickle_file_path = f'{analysis_dir}/results/{target_name}_coverage.pkl'
bed_file = f'{reference_dir}/appris_celegans_v1_actual_regions_new.bed'
reference_path = f'{reference_dir}/appris_celegans_v1_selected_new.fa'

# Objects used by the analysis cells: the opened ribo file, a per-transcript
# sequence lookup and a per-transcript (start, stop) CDS lookup.
# NOTE(review): the exact return types come from the local `functions` module,
# which is not visible here; later cells index both lookups by transcript name.
ribo_object = Ribo(ribo_path)
sequence = get_sequence(ribo_object, reference_path)
cds_range = get_cds_range_lookup(bed_file)

In [3]:
# Load the per-transcript coverage dictionary that was pickled for this sample.
# NOTE: pickle.load can execute arbitrary code, so only point pickle_file_path
# at files produced by this analysis pipeline.
with open(pickle_file_path, mode='rb') as pickle_handle:
    coverage_dict = pickle.load(pickle_handle)

Find codon occupancy

In [4]:
# Summed coverage per codon, accumulated over every transcript that has coverage.
codon_occ = defaultdict(int)

for transcript, coverage in coverage_dict.items():
    # Transcripts stored with a None coverage entry contribute nothing.
    if coverage is None:
        continue

    start, stop = cds_range[transcript]
    cds_sequence = sequence[transcript][start: stop]

    # Walk complete codons only. If the CDS length is not a multiple of 3, the
    # trailing 1-2 nucleotides are ignored instead of being counted as a
    # spurious short "codon" key.
    last_codon_end = len(cds_sequence) - len(cds_sequence) % 3
    for i in range(0, last_codon_end, 3):
        codon = cds_sequence[i: i + 3]
        # coverage is sliced with the same offsets as cds_sequence.
        # NOTE(review): this assumes coverage[0] corresponds to the first CDS
        # nucleotide (i.e. the pickle holds CDS-only coverage) - confirm.
        count = sum(coverage[i: i + 3])
        codon_occ[codon] += count

# Sort codon_occ by keys alphabetically
sorted_codon_occ = {k: codon_occ[k] for k in sorted(codon_occ)}

# Convert to a two-column DataFrame: one row per codon with its summed coverage
df_codon_occ = pd.DataFrame(list(sorted_codon_occ.items()), columns=['Codon', 'Count'])

In [None]:
# Optional export, currently disabled: uncomment the line below to write the
# per-sample codon occupancy table (df_codon_occ) to a .csv file named after target_name.
# df_codon_occ.to_csv(f'/home/reiko/ribopy_analysis/files_analysis_c_elegans/results/{target_name}_codon_occ.csv')

Normalize codon occupancy

In [None]:
# Load the codon occupancy table and index it by codon.
# The 'Unnamed: 0' column is presumably the row index written by an earlier
# to_csv call; it carries no data and is discarded.
# NOTE(review): codon_occupancy.csv is not written by any cell in this notebook;
# presumably it is assembled from the per-sample tables elsewhere - confirm.
df = (
    pd.read_csv(f'/home/reiko/ribopy_analysis/files_analysis_c_elegans/results/codon_occupancy.csv')
    .drop(columns=['Unnamed: 0'])
    .set_index('Codon')
)

In [None]:
# Scale every column so that it sums to 1: each value becomes that codon's
# share of the column total.
column_sums = df.sum(axis=0)
normalized_df = df / column_sums
# Optional export, currently disabled:
# normalized_df.to_csv('/home/reiko/ribopy_analysis/files_analysis_c_elegans/results/codon_occupancy_norm.csv')

Find codon distribution

In [None]:
# Number of times each codon occurs in the CDS of every transcript that has
# coverage (sequence composition only; coverage values are not used here).
transcriptome_codon_dist = defaultdict(int)

for transcript, coverage in coverage_dict.items():
    # Restrict the tally to the transcripts that have a coverage entry.
    if coverage is None:
        continue

    start, stop = cds_range[transcript]
    cds_sequence = sequence[transcript][start: stop]

    # Walk complete codons only. If the CDS length is not a multiple of 3, the
    # trailing 1-2 nucleotides are ignored instead of being counted as a
    # spurious short "codon" key.
    last_codon_end = len(cds_sequence) - len(cds_sequence) % 3
    for i in range(0, last_codon_end, 3):
        codon = cds_sequence[i: i + 3]
        transcriptome_codon_dist[codon] += 1

# Alphabetical codon order, then a two-column DataFrame (codon, occurrence count)
sorted_transcriptome_codon_dist = {k: transcriptome_codon_dist[k] for k in sorted(transcriptome_codon_dist)}
df_codon_dist = pd.DataFrame(list(sorted_transcriptome_codon_dist.items()), columns=['Codon', 'Count'])

In [None]:
# Write the transcriptome-wide codon distribution table to disk as .csv.
# The default to_csv settings are used, so the row index is written as well.
codon_dist_csv_path = '/home/reiko/ribopy_analysis/files_analysis_c_elegans/results/transcriptome_codon_distribution.csv'
df_codon_dist.to_csv(codon_dist_csv_path)