# Basecalled modifications analysis using modkit for tRNAs

This notebook analyses the modification predictions made during Dorado basecalling. It uses the `pileup.bed` table produced by `modkit pileup` as input.
Content of this notebook:
1. Import of required python packages
2. Data loading
3. Filtering of modifications
4. Visualisation

## 1. Import of required python packages

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib_venn import venn3
from matplotlib import ticker
import seaborn as sns
import numpy as np

## 2. Data loading

### Constants

Define the reference sequences

In [None]:
# Reference sequences, one string per tRNA reference, written in the DNA alphabet (T instead of U).
# Presumably one of these is assigned to `sequence` further down, where it is indexed by the
# pileup start position to look up the reference base.
seq_Glu_CTC_1_1 = "TCCCTGGTGGTCTAGTGGTTAGGATTCGGCGCTCTCACCGCCGCGGCCCGGGTTCGATTCCCGGTCAGGGAA"
seq_Gly_GCC_2_1 = "GCATTGGTGGTTCAGTGGTAGAATTCTCGCCTGCCACGCGGGAGGCCCGGGTTCGATTCCCGGCCAATGCA"
# NOTE(review): the "_WI" variant is seq_Leu_CAA_1_1 with an internal stretch removed
# (presumably "without intron") - confirm the meaning of the suffix.
seq_Leu_CAA_1_1_WI = "GTCAGGATGGCCGAGTGGTCTAAGGCGCCAGACTCAAGTTCTGGTCTCCAATGGAGGCGTGGGTTCGAATCCCACTTCTGACA"
seq_Leu_CAA_1_1 = "GTCAGGATGGCCGAGTGGTCTAAGGCGCCAGACTCAAGCTAAGCTTCCTCCGCGGTGGGGATTCTGGTCTCCAATGGAGGCGTGGGTTCGAATCCCACTTCTGACA"
seq_Leu_CAA_1_2 = "GTCAGGATGGCCGAGTGGTCTAAGGCGCCAGACTCAAGCTTGGCTTCCTCGTGTTGAGGATTCTGGTCTCCAATGGAGGCGTGGGTTCGAATCCCACTTCTGACA"

Define the reference contigs of interest

In [None]:
# Contig names of the references of interest.
# Presumably these must match the values in the `chrom` column of the pileup table exactly,
# since the subset is selected by string equality - confirm against the alignment reference.
ref_reg_Leu = 'Homo_sapiens_tRNA-Leu-CAA-1-1-WI'
ref_reg_Glu = 'Homo_sapiens_tRNA-Glu-CTC-1-1'
ref_reg_Gly = 'Homo_sapiens_tRNA-Gly-GCC-2-1'

Make a dictionary with the modification code as key and the target base and modification name as value

In [None]:
# Lookup table: modification code (as a string) -> [target base, modification name]
modification_code = {
    '17802': ['T', 'pseU'],
    '19227': ['T', '2OmeU'],
    '19229': ['G', '2OmeG'],
    'm': ['C', 'm5C'],
    '19228': ['C', '2OmeC'],
    'a': ['A', 'm6A'],
    '17596': ['A', 'inosine'],
    '69426': ['A', '2OmeA'],
}

# Print one line per entry as a quick sanity check of the table
for code, (target_base, mod_name) in modification_code.items():
    print(f"The modification code {code} modifies base {target_base}: {mod_name}")

### Load the pileup file created by modkit

In [None]:
# Read the headerless, tab-separated pileup table.
# Column 3 (the modified base code) is forced to string: the codes are a mix of letters
# ('m', 'a') and numeric identifiers ('17802', ...). If a file happens to contain only
# numeric codes pandas would otherwise infer int64, and the string-keyed lookups in
# `modification_code` would then fail with a KeyError.
bed_df = pd.read_csv(
    "path/to/data/modkit_pileup.bed",
    sep="\t",
    header=None,
    dtype={3: str},
)

# Name the columns; assigning the list raises a ValueError if the file does not have
# exactly this number of columns, which catches an unexpected file layout early.
bed_df.columns = ['chrom', 'start_position', 'end_position',
                  'modified_base_code', 'score', 'strand',
                  'start_position_2', 'end_position_2', 'color',
                  'Nvalid_cov', 'percent_modified', 'Nmod',
                  'Ncanonical', 'Nother_mod', 'Ndelete', 'Nfail',
                  'Ndiff', 'Nnocal']

# Drop the columns that are not used in the downstream analysis
bed_df = bed_df.drop(columns=['strand', 'end_position', 'color',
                              'start_position_2', 'end_position_2', 'score'])

bed_df

## 3. Filtering of modifications

### Percentage of canonical reads
Add a column giving the percentage of unmodified (canonical) reads relative to the valid coverage (`Ncanonical / Nvalid_cov * 100`).

In [None]:
# Share of canonical (unmodified) calls among the valid coverage, in percent.
# `assign` returns a new DataFrame, so the previously bound frame is not modified in place.
bed_df = bed_df.assign(
    percent_canonical=bed_df['Ncanonical'].div(bed_df['Nvalid_cov']).mul(100)
)
row_count, column_count = bed_df.shape
print(f"The dataframe has {row_count} rows and {column_count} columns")

### Make a subset for the reference of interest
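Select only the modification predictions that fall within the reference of interest. Set `sequence` to the reference sequence and `ref` to the matching contig name defined above.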

In [None]:
# Select the reference to analyse. `sequence` and `ref` must describe the same tRNA:
# `ref` selects the rows of the pileup table and `sequence` is indexed by their start position.
# Default: Leu-CAA-1-1-WI. Alternatives defined above:
#   seq_Glu_CTC_1_1 with ref_reg_Glu, seq_Gly_GCC_2_1 with ref_reg_Gly
sequence = seq_Leu_CAA_1_1_WI
print(len(sequence))
ref = ref_reg_Leu
print(ref)

In [None]:
# Keep only the rows whose contig name equals the selected reference
bed_df_subset = bed_df.loc[bed_df['chrom'].eq(ref)]
subset_rows, subset_columns = bed_df_subset.shape
print(f"The subset dataframe has {subset_rows} rows and {subset_columns} columns")

### Filter the positions
Keep a prediction only when fewer than 90% of the reads are canonical (i.e. unmodified), the valid coverage is more than 35 reads, and more than 5 reads carry the modification.

In [None]:
# A row is kept only if all three criteria hold:
#   fewer than 90 % canonical reads, valid coverage above 35, more than 5 modified reads
filtered_bed_df = bed_df_subset.loc[
    bed_df_subset['percent_canonical'].lt(90)
    & bed_df_subset['Nvalid_cov'].gt(35)
    & bed_df_subset['Nmod'].gt(5)
]
kept_rows, kept_columns = filtered_bed_df.shape
print(f"The filtered subset dataframe has {kept_rows} rows and {kept_columns} columns")
print(f"The number of unique positions is {filtered_bed_df['start_position'].nunique()}")

For the IVT sample, predictions are only considered when fewer than 90% of the reads are canonical and the total coverage is more than 200 reads:

In [None]:
# For IVT sample
# NOTE: this cell overwrites `filtered_bed_df` from the previous cell - run it only when
# analysing an IVT sample.
# The filter is applied to the per-reference subset rather than the full table: the cells
# below index `sequence` with the start position, which is only meaningful for rows that
# belong to the selected reference.
filtered_bed_df = bed_df_subset[(bed_df_subset['percent_canonical'] < 90)
                                & (bed_df_subset['Nvalid_cov'] > 200)]

### Making new dataframe summarizing the results

In [None]:
# Build three lists that run parallel to the rows of `filtered_bed_df`:
#   reference_base_list - base found in `sequence` at the start position
#   modified_base_list  - base the modification is expected on (from `modification_code`)
#   modification_list   - modification name (from `modification_code`)
reference_base_list = []
modified_base_list = []
modification_list = []
for idx, row in filtered_bed_df.iterrows():
    # The lookup table is keyed by strings; coerce so that codes parsed as integers still match.
    # An unknown code raises a KeyError, which signals that `modification_code` needs extending.
    mod_code = str(row['modified_base_code'])
    # Positions are used directly as indices into the reference string
    position = int(row['start_position'])
    mod_entry = modification_code[mod_code]
    reference_base = sequence[position]

    print(f"Modification {mod_code}, which correspond to {mod_entry}")
    print(f"with start position {position}")
    print(f"Within the reference sequence {reference_base}\n")

    reference_base_list.append(reference_base)
    modified_base_list.append(mod_entry[0])
    modification_list.append(mod_entry[1])

In [None]:
# Start from the columns taken over unchanged from the filtered pileup table
result_df = filtered_bed_df[['chrom', 'start_position', 'modified_base_code',
                             'percent_modified', 'percent_canonical',
                             'Nmod', 'Nvalid_cov']].copy()

# Add the per-row annotations collected above; the lists are aligned by row order.
# The modification name goes directly after the modification code, the two base columns at the end.
result_df.insert(3, 'modification', modification_list)
result_df['reference_base'] = reference_base_list
result_df['modified_base'] = modified_base_list

result_df

### The resulting list of predicted modified positions

In [None]:
# All reported start positions (may contain repeats when several modifications share a position)
list_positions = result_df['start_position'].tolist()

# De-duplicate and sort once, then report the count and the positions themselves
unique_positions = sorted(set(list_positions))
print(f"The number of positions basecalled to be modified: {len(unique_positions)}")
print(f"The positions basecalled to be modified:{unique_positions}")

### Export these results

In [None]:
# Write the summary table to a CSV file; index=False leaves the DataFrame index out of the file
result_df.to_csv("path/to/data/modkit_results.csv", index=False)

## 4. Visualisation

In [None]:
# Grouped bar plot: one group per position, one bar per modification type.
# Bar height is the percentage of modified reads; the absolute number of modified reads
# is printed in brackets above each non-zero bar.

# Prepare data
## One row per (position, modification) call. Percentages are rounded to one decimal.
df = pd.DataFrame({
    'Position': result_df['start_position'].to_numpy(),
    'Modification': result_df['modification'].to_numpy(),
    'Percentage': result_df['percent_modified'].astype(float).round(1).to_numpy(),
    'Absolute': result_df['Nmod'].to_numpy(),
})

## Position as string, kept for categorical use
df['Position_str'] = df['Position'].astype(str)


# Define plotting parameters
## Position list for X-axis
positions = sorted(df['Position'].unique())

## Modification order and color mapping.
## The names must match the names in `modification_code` exactly (note: 'inosine' is lower case there).
mod_order = ['m6A', '2OmeC', '2OmeU', 'inosine', 'm5C', 'pseU', '2OmeG', '2OmeA']
mod_colors = {
    'm6A':     '#1f77b4',  # Blue
    'pseU':    '#ff7f0e',  # Orange
    '2OmeU':   '#2ca02c',  # Green
    'm5C':     '#d62728',  # Red
    '2OmeC':   '#9467bd',  # Purple
    '2OmeG':   '#8c564b',  # Brown
    '2OmeA':   '#e377c2',  # Pink
    'inosine': '#7f7f7f',  # Gray
}

num_mods = len(mod_order)

## Bar group width and spacing
bar_width = 2.0 / num_mods  # Width allocated to each modification
spacing = 2.5               # Space between position groups
x_pos = np.arange(len(positions)) * spacing

## Emphasis options
highlight_positions = []           # Positions to bold in X-axis labels
highlight_excluded_positions = []  # Positions to shade in the background
highlight_color = 'lightgray'
show_legend = False                # Set to True to draw the legend below the axes


def custom_tick_label(pos):
    """Return the X-axis label for the group starting at x coordinate `pos`.

    `pos` is expected to be a multiple of `spacing`; any other value gives an
    empty label. The label is the position number, rendered bold when the
    position is listed in `highlight_positions`.
    """
    slot = pos / spacing
    if not float(slot).is_integer():
        return ""
    label = str(positions[int(round(slot))])
    if int(label) in highlight_positions:
        return rf"$\bf{{{label}}}$"
    return label


# Look-up tables keyed by (position, modification). `setdefault` keeps the first
# occurrence if a pair appears more than once.
percent_by_call = {}
count_by_call = {}
for call in df.itertuples(index=False):
    call_key = (call.Position, call.Modification)
    percent_by_call.setdefault(call_key, call.Percentage)
    count_by_call.setdefault(call_key, call.Absolute)


if not positions:
    # Without any position there are no axis limits or ticks to compute
    print("No modified positions passed the filters - nothing to plot.")
else:
    # Plot barplot
    fig, ax = plt.subplots(figsize=(13, 3))

    for i, mod in enumerate(mod_order):
        # Percentage and absolute count per position; 0 where this modification was not called
        mod_vals = [percent_by_call.get((pos, mod), 0) for pos in positions]
        abs_vals = [count_by_call.get((pos, mod), 0) for pos in positions]

        # Draw bars for this modification, shifted by its index within the group
        bars = ax.bar(
            x_pos + i * bar_width,
            mod_vals,
            color=mod_colors.get(mod, 'gray'),
            width=bar_width,
            label=mod,
            zorder=3
        )

        # Add absolute value labels above bars
        for bar, abs_val in zip(bars, abs_vals):
            height = bar.get_height()
            if height > 0:  # Only label non-zero bars
                ax.text(
                    bar.get_x() + bar.get_width() / 2,
                    height + 1,
                    f"({abs_val})",
                    ha='center',
                    va='bottom',
                    fontsize=9
                )

    ax.set_ylabel('Modified reads (%)', fontsize=12)
    ax.tick_params(axis='y', labelsize=12)
    ax.set_xlabel('Base position', fontsize=12)
    ax.set_ylim(0, 100)
    ax.set_xlim(x_pos[0] - bar_width, x_pos[-1] + bar_width * (num_mods + 0.5))

    # Horizontal grid lines drawn behind the bars (bars use zorder=3)
    ax.grid(True, linestyle='--', alpha=0.4, axis='y', zorder=0)

    # One tick per position, centred under its group of bars
    ax.set_xticks(x_pos + bar_width * (num_mods - 1) / 2)
    ax.set_xticklabels([custom_tick_label(pos) for pos in x_pos], fontsize=12)

    # Make excluded positions shaded
    for pos in highlight_excluded_positions:
        if pos in positions:
            group_start = x_pos[positions.index(pos)]
            ax.axvspan(group_start, group_start + bar_width * num_mods,
                       color=highlight_color, alpha=0.6)

    # Legend, listed in `mod_order`
    if show_legend:
        handles, labels = ax.get_legend_handles_labels()
        handle_by_label = dict(zip(labels, handles))
        legend_labels = [name for name in mod_order if name in handle_by_label]
        ax.legend([handle_by_label[name] for name in legend_labels], legend_labels,
                  fontsize=12, loc='upper left',
                  bbox_to_anchor=(0.5, -0.1),
                  ncol=1, framealpha=1.0, title='Base modification', title_fontsize=12)

    plt.show()
