In [1]:
import pandas as pd
import os

def split_sequence(protein_id, sequence, segment_length=48, overlap=24):
    """
    Split a protein sequence into overlapping, fixed-length segments.

    Windows of ``segment_length`` residues are taken every
    ``segment_length - overlap`` positions, starting at the first residue.
    If the last regular window does not reach the end of the sequence, one
    extra window covering exactly the final ``segment_length`` residues is
    appended so the C-terminal end is always represented.

    Args:
        protein_id (str): The ID of the protein. It prefixes every segment ID.
        sequence (str): The amino acid sequence of the protein.
        segment_length (int): Length of each segment (default: 48).
            Must be positive.
        overlap (int): Number of residues shared by consecutive segments
            (default: 24). Must be smaller than ``segment_length``.

    Returns:
        list: A list of ``(segment_id, segment_sequence)`` tuples in order of
        start position. ``segment_id`` has the form
        ``"<protein_id>_<start>-<end>"`` with 1-based, inclusive positions.
        A sequence shorter than ``segment_length`` (including an empty one,
        whose ID ends in ``_1-0``) is returned whole as a single segment.

    Raises:
        ValueError: If ``segment_length`` is not positive, or if ``overlap``
            is not smaller than ``segment_length`` (the window would never
            advance).
    """
    if segment_length <= 0:
        raise ValueError(
            f"segment_length must be positive, got {segment_length}"
        )

    step = segment_length - overlap
    if step <= 0:
        # A zero step makes range() fail with an unrelated message and a
        # negative step yields no windows at all, silently dropping the
        # protein; reject both explicitly instead.
        raise ValueError(
            f"overlap ({overlap}) must be smaller than "
            f"segment_length ({segment_length})"
        )

    seq_len = len(sequence)

    # Short sequence: a single segment spanning the whole input.
    if seq_len < segment_length:
        return [(f"{protein_id}_1-{seq_len}", sequence)]

    # 0-based start of the window that ends exactly at the sequence end.
    last_start = seq_len - segment_length

    # Regular sliding-window starts; never empty here because
    # last_start >= 0 guarantees that start 0 is included.
    starts = list(range(0, last_start + 1, step))

    # Add the tail window only if the sliding window did not already land
    # on it (compared as integers rather than by re-parsing the ID string).
    if starts[-1] != last_start:
        starts.append(last_start)

    return [
        (
            f"{protein_id}_{start + 1}-{start + segment_length}",
            sequence[start:start + segment_length],
        )
        for start in starts
    ]

# --- Main script execution ---

# Define file paths (relative to the current working directory)
input_file = 'data/interacting-proteins-full-dataset_Lit-BM.csv'
output_file = 'data/interacting-proteins_tiles-to-full-protein.csv'
fully_tiled_output_file = 'data/interacting-proteins_tiles-to-tiles.csv'

# Ensure the directory of every output file exists.
# `exist_ok=True` avoids the check-then-create race of a separate
# os.path.exists() test, and an empty dirname (file written to the current
# directory) is skipped because os.makedirs('') raises.
output_dir = os.path.dirname(output_file)
for _directory in (output_dir, os.path.dirname(fully_tiled_output_file)):
    if _directory:
        os.makedirs(_directory, exist_ok=True)

# Read the dataset. Only the read itself is guarded; on failure the script
# stops with a non-zero exit status so callers (shell, pipelines) can detect
# the error. `raise SystemExit(1)` is used instead of `exit()`, which is an
# interactive helper injected by the `site` module and reports status 0
# when called without an argument.
try:
    df = pd.read_csv(input_file)
except FileNotFoundError:
    print(f"Error: Input file '{input_file}' not found.")
    print("Please ensure the file is in the correct directory.")
    raise SystemExit(1) from None
except Exception as e:
    # Top-level boundary: report any other read/parse failure and stop.
    print(f"An error occurred while reading the CSV: {e}")
    raise SystemExit(1) from e
else:
    print(f"Successfully loaded '{input_file}' with {len(df)} rows.")

# Accumulators for the generated rows:
#   new_rows             - one protein tiled, the partner kept full length
#   new_rows_fully_tiled - every tile of A paired with every tile of B
new_rows = []
new_rows_fully_tiled = []

# Define segment parameters
SEGMENT_LENGTH = 48
OVERLAP = 24

print(f"Processing sequences with segment length = {SEGMENT_LENGTH} AA and overlap = {OVERLAP} AA...")

# Process each original protein pair.
# The four needed columns are iterated in lockstep instead of using
# df.iterrows(), which builds a Series object for every row.
for id_a, id_b, seq_a, seq_b in zip(
    df['ID_A_Interactor'],
    df['ID_B_Interactor'],
    df['Sequence_A'],
    df['Sequence_B'],
):
    segments_a = split_sequence(id_a, seq_a, SEGMENT_LENGTH, OVERLAP)
    segments_b = split_sequence(id_b, seq_b, SEGMENT_LENGTH, OVERLAP)

    # Scenario 1: Split Protein A into segments, keep Protein B full
    new_rows.extend(
        {
            'ID_A_Interactor': seg_id_a,
            'ID_B_Interactor': id_b,
            'Sequence_A': seg_seq_a,
            'Sequence_B': seq_b,
        }
        for seg_id_a, seg_seq_a in segments_a
    )

    # Scenario 2: Split Protein B into segments, keep Protein A full
    new_rows.extend(
        {
            'ID_A_Interactor': id_a,
            'ID_B_Interactor': seg_id_b,
            'Sequence_A': seq_a,
            'Sequence_B': seg_seq_b,
        }
        for seg_id_b, seg_seq_b in segments_b
    )

    # Scenario 3: Pair every tile of Protein A with every tile of Protein B
    # (A tiles vary slowest, matching a nested A-then-B loop).
    new_rows_fully_tiled.extend(
        {
            'ID_A_Interactor': seg_id_a,
            'ID_B_Interactor': seg_id_b,
            'Sequence_A': seg_seq_a,
            'Sequence_B': seg_seq_b,
        }
        for seg_id_a, seg_seq_a in segments_a
        for seg_id_b, seg_seq_b in segments_b
    )

# Turn the two collections of row dicts into DataFrames
# (tile-to-full-protein pairs and tile-to-tile pairs respectively).
output_df = pd.DataFrame(data=new_rows)
fully_tiled_output_df = pd.DataFrame(data=new_rows_fully_tiled)

# Persist each DataFrame to its CSV file, without the index column;
# the tile-to-full-protein table is written first.
for _frame, _destination in (
    (output_df, output_file),
    (fully_tiled_output_df, fully_tiled_output_file),
):
    _frame.to_csv(path_or_buf=_destination, index=False)

# Summary: row count and destination for each of the two outputs.
print(f"Processing complete. Generated {len(output_df)} new rows.")
print(f"Segmented data saved to '{output_file}'")
print(f"Processing complete. Generated {len(fully_tiled_output_df)} new rows.")
print(f"Fully segmented data saved to '{fully_tiled_output_file}'")

Successfully loaded 'data/interacting-proteins-full-dataset_Lit-BM.csv' with 13441 rows.
Processing sequences with segment length = 48 AA and overlap = 24 AA...
Processing complete. Generated 781282 new rows.
Segmented data saved to 'data/interacting-proteins_tiles-to-full-protein.csv'
Processing complete. Generated 15006594 new rows.
Fully segmented data saved to 'data/interacting-proteins_tiles-to-tiles.csv'
