In [2]:
import pandas as pd
import os

def split_sequence(protein_id, sequence, segment_length=48, overlap=24):
    """
    Splits a protein sequence into overlapping segments of a specified length.

    Windows of `segment_length` residues are taken every
    `segment_length - overlap` positions. If the last regular window does not
    end exactly at the end of the sequence, one extra window covering the
    final `segment_length` residues is appended so the tail is never dropped.
    A sequence shorter than `segment_length` is returned whole as a single
    segment (an empty sequence therefore yields one segment with ID
    "protein_id_1-0" and an empty string).

    Args:
        protein_id (str): The ID of the protein. This will be part of the segment ID.
        sequence (str): The amino acid sequence of the protein.
        segment_length (int): The desired length for each segment (default: 48).
            Must be positive.
        overlap (int): The number of amino acids to overlap between segments
            (default: 24). Must be smaller than `segment_length`.

    Returns:
        list: A list of tuples, where each tuple is (segment_id, segment_sequence).
              segment_id format: "protein_id_start_AA_pos-end_AA_pos" (1-based, inclusive).

    Raises:
        ValueError: If `segment_length` is not positive, or if `overlap` is not
            smaller than `segment_length` (the window would never advance).
    """
    if segment_length <= 0:
        raise ValueError(f"segment_length must be positive, got {segment_length}")
    if overlap >= segment_length:
        # A non-positive step would either crash range() or silently yield no segments.
        raise ValueError(
            f"overlap ({overlap}) must be smaller than segment_length ({segment_length})"
        )

    seq_len = len(sequence)

    # Short sequence: keep it whole as a single segment.
    if seq_len < segment_length:
        return [(f"{protein_id}_1-{seq_len}", sequence)]

    step = segment_length - overlap
    # 0-based start of the window that ends exactly at the end of the sequence.
    last_start = seq_len - segment_length

    # Regular sliding-window starts; never empty here because last_start >= 0.
    starts = list(range(0, last_start + 1, step))

    # Add the trailing window only when the regular tiling did not already land on it.
    if starts[-1] != last_start:
        starts.append(last_start)

    # IDs use 1-based inclusive coordinates: start + 1 .. start + segment_length.
    return [
        (
            f"{protein_id}_{start + 1}-{start + segment_length}",
            sequence[start:start + segment_length],
        )
        for start in starts
    ]

# --- Main script execution ---

# Define file paths
input_file = 'data/human-idr-regions.csv'
output_file = 'data/human-idr-regions-tiled.csv'

# Ensure the output directory exists.
# exist_ok=True avoids the check-then-create race of a separate os.path.exists() test.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Read the dataset.
# On failure, report the problem and stop with a non-zero status; `raise SystemExit`
# is used instead of `exit()`, which is an interactive helper from the `site` module
# and would report success (status 0).
try:
    df = pd.read_csv(input_file)
except FileNotFoundError:
    print(f"Error: Input file '{input_file}' not found.")
    print("Please ensure the file is in the correct directory.")
    raise SystemExit(1)
except Exception as e:
    print(f"An error occurred while reading the CSV: {e}")
    raise SystemExit(1)
else:
    print(f"Successfully loaded '{input_file}' with {len(df)} rows.")

# List to store all new rows (one dict per tiled segment, carrying its source row's metadata)
tiled_data = []

# Define segment parameters
SEGMENT_LENGTH = 48
OVERLAP = 24

print(f"Processing sequences with segment length = {SEGMENT_LENGTH} AA and overlap = {OVERLAP} AA...")

# Metadata columns are copied unchanged onto every tiled row.
# 'Sequence' is excluded because each tiled row stores its own segment under that
# name, and 'Tiled_ID' is excluded because it is (re)generated below; listing either
# twice would produce duplicated columns in the output CSV.
metadata_cols = [col for col in df.columns if col not in ('Sequence', 'Tiled_ID')]
output_columns = metadata_cols + ['Tiled_ID', 'Sequence']

# Process each row in the input DataFrame (df itself is left unmodified)
for record in df.to_dict('records'):
    # The full-length sequence is replaced by the per-segment sequence in the output.
    original_sequence = record.pop('Sequence')

    # Using 'IDR_Unique_ID' to generate segment IDs, as it's already unique for each IDR region.
    segments = split_sequence(record['IDR_Unique_ID'], original_sequence, SEGMENT_LENGTH, OVERLAP)

    # For each segment, create a new row with the metadata and segment-specific info
    for segment_id, segment_sequence in segments:
        new_row_data = dict(record)  # Start with all original metadata
        new_row_data['Tiled_ID'] = segment_id
        new_row_data['Sequence'] = segment_sequence
        tiled_data.append(new_row_data)

# Create a new DataFrame from the processed rows.
# Passing `columns` fixes the order: original metadata first, then 'Tiled_ID' and the
# tiled 'Sequence'. It also keeps the header when no segments were produced.
tiled_df = pd.DataFrame(tiled_data, columns=output_columns)

# Write the new DataFrame to a CSV file
tiled_df.to_csv(output_file, index=False)

print(f"Processing complete. Generated {len(tiled_df)} new rows (tiled segments).")
print(f"Tiled data saved to '{output_file}'")

Successfully loaded 'data/human-idr-regions.csv' with 263958 rows.
Processing sequences with segment length = 48 AA and overlap = 24 AA...
Processing complete. Generated 1431629 new rows (tiled segments).
Tiled data saved to 'data/human-idr-regions-tiled.csv'
