In [None]:
import pandas as pd
import random
import os
import sys
from tqdm import tqdm

# --- Configuration ---
# Paths to the positive control CSV files. main() processes them one by one and
# writes one shuffled and one purely random negative-control file per input.
POSITIVE_CONTROL_FILES = [
    "data/positive_controls/interacting-proteins-full-dataset_Lit-BM.csv", # Full protein pairs
    "data/positive_controls/interacting-proteins_tiles-to-full-protein.csv", # Tiles to Protein A
    "data/positive_controls/interacting-proteins_tiles-to-tiles.csv"    # Tiles to Tiles
]

# Output directories for negative controls (created by main() if they do not exist)
SHUFFLED_NEG_DIR = "data/negative_controls_shuffled"
RANDOM_NEG_DIR = "data/negative_controls_random"

# Column names for consistent access (must match positive control files).
# The same names are used as the column headers of the generated negative-control CSVs.
ID_A_COL = "ID_A_Interactor"
SEQ_A_COL = "Sequence_A"
ID_B_COL = "ID_B_Interactor"
SEQ_B_COL = "Sequence_B"

# Alphabet of 20 one-letter amino acid codes; residues of the purely random
# sequences are drawn from it with equal probability.
AMINO_ACIDS = 'ACDEFGHIKLMNPQRSTVWY'

# --- Helper Functions ---

def _load_positive_control_df(filepath: str) -> pd.DataFrame:
    """Loads a positive control CSV file into a DataFrame."""
    try:
        df = pd.read_csv(filepath)
        # Basic check for expected columns
        required_cols = [ID_A_COL, SEQ_A_COL, ID_B_COL, SEQ_B_COL]
        if not all(col in df.columns for col in required_cols):
            print(f"Warning: File '{filepath}' is missing one or more expected columns: {required_cols}", file=sys.stderr)
            print("Attempting to proceed, but results might be unexpected.", file=sys.stderr)
        return df
    except FileNotFoundError:
        print(f"Error: Positive control file '{filepath}' not found.", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"Error loading positive control file '{filepath}': {e}", file=sys.stderr)
        sys.exit(1)

def _extract_all_unique_sequences_from_df(df: pd.DataFrame) -> list[tuple[str, str]]:
    """
    Extracts all unique (ID, Sequence) pairs from both A and B columns of a DataFrame.

    A sequence is kept only if it is a non-empty string other than the literal
    'NOT_FOUND'. Missing cells are therefore dropped: pandas represents them as
    NaN, which is truthy, so a plain truthiness test would let them through.

    A side whose sequence column is absent contributes nothing. A side whose ID
    column is absent uses the placeholder ID 'NA_ID_A' / 'NA_ID_B'.

    Returns:
        The unique (ID, Sequence) tuples in first-seen order (A side first,
        then B side), so the result does not depend on string hash
        randomisation.
    """
    # A dict is used as an insertion-ordered set.
    unique_pairs: dict[tuple[str, str], None] = {}

    sides = (
        (ID_A_COL, SEQ_A_COL, "NA_ID_A"),
        (ID_B_COL, SEQ_B_COL, "NA_ID_B"),
    )
    for id_col, seq_col, fallback_id in sides:
        if seq_col not in df.columns:
            continue  # nothing to collect for this side

        sequences = df[seq_col].tolist()
        if id_col in df.columns:
            ids = df[id_col].tolist()
        else:
            ids = [fallback_id] * len(sequences)

        # Iterating plain lists avoids the per-row Series construction of iterrows().
        for seq_id, seq in zip(ids, sequences):
            if isinstance(seq, str) and seq and seq != "NOT_FOUND":
                unique_pairs[(seq_id, seq)] = None

    return list(unique_pairs)

# --- Negative Control Generation Functions ---

def generate_shuffled_pairs_negative_controls(
    positive_control_df: pd.DataFrame,
    output_filepath: str
) -> None:
    """
    Generates negative controls by randomly re-pairing the protein A and B sequences
    *from the given positive_control_df itself* and writes them to a CSV file.

    Each negative pair is built from two distinct entries of the file-specific
    (ID, Sequence) pool. A draw whose two sequences already occur together in a
    row of positive_control_df (in either orientation) is rejected and redrawn,
    so known interactions do not end up in the negative set.

    The target number of pairs equals the number of rows of positive_control_df.
    If no acceptable pair is found for a slot within a bounded number of draws,
    that slot is skipped and a warning with the shortfall is printed.

    Args:
        positive_control_df: DataFrame of positive control pairs.
        output_filepath: Destination CSV path for the negative controls.

    Nothing is written if the pool holds fewer than two entries.
    """
    output_name = os.path.basename(output_filepath)
    print(f"\nGenerating shuffled pairs negative controls for {output_name}...")

    # Create the sequence pool specific to THIS positive control file
    file_specific_seq_pool = _extract_all_unique_sequences_from_df(positive_control_df)

    if not file_specific_seq_pool:
        print(f"Warning: Sequence pool for '{output_name}' is empty. Cannot generate shuffled controls.", file=sys.stderr)
        return

    if len(file_specific_seq_pool) < 2:
        print(f"Warning: Sequence pool for '{output_name}' has a single entry. Cannot draw two distinct partners for shuffled controls.", file=sys.stderr)
        return

    print(f"  Collected {len(file_specific_seq_pool)} unique (ID, Sequence) pairs from this file for shuffling.")

    # Unordered sequence pairs of the positive set, used to reject draws that
    # would reproduce a known interaction.
    known_positive_pairs = set()
    columns = positive_control_df.columns
    if SEQ_A_COL in columns and SEQ_B_COL in columns:
        known_positive_pairs = {
            frozenset(pair)
            for pair in zip(
                positive_control_df[SEQ_A_COL].tolist(),
                positive_control_df[SEQ_B_COL].tolist(),
            )
        }

    num_pairs = len(positive_control_df)
    max_attempts_per_pair = 100  # bound on redraws so a tiny pool cannot loop forever
    negative_control_data = []
    skipped = 0

    for _ in tqdm(range(num_pairs), desc="Shuffling sequences"):
        for _attempt in range(max_attempts_per_pair):
            # random.sample guarantees two different pool entries
            id_seq_a, id_seq_b = random.sample(file_specific_seq_pool, 2)
            if frozenset((id_seq_a[1], id_seq_b[1])) not in known_positive_pairs:
                break
        else:
            skipped += 1
            continue

        negative_control_data.append({
            ID_A_COL: id_seq_a[0],
            SEQ_A_COL: id_seq_a[1],
            ID_B_COL: id_seq_b[0],
            SEQ_B_COL: id_seq_b[1]
        })

    if skipped:
        print(f"Warning: Could not find a non-positive pairing for {skipped} of {num_pairs} requested pairs in '{output_name}'.", file=sys.stderr)

    # Explicit columns keep the header stable even if no pair was generated.
    df_neg_shuffled = pd.DataFrame(
        negative_control_data,
        columns=[ID_A_COL, SEQ_A_COL, ID_B_COL, SEQ_B_COL],
    )
    df_neg_shuffled.to_csv(output_filepath, index=False)
    print(f"Saved shuffled negative controls to: {output_filepath}")


def generate_purely_random_sequences_negative_controls(
    positive_control_df: pd.DataFrame,
    output_filepath: str
) -> None:
    """
    Generates purely random protein sequence pairs and writes them to a CSV file.

    Sequence lengths are sampled from the lengths observed in the A and B
    sequence columns of positive_control_df, so the random set follows a
    similar length distribution. Residues are drawn with equal probability from
    AMINO_ACIDS. One pair is produced per row of positive_control_df, with IDs
    of the form RANDOM_A_00001 / RANDOM_B_00001.

    Args:
        positive_control_df: DataFrame of positive control pairs.
        output_filepath: Destination CSV path for the negative controls.

    Cells that are not non-empty strings (e.g. NaN from empty CSV cells) or
    that equal 'NOT_FOUND' are ignored, as is a sequence column that is absent.
    Nothing is written if no valid length is found.
    """
    print(f"\nGenerating purely random sequences negative controls for {os.path.basename(output_filepath)}...")

    # Collect all sequence lengths from both A and B sides
    all_lengths = []
    for seq_col in (SEQ_A_COL, SEQ_B_COL):
        if seq_col not in positive_control_df.columns:
            continue
        for seq in positive_control_df[seq_col].tolist():
            # isinstance guards against NaN, which is truthy but has no len()
            if isinstance(seq, str) and seq and seq != "NOT_FOUND":
                all_lengths.append(len(seq))

    if not all_lengths:
        print(f"Warning: No valid sequence lengths found in {os.path.basename(output_filepath)}. Cannot generate random sequences.", file=sys.stderr)
        return

    num_pairs = len(positive_control_df)
    negative_control_data = []

    for i in tqdm(range(num_pairs), desc="Generating random sequences"):
        # Sample two lengths from the collected distribution
        len_a = random.choice(all_lengths)
        len_b = random.choice(all_lengths)

        # random.choices draws all residues in one call (uniform, with replacement)
        rand_seq_a = ''.join(random.choices(AMINO_ACIDS, k=len_a))
        rand_seq_b = ''.join(random.choices(AMINO_ACIDS, k=len_b))

        negative_control_data.append({
            ID_A_COL: f"RANDOM_A_{i+1:05d}", # Unique ID for random sequences
            SEQ_A_COL: rand_seq_a,
            ID_B_COL: f"RANDOM_B_{i+1:05d}", # Unique ID for random sequences
            SEQ_B_COL: rand_seq_b
        })

    df_neg_random = pd.DataFrame(negative_control_data)
    df_neg_random.to_csv(output_filepath, index=False)
    print(f"Saved purely random negative controls to: {output_filepath}")


# --- Main Execution ---
def main():
    """
    Generates shuffled and purely random negative controls for every file in
    POSITIVE_CONTROL_FILES.

    Creates the two output directories if needed. For each positive control
    file it writes '<base>_shuffled_neg.csv' into SHUFFLED_NEG_DIR and
    '<base>_random_neg.csv' into RANDOM_NEG_DIR, where <base> is the input file
    name without its trailing '.csv'.
    """
    # Ensure output directories exist
    os.makedirs(SHUFFLED_NEG_DIR, exist_ok=True)
    os.makedirs(RANDOM_NEG_DIR, exist_ok=True)

    print("--- Starting Negative Control Generation ---")

    # 1. Process each positive control file to generate negative controls
    for pc_file in POSITIVE_CONTROL_FILES:
        print(f"\n--- Processing Positive Control File: {pc_file} ---")
        df_pc = _load_positive_control_df(pc_file)

        # Determine output file names. removesuffix strips only the trailing
        # extension, whereas str.replace would also remove a '.csv' occurring
        # elsewhere in the file name.
        base_name = os.path.basename(pc_file).removesuffix(".csv")
        shuffled_output_path = os.path.join(SHUFFLED_NEG_DIR, f"{base_name}_shuffled_neg.csv")
        random_output_path = os.path.join(RANDOM_NEG_DIR, f"{base_name}_random_neg.csv")

        # Generate shuffled pairs negative controls
        # (the sequence pool is built inside the function, specific to df_pc)
        generate_shuffled_pairs_negative_controls(df_pc, shuffled_output_path)

        # Generate purely random sequences negative controls
        generate_purely_random_sequences_negative_controls(df_pc, random_output_path)

    print("\n--- Negative Control Generation Finished ---")

if __name__ == "__main__":
    main()

--- Starting Negative Control Generation ---

--- Processing Positive Control File: data/positive_controls/interacting-proteins-full-dataset_Lit-BM.csv ---

Generating shuffled pairs negative controls for interacting-proteins-full-dataset_Lit-BM_shuffled_neg.csv...
  Collected 6021 unique (ID, Sequence) pairs from this file for shuffling.


Shuffling sequences: 100%|██████████| 13441/13441 [00:00<00:00, 674059.50it/s]


Saved shuffled negative controls to: data/negative_controls_shuffled/interacting-proteins-full-dataset_Lit-BM_shuffled_neg.csv

Generating purely random sequences negative controls for interacting-proteins-full-dataset_Lit-BM_random_neg.csv...


Generating random sequences: 100%|██████████| 13441/13441 [00:06<00:00, 2130.46it/s]


Saved purely random negative controls to: data/negative_controls_random/interacting-proteins-full-dataset_Lit-BM_random_neg.csv

--- Processing Positive Control File: data/positive_controls/interacting-proteins_tiles-to-full-protein.csv ---

Generating shuffled pairs negative controls for interacting-proteins_tiles-to-full-protein_shuffled_neg.csv...
  Collected 165514 unique (ID, Sequence) pairs from this file for shuffling.


Shuffling sequences: 100%|██████████| 781282/781282 [00:01<00:00, 652759.10it/s]


Saved shuffled negative controls to: data/negative_controls_shuffled/interacting-proteins_tiles-to-full-protein_shuffled_neg.csv

Generating purely random sequences negative controls for interacting-proteins_tiles-to-full-protein_random_neg.csv...


Generating random sequences: 100%|██████████| 781282/781282 [04:20<00:00, 2993.42it/s]


Saved purely random negative controls to: data/negative_controls_random/interacting-proteins_tiles-to-full-protein_random_neg.csv

--- Processing Positive Control File: data/positive_controls/interacting-proteins_tiles-to-tiles.csv ---

Generating shuffled pairs negative controls for interacting-proteins_tiles-to-tiles_shuffled_neg.csv...
