# IMPORT LIBRARIES

In [1]:
import os
import pandas as pd
# NOTE(review): hard-coded absolute path to one user's machine. Every relative path used
# later in this notebook ('./structures_annotation/...', './pdb_files/', './contact_maps/')
# is resolved against this directory, so the notebook only runs where this path exists.
# The chdir comes before the two project imports below, presumably so that they are
# resolved from the project root -- TODO confirm.
os.chdir('/Users/alexascunceparis/Desktop/BSC/TCRranker_v3')
from extract_contacts import extract_contacts
from utils import parse_general_file

In [2]:
# Chain annotation, keyed by PDB id. Each value maps a chain role to its chain letter, e.g.:
#   {'1ao7': {'tcra_chain': 'D',
#             'tcrb_chain': 'E',
#             'peptide_chain': 'C',
#             'mhc_chain': 'A'},
#    '1bd2': {'tcra_chain': 'D',
#             'tcrb_chain': 'E', ...},
#    ...}

chain_dict=parse_general_file('./structures_annotation/general.txt')

# EXTRACT CONTACTS

In [3]:
import os

# Build one contact-map CSV per PDB structure found in ./pdb_files/.
#
# Structures that already have a contact map are skipped, so the cell can be
# re-run safely. Structures missing from `chain_dict` are processed with the
# default chain assignment below.

# Chain letters assumed when a structure has no entry in the annotation file
DEFAULT_CHAINS = {'tcra_chain': 'D',
                  'tcrb_chain': 'E',
                  'peptide_chain': 'C',
                  'b2m_chain': 'B',
                  'mhc_chain': 'A'}

pdb_file_names = os.listdir('./pdb_files/')

# Create the output directory once, instead of re-checking it for every file
os.makedirs('./contact_maps', exist_ok=True)

for pdb_file in pdb_file_names:
    # Only PDB structure files are processed
    if not pdb_file.endswith('.pdb'):
        continue

    pdb_id = os.path.basename(pdb_file).split('.')[0]  # Extract PDB ID from the file name
    pdb_path = f'./pdb_files/{pdb_file}'  # Path to the current PDB file
    output_file = f'./contact_maps/{pdb_id}_contacts.csv'  # Contact map for this structure

    # Skip structures whose contact map was already computed
    if os.path.exists(output_file):
        print(f"File {output_file} exists, omitting...")
        continue

    if pdb_id in chain_dict:
        chains_for_file = chain_dict
    else:
        print(f"Chain info not found for {pdb_id}. Using default chains...")
        # Use a separate dict for the fallback: rebinding `chain_dict` here would
        # discard the parsed annotation for every structure processed afterwards
        # (and for any later cell that reads `chain_dict`).
        chains_for_file = {pdb_id: dict(DEFAULT_CHAINS)}

    contacts_df = extract_contacts([pdb_path], chains_for_file)
    contacts_df.to_csv(output_file, index=False)  # Save the contact map to the output file
    print(f"Saved contacts in {output_file}.")

File ./contact_maps/2ckb_contacts.csv exists, omitting...
File ./contact_maps/5m02_contacts.csv exists, omitting...
File ./contact_maps/8cx4_contacts.csv exists, omitting...
File ./contact_maps/5tez_contacts.csv exists, omitting...
File ./contact_maps/7n4k_contacts.csv exists, omitting...
File ./contact_maps/7rrg_contacts.csv exists, omitting...
File ./contact_maps/3vxs_contacts.csv exists, omitting...
File ./contact_maps/5nmf_contacts.csv exists, omitting...
File ./contact_maps/7jwi_contacts.csv exists, omitting...
File ./contact_maps/8i5d_contacts.csv exists, omitting...
File ./contact_maps/3mv9_contacts.csv exists, omitting...
File ./contact_maps/8rym_contacts.csv exists, omitting...
File ./contact_maps/3qdm_contacts.csv exists, omitting...
File ./contact_maps/3mv8_contacts.csv exists, omitting...
File ./contact_maps/8gon_contacts.csv exists, omitting...
File ./contact_maps/3d3v_contacts.csv exists, omitting...
File ./contact_maps/5nmg_contacts.csv exists, omitting...
File ./contact

#### SEE IF ALL CONTACT MAPS CAN BE USED
Checks that each contact map contains exactly two distinct `chain_from` values (the TCRα and TCRβ chains) and exactly two distinct `chain_to` values (the MHC and peptide chains).

In [4]:
def validate_chain_columns(csv_file, chain_dict):
    """
    Checks that a contact-map CSV has exactly 2 distinct values in 'chain_from' and exactly 2 in
    'chain_to', and that those values are the expected TCR chains and peptide/MHC chains respectively.

    The PDB id is taken from the file name (everything before the first underscore). If that id is
    not a key of chain_dict, a default chain assignment is used and a message is printed.

    Args:
    - csv_file (str): Path to the contact-map CSV, named <pdb_id>_contacts.csv.
    - chain_dict (dict): Maps each PDB id to a dict of chain roles ('tcra_chain', 'tcrb_chain',
      'peptide_chain', 'mhc_chain') and their chain identifiers.

    Returns:
    - bool: True if both columns pass the check, False otherwise.
    - dict: The distinct values found, under the keys 'chain_from' and 'chain_to'.

    Raises:
    - ValueError: If the CSV lacks a 'chain_from' or a 'chain_to' column.
    """
    contacts = pd.read_csv(csv_file)

    # Both chain columns are required
    absent = [column for column in ('chain_from', 'chain_to') if column not in contacts.columns]
    if absent:
        raise ValueError(f"The file {csv_file} does not contain 'chain_from' or 'chain_to' columns.")

    # File names follow <pdb_id>_contacts.csv
    pdb_id = os.path.basename(csv_file).split('_')[0]

    if pdb_id in chain_dict:
        roles = chain_dict.get(pdb_id)
    else:
        # Fall back to the default assignment without touching the caller's dict
        roles = {'tcra_chain': 'D',
                 'tcrb_chain': 'E',
                 'peptide_chain': 'C',
                 'b2m_chain': 'B',
                 'mhc_chain': 'A'}
        print(f"Chain info not found for {pdb_id}. Using default chains...")

    tcr_chains = (roles['tcra_chain'], roles['tcrb_chain'])
    target_chains = (roles['peptide_chain'], roles['mhc_chain'])

    found_from = contacts['chain_from'].unique()
    found_to = contacts['chain_to'].unique()
    found = {'chain_from': found_from, 'chain_to': found_to}

    # Wrong number of distinct chains on either side: invalid regardless of identity
    if len(found_from) != 2 or len(found_to) != 2:
        return False, found

    from_ok = all(chain in tcr_chains for chain in found_from)
    to_ok = all(chain in target_chains for chain in found_to)
    return from_ok and to_ok, found

def check_contact_maps(directory, chain_dict):
    """
    Runs validate_chain_columns on every '*_contacts.csv' file in a directory and prints one line
    per file: valid, not valid (with the chains found), or the error raised while processing it.

    Args:
    - directory (str): Directory containing the contact-map CSV files.
    - chain_dict (dict): Dictionary containing expected chain types for each PDB file.

    Raises:
    - ValueError: If directory is not an existing directory.
    """
    if not os.path.isdir(directory):
        raise ValueError(f"The directory {directory} does not exist.")

    contact_files = [entry for entry in os.listdir(directory) if entry.endswith('_contacts.csv')]

    for file_name in contact_files:
        file_path = os.path.join(directory, file_name)
        try:
            is_valid, chains = validate_chain_columns(file_path, chain_dict)
            if not is_valid:
                print(f"File {file_name} is not valid. Chains found: {chains}")
                continue
            print(f"File {file_name} is valid.")

        except Exception as e:
            # Best-effort report: a failure on one file must not stop the scan
            print(f"Error processing {file_name}: {e}")

# Validate every '*_contacts.csv' file in the 'contact_maps' folder against chain_dict;
# one result line is printed per file.
check_contact_maps('./contact_maps/', chain_dict)

File 7n2p_contacts.csv is valid.
File 8rlv_contacts.csv is valid.
File 4jfd_contacts.csv is valid.
File 7l1d_contacts.csv is valid.
File 2gj6_contacts.csv is valid.
File 4jry_contacts.csv is valid.
File 5wkf_contacts.csv is valid.
File 2ckb_contacts.csv is valid.
File 7jwj_contacts.csv is valid.
File 8gvb_contacts.csv is valid.
File 8ryn_contacts.csv is valid.
File 5ivx_contacts.csv is valid.
File 2ypl_contacts.csv is valid.
File 5isz_contacts.csv is valid.
File 7n4k_contacts.csv is valid.
File 6bj3_contacts.csv is valid.
File 1mwa_contacts.csv is valid.
File 5c0b_contacts.csv is valid.
File 5c08_contacts.csv is valid.
File 3e3q_contacts.csv is valid.
File 5wlg_contacts.csv is valid.
File 7n2o_contacts.csv is valid.
File 5jhd_contacts.csv is valid.
File 6vm7_contacts.csv is valid.
File 8v4z_contacts.csv is valid.
File 8ryq_contacts.csv is valid.
File 6zkz_contacts.csv is valid.
File 4ftv_contacts.csv is valid.
File 3rgv_contacts.csv is valid.
File 8ye4_contacts.csv is valid.
File 8gom_