In [9]:
import os
# Switch the working directory before anything else runs. The path is an
# absolute, machine-specific location and must be edited on any other machine.
# Later cells depend on the working directory: they open relative paths such as
# "./pdb_nr/" and "./structures_annotation/TCRdist_df.csv", and
# find_closest_tcr builds its TCRdist database path from os.getcwd().
os.chdir("/Users/alexascunceparis/Desktop/BSC/immuno_project/TCRranker")

# NOTE(review): these look like project-local modules found via the working
# directory set above - confirm. Names used further down without an explicit
# import (e.g. TCRrep, run_anarci, parse_CDR3, get_germlines,
# extract_specific_sequences, seq_dict) are presumably supplied by these
# wildcard imports - TODO confirm and replace with explicit imports.
from find_contact_map import *
from mapping import *
from select_nr_set import *
from extract_contacts import *

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
def add_tcr_to_dataframe(df, alpha_seq, beta_seq, tcr_name):
    """
    Append one annotated TCR record to a DataFrame and return the result.

    For each chain the sequence is passed through `run_anarci`, the first
    element of `parse_CDR3`'s result is kept as the CDR3, and `get_germlines`
    supplies the V/J gene pair. The input DataFrame is not modified; a new
    DataFrame with a fresh 0..n-1 index is returned.

    Parameters:
    - df (pd.DataFrame): The DataFrame the new record is appended to.
    - alpha_seq (str): The TCR alpha chain sequence.
    - beta_seq (str): The TCR beta chain sequence.
    - tcr_name (str): Identifier for the input TCR; stored (string-formatted)
      in the 'pdb_id' column.

    Returns:
    - pd.DataFrame: `df` plus one trailing row holding the new TCR, with
      'count' set to 1.
    """

    def _annotate_chain(sequence, chain_label):
        # Same three calls, in the same order, for either chain.
        numbering = run_anarci(sequence, chain_label)
        cdr3, _ = parse_CDR3(numbering)
        v_gene, j_gene = get_germlines(sequence)
        return cdr3, v_gene, j_gene

    # Alpha is annotated first, then beta; the second argument is the chain
    # label handed to run_anarci.
    a_cdr3, a_v, a_j = _annotate_chain(alpha_seq, "D")
    b_cdr3, b_v, b_j = _annotate_chain(beta_seq, "E")

    record = {
        'pdb_id': f"{tcr_name}",
        'cdr3_a_aa': a_cdr3,
        'v_a_gene': a_v,
        'j_a_gene': a_j,
        'cdr3_b_aa': b_cdr3,
        'v_b_gene': b_v,
        'j_b_gene': b_j,
        'count': 1,
    }

    # Single-row frame built column-wise (one-element list per column).
    appended = pd.DataFrame({column: [value] for column, value in record.items()})

    return pd.concat([df, appended], ignore_index=True)


def find_closest_tcr(df, alpha_seq, beta_seq, tcr_name):
    """
    Finds the closest TCR(s) in `df` to the given alpha/beta sequences.

    The new TCR is appended to a copy of `df` (via `add_tcr_to_dataframe`),
    then compared one-by-one against every pre-existing row. The per-row
    distance is the sum of the alpha and beta chain distances produced by
    `TCRrep.compute_rect_distances`.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing existing TCR information.
    - alpha_seq (str): The TCR alpha chain sequence of the new TCR.
    - beta_seq (str): The TCR beta chain sequence of the new TCR.
    - tcr_name (str): The TCR name for the new entry. Rows whose `pdb_id`
      equals this name (compared as strings) are excluded from the search.

    Returns:
    - str or list: The pdb_id of the closest TCR; if several TCRs tie for the
      minimum distance, a list with all of their pdb_ids (in row order).

    Raises:
    - ValueError: If no row other than the new TCR itself is available for
      comparison.
    """
    # Add the new TCR to the DataFrame; it becomes the last row.
    df = add_tcr_to_dataframe(df, alpha_seq, beta_seq, tcr_name)
    last_row = df.iloc[[-1]]

    # The TCRdist database file is resolved relative to the working directory.
    db_file_path = os.path.join(os.getcwd(), 'TCRdist', 'alphabeta_gammadelta_db.tsv')

    # add_tcr_to_dataframe stores the name as a string, so compare as strings;
    # otherwise an integer tcr_name (e.g. read from a CSV) would never match.
    query_name = str(tcr_name)

    # Positions of the rows to compare against: everything except the new
    # (last) row and any row carrying the same identifier as the query.
    candidate_positions = [
        position
        for position in range(len(df) - 1)
        if str(df.iloc[position]['pdb_id']) != query_name
    ]

    if not candidate_positions:
        raise ValueError("No valid TCRs found for comparison.")

    # The representation of the query TCR does not depend on the row it is
    # compared with, so build it once instead of once per iteration.
    tr_last_row = TCRrep(cell_df=last_row,
                         organism='human',
                         chains=['alpha', 'beta'],
                         compute_distances=False,
                         db_file=db_file_path)

    # Keep every distance paired with the pdb_id it belongs to. Indexing `df`
    # with the position inside the distance list would be wrong as soon as a
    # row has been skipped above.
    candidate_ids = []
    global_distances = []
    for position in candidate_positions:
        current_row = df.iloc[[position]]  # one-row DataFrame

        tr_current = TCRrep(cell_df=current_row,
                            organism='human',
                            chains=['alpha', 'beta'],
                            compute_distances=False,
                            db_file=db_file_path)

        # Rectangular (query x current) distances; one value per chain here.
        tr_current.compute_rect_distances(df=tr_last_row.clone_df, df2=tr_current.clone_df)

        # Global distance = alpha distance + beta distance.
        global_distances.append(tr_current.rw_alpha[0][0] + tr_current.rw_beta[0][0])
        candidate_ids.append(df.iloc[position]['pdb_id'])

    min_value = min(global_distances)

    # All pdb_ids that reach the minimum distance (ties are preserved).
    pdb_ids_with_min_distance = [
        pdb_id
        for pdb_id, distance in zip(candidate_ids, global_distances)
        if distance == min_value
    ]

    # A single winner is returned as a scalar, ties as a list.
    if len(pdb_ids_with_min_distance) == 1:
        return pdb_ids_with_min_distance[0]
    return pdb_ids_with_min_distance

In [None]:
import os
import csv

# Directory that is scanned for structure files
pdb_folder = "./pdb_nr/"

# Dump one (tcr_id, alpha_seq, beta_seq) record per .pdb file; the output
# file is overwritten on every run.
with open('pdb_sequences.csv', mode='w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)

    # Header first, then one data row per structure
    csv_writer.writerow(['tcr_id', 'alpha_seq', 'beta_seq'])

    for pdb_file in os.listdir(pdb_folder):
        # Anything that is not a .pdb file is ignored
        if not pdb_file.endswith(".pdb"):
            continue

        pdb_file_path = os.path.join(pdb_folder, pdb_file)

        # The ID is the part of the file name before the first dot
        pdb_id = pdb_file.partition(".")[0]

        # The third returned value (epitope) is not written to the CSV
        alpha_seq, beta_seq, epitope = extract_specific_sequences(pdb_file_path, seq_dict)

        csv_writer.writerow([pdb_id, alpha_seq, beta_seq])

print("CSV file written successfully!")

In [22]:
import csv
import os
import warnings

import pandas as pd

# NOTE(review): this silences every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Reference TCR table and the TCRs to look up
dataframe = pd.read_csv("./structures_annotation/TCRdist_df.csv")
common_df = pd.read_csv("tcr_common.csv")

# Results are appended, so re-running this cell adds the same rows again.
with open("closest_tcr.csv", "a") as f:
    # find_closest_tcr returns a list when several TCRs tie; its text form
    # contains commas. csv.writer quotes such fields so every row keeps
    # exactly two columns. lineterminator="\n" keeps the line endings of the
    # rows that were previously written with f.write(...).
    result_writer = csv.writer(f, lineterminator="\n")

    # Write headers if the file is empty
    if os.stat("closest_tcr.csv").st_size == 0:
        result_writer.writerow(["tcr_name", "closest_tcr"])

    # One lookup per input TCR
    for _, row_data in common_df.iterrows():
        alpha_seq = row_data['alpha_seq']
        beta_seq = row_data['beta_seq']
        tcr_name = row_data['tcr_id']

        # Find the closest TCR (a single pdb_id, or a list on ties)
        closest_tcr = find_closest_tcr(dataframe, alpha_seq, beta_seq, tcr_name)

        # Write the result to the CSV
        result_writer.writerow([tcr_name, closest_tcr])

        # Echo the result for confirmation
        print(f"Closest TCR for {tcr_name}: {closest_tcr}")

Closest TCR for 22560: 7n6e
Closest TCR for 33315: ['3qdm', '3qeq']
Closest TCR for 30403: 8i5c
Closest TCR for 32798: ['7pdw', '7pbc']
Closest TCR for 32596: ['8gvb', '8gvi', '8gvg']
Closest TCR for 31902: 6avg
Closest TCR for 32385: ['8eo8', '8enh', '8en8']
Closest TCR for 31744: 8i5c
Closest TCR for 31901: 6avg
Closest TCR for 31741: ['5jzi', '5yxn', '5yxu']
Closest TCR for 31685: 4mji
Closest TCR for 21392: ['7nmg', '7nme', '7nmf']
Closest TCR for 32442: 4mji
Closest TCR for 32732: 7pbe
Closest TCR for 33316: 5eu6
Closest TCR for 33317: ['7rtr', '7n1f']
Closest TCR for 32435: ['3kxf', '2ak4']
Closest TCR for 31937: 3vxm
Closest TCR for 32355: 6rsy
Closest TCR for 31772: ['7dzn', '7dzm']
Closest TCR for 31881: 4mji
Closest TCR for 30993: 5tez
Closest TCR for 32015: ['4ms8', '4mvb', '3tfk', '4n0c', '4n5e', '3tpu', '4mxq', '6dkp']
Closest TCR for 30112: 7pbe


In [None]:
import csv
import os
import warnings

import pandas as pd

# NOTE(review): this silences every warning for the rest of the session.
warnings.filterwarnings("ignore")

# NOTE(review): this cell repeats the previous closest-TCR cell (same inputs,
# same output file). Because the file is opened in append mode, running both
# cells writes every result twice - consider removing one of them.
dataframe = pd.read_csv("./structures_annotation/TCRdist_df.csv")
pdb_common = pd.read_csv("tcr_common.csv")

with open("closest_tcr.csv", "a") as f:
    # find_closest_tcr returns a list when several TCRs tie; its text form
    # contains commas. csv.writer quotes such fields so every row keeps
    # exactly two columns. lineterminator="\n" keeps the line endings of the
    # rows that were previously written with f.write(...).
    result_writer = csv.writer(f, lineterminator="\n")

    # Write headers if the file is empty
    if os.stat("closest_tcr.csv").st_size == 0:
        result_writer.writerow(["tcr_name", "closest_tcr"])

    # One lookup per input TCR
    for _, row_data in pdb_common.iterrows():
        alpha_seq = row_data['alpha_seq']
        beta_seq = row_data['beta_seq']
        tcr_name = row_data['tcr_id']

        # Find the closest TCR (a single pdb_id, or a list on ties)
        closest_tcr = find_closest_tcr(dataframe, alpha_seq, beta_seq, tcr_name)

        # Write the result to the CSV
        result_writer.writerow([tcr_name, closest_tcr])

        # Echo the result for confirmation
        print(f"Closest TCR for {tcr_name}: {closest_tcr}")