In [2]:
# Setup cell: imports, working directory and the shared annotation lookup.
import os
import pandas as pd
# NOTE(review): absolute path to one user's machine. Every relative path used
# later in this notebook ("./structures_annotation/...", "./pdb_nr", ...) is
# resolved against this working directory, so the notebook only runs where
# this directory exists.
os.chdir("/Users/alexascunceparis/Desktop/BSC/immuno_project/TCRranker")

# Project-local modules, imported after the chdir above (presumably they live
# in the project directory -- TODO confirm). The star imports are what bring
# helper names used by later cells into the notebook namespace; which module
# provides which name cannot be told from here, and a later star import
# silently shadows same-named objects from an earlier one.
from find_contact_map import *
from mapping import *
from select_nr_set import *
from extract_contacts import *

# Parsed once here; later cells pass `seq_dict` to extract_specific_sequences.
seq_dict=parse_general_file('./structures_annotation/general.txt')

In [3]:
def add_tcr_to_dataframe(df, alpha_seq, beta_seq, tcr_name):
    """
    Append one annotated TCR as the last row of a TCR table.

    Each chain is annotated with the notebook's helpers: ``run_anarci`` +
    ``parse_CDR3`` supply the CDR3 and ``get_germlines`` supplies the V and J
    gene. The input table is not modified; a new table is returned.

    Parameters:
    - df (pd.DataFrame): Existing TCR table to extend.
    - alpha_seq (str): TCR alpha chain sequence.
    - beta_seq (str): TCR beta chain sequence.
    - tcr_name (str): Identifier of the TCR; stored (as a string) in 'pdb_id'.

    Returns:
    - pd.DataFrame: ``df`` plus one trailing row, with a fresh 0..n-1 index.
    """
    def _annotate(chain_seq, chain_label):
        # Same call order as for a single chain: ANARCI -> CDR3 -> germlines.
        anarci_output = run_anarci(chain_seq, chain_label)
        cdr3, _ = parse_CDR3(anarci_output)
        v_gene, j_gene = get_germlines(chain_seq)
        return cdr3, v_gene, j_gene

    # Alpha is annotated with chain label "D", beta with "E".
    cdr3_alpha, v_gene_alpha, j_gene_alpha = _annotate(alpha_seq, "D")
    cdr3_beta, v_gene_beta, j_gene_beta = _annotate(beta_seq, "E")

    record = {
        'pdb_id': f"{tcr_name}",
        'cdr3_a_aa': cdr3_alpha,
        'v_a_gene': v_gene_alpha,
        'j_a_gene': j_gene_alpha,
        'cdr3_b_aa': cdr3_beta,
        'v_b_gene': v_gene_beta,
        'j_b_gene': j_gene_beta,
        'count': 1,
    }

    # One-row frame built column-wise (each value wrapped in a 1-element list).
    new_row = pd.DataFrame({column: [value] for column, value in record.items()})

    return pd.concat([df, new_row], ignore_index=True)


def find_closest_tcr(df, alpha_seq, beta_seq, tcr_name):
    """
    Finds the TCR(s) in `df` closest to a new TCR.

    The new TCR is appended to a copy of `df` (via `add_tcr_to_dataframe`) and
    compared one-by-one against every pre-existing row. The distance used is
    the sum of the alpha and beta rectangular distances that `TCRrep`
    exposes as `rw_alpha` / `rw_beta`.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing existing TCR information.
    - alpha_seq (str): The TCR alpha chain sequence of the new TCR.
    - beta_seq (str): The TCR beta chain sequence of the new TCR.
    - tcr_name (str): The TCR name for the new entry. Rows whose `pdb_id`
      equals this name (compared as strings) are excluded from the search.

    Returns:
    - str | list: The pdb_id of the closest TCR, or a list of pdb_ids when
      several rows tie for the minimum distance.

    Raises:
    - ValueError: If no row of `df` could be compared with the new TCR.
    """
    # Add the new TCR to the DataFrame; it becomes the last row.
    df = add_tcr_to_dataframe(df, alpha_seq, beta_seq, tcr_name)
    last_row = df.iloc[[-1]]

    # Reference database, resolved against the current working directory.
    db_file_path = os.path.join(os.getcwd(), 'structures_annotation', 'alphabeta_gammadelta_db.tsv')

    # The representation of the new TCR is the same for every comparison, so
    # build it once instead of once per row.
    # NOTE(review): assumes compute_rect_distances does not modify the
    # clone_df passed as `df` -- confirm against the tcrdist3 docs.
    tr_last_row = TCRrep(cell_df=last_row,
                         organism='human',
                         chains=['alpha', 'beta'],
                         compute_distances=False,
                         db_file=db_file_path)

    # (pdb_id, global distance) pairs. Keeping the id next to its distance is
    # what keeps the lookup correct when rows are skipped: a plain list of
    # distances would no longer line up with the row positions of `df`.
    candidates = []

    # Every row except the last one (the new TCR itself).
    for end_row in range(len(df) - 1):
        current_row = df.iloc[[end_row]]
        current_pdb_id = current_row['pdb_id'].values[0]

        # Skip the entry that is the query itself. Compare as strings because
        # add_tcr_to_dataframe stores the name as a string while callers may
        # pass e.g. an integer id.
        if str(current_pdb_id) == str(tcr_name):
            continue

        tr_current = TCRrep(cell_df=current_row,
                            organism='human',
                            chains=['alpha', 'beta'],
                            compute_distances=False,
                            db_file=db_file_path)

        # Rectangular distances: new TCR (rows) vs. current TCR (columns).
        tr_current.compute_rect_distances(df=tr_last_row.clone_df, df2=tr_current.clone_df)

        alpha_row = tr_current.rw_alpha[0]
        beta_row = tr_current.rw_beta[0]

        # Presumably empty when TCRrep could not keep the row (e.g. an
        # unrecognised gene) -- TODO confirm. Such a row has no distance.
        if len(alpha_row) == 0 or len(beta_row) == 0:
            continue

        # Global distance = alpha distance + beta distance.
        candidates.append((current_pdb_id, alpha_row[0] + beta_row[0]))

    if not candidates:
        raise ValueError("No valid TCRs found for comparison.")

    min_value = min(distance for _, distance in candidates)
    pdb_ids_with_min_distance = [pdb_id for pdb_id, distance in candidates if distance == min_value]

    # A single winner is returned as a bare id, ties as a list (in row order).
    return pdb_ids_with_min_distance[0] if len(pdb_ids_with_min_distance) == 1 else pdb_ids_with_min_distance

# Extract TCRs from PDB_files

In [None]:
import os
import csv

# Folder holding the non-redundant PDB structures.
pdb_folder = "./pdb_nr/"

# Dump one (tcr_id, alpha_seq, beta_seq) row per structure into a CSV.
with open('pdb_sequences.csv', mode='w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['tcr_id', 'alpha_seq', 'beta_seq'])

    for pdb_file in os.listdir(pdb_folder):
        # Anything that is not a .pdb file is ignored.
        if not pdb_file.endswith(".pdb"):
            continue

        pdb_file_path = os.path.join(pdb_folder, pdb_file)

        # The id is the part of the file name before the first dot.
        pdb_id = pdb_file.split(".")[0]

        # The epitope is returned as well but is not written out.
        alpha_seq, beta_seq, epitope = extract_specific_sequences(pdb_file_path, seq_dict)

        csv_writer.writerow([pdb_id, alpha_seq, beta_seq])

print("CSV file written successfully!")

# Find similar TCRs for sequence data

In [4]:
import os
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# Reference TCR table and the query TCRs to annotate.
dataframe=pd.read_csv("./structures_annotation/TCRdist_df.csv")
common_df=pd.read_csv("./seqs_info/seqs_test_groups.csv")

# Results file; rows are appended so an interrupted run can be resumed.
file_path = "./seqs_info/closest_tcr.csv"

# Collect the TCR names already present in the results file. They are kept
# as strings, exactly as they appear in the first column.
existing_tcrs = set()
if os.path.exists(file_path):
    with open(file_path, "r") as f:
        for line in f:
            if line.startswith("tcr_name"):
                continue  # Skip the header
            # Only the first field is needed; tie rows hold a list with
            # commas in the second field, so the line is not parsed further.
            tcr_name_existing = line.split(",")[0].strip()
            if tcr_name_existing:
                existing_tcrs.add(tcr_name_existing)

# Open the file for appending
with open(file_path, "a") as f:
    # Write the header if the file is empty
    if os.stat(file_path).st_size == 0:
        f.write("tcr_name,closest_tcr\n")

    # Iterate over the query TCRs
    for _, row_data in common_df.iterrows():
        alpha_seq = row_data['TRA_aa']
        beta_seq = row_data['TRB_aa']
        tcr_name = row_data['TCR_name']

        # Skip TCRs that already have a row. The set holds strings, so the
        # name must be compared as a string: comparing an int against it
        # never matches and would recompute (and duplicate) every TCR.
        if str(tcr_name) in existing_tcrs:
            print(f"Skipping {tcr_name}, already exists in closest_tcr.csv.")
            continue

        # Find the closest TCR
        closest_tcr = find_closest_tcr(dataframe, alpha_seq, beta_seq, tcr_name)

        # Write the result and flush so it survives an interrupted run
        f.write(f"{tcr_name},{closest_tcr}\n")
        f.flush()

        # Remember it so a duplicated name in the input is not processed twice
        existing_tcrs.add(str(tcr_name))

        # Print the result for confirmation
        print(f"Closest TCR for {tcr_name}: {closest_tcr}")

Closest TCR for 32145: 5tez
Closest TCR for 1333: 2nx5
Closest TCR for 31905: 5tez
Closest TCR for 32308: ['4g9f', '4g8g']
Closest TCR for 21404: 3vxm
Closest TCR for 30362: 5d2n
Closest TCR for 1037: 5d2l
Closest TCR for 32953: 3tjh
Closest TCR for 32560: 8i5d
Closest TCR for 1199: 6bj2
Closest TCR for 32448: 6rpa
Closest TCR for 33472: ['7nmg', '7nme', '7nmf']
Closest TCR for 31880: 7phr
Closest TCR for 764: ['6bj3', '6bj8', '5xot']
Closest TCR for 32447: ['7n5c', '7n4k', '3pqy', '7n5p']
Closest TCR for 21442: ['6vma', '6vmc', '6vm9']
Closest TCR for 30610: 5sws
Closest TCR for 32401: 4mji
Closest TCR for 32737: 8qfy
Closest TCR for 33256: ['7n2r', '7n2p', '7n2q']
Closest TCR for 21410: ['5jzi', '5yxn']
Closest TCR for 33086: 5men
Closest TCR for 32667: 1bd2
Closest TCR for 32215: ['6vm7', '6vm8']
Closest TCR for 1016: 4eup
Closest TCR for 32382: 7byd
Closest TCR for 32508: 4qrr
Closest TCR for 33049: ['7nmg', '7nme', '7nmf']
Closest TCR for 33050: 6uon
Closest TCR for 21448: 7pbe
Cl

In [None]:
# 20426,6p64 (last row)

# Find similar TCR for structural data (pdb_files)

In [6]:
import pandas as pd
import os
import csv
import warnings

# Suppress warnings
warnings.filterwarnings("ignore")

# Load the reference TCR table and the query TCRs
dataframe = pd.read_csv("./structures_annotation/TCRdist_df.csv")
pdb_common = pd.read_csv("./input/sequences_similar_allinfo.csv")

# Error messages collected while processing, reported at the end
error_list = []

# Collect the TCR names that already have a row in the output file.
# The file is read field-by-field rather than with pd.read_csv: tie rows are
# written as "name,['a', 'b']" and therefore carry more comma-separated
# fields than the header, which pd.read_csv can reject or mis-parse. Only the
# first field (the name, as a string) is needed here.
output_file = "./structures_annotation/closest_tcr_sequences.csv"
processed_tcr_names = set()
if os.path.exists(output_file) and os.stat(output_file).st_size > 0:
    with open(output_file, newline="") as done_file:
        for record in csv.reader(done_file):
            if record and record[0] != "tcr_name":
                processed_tcr_names.add(record[0])

# Open the output file in append mode
with open(output_file, "a") as f:
    # Write headers if the file is empty
    if os.stat(output_file).st_size == 0:
        f.write("tcr_name,closest_tcr\n")

    # Iterate through rows of the pdb_common DataFrame
    for _, row_data in pdb_common.iterrows():
        tcr_name = row_data['tcr_id']

        # Skip TCRs already in the output file (names compared as strings,
        # matching how they were read back above)
        if str(tcr_name) in processed_tcr_names:
            print(f"Skipping {tcr_name}, already processed.")
            continue

        alpha_seq = row_data['alpha_seq']
        beta_seq = row_data['beta_seq']

        try:
            # Find the closest TCR
            closest_tcr = find_closest_tcr(dataframe, alpha_seq, beta_seq, tcr_name)

            # Write the result and flush so it survives an interrupted run
            f.write(f"{tcr_name},{closest_tcr}\n")
            f.flush()
            processed_tcr_names.add(str(tcr_name))

            # Print the result for confirmation
            print(f"Closest TCR for {tcr_name}: {closest_tcr}")

        except Exception as e:
            # Best effort: one failing TCR must not stop the batch. The
            # failure is recorded and reported again after the loop.
            error_message = f"Error processing {tcr_name}: {str(e)}"
            error_list.append(error_message)
            print(error_message)

# At the end of processing, print the errors if there were any
if error_list:
    print("\nErrors encountered:")
    for error in error_list:
        print(error)

FileNotFoundError: [Errno 2] No such file or directory: './input/sequences_similar_allinfo.csv'

# Manual annotation

In [27]:
# Manual annotation of a single structure: extract its chains and print the
# CDR3 / germline annotation of both TCR chains.
dataframe=pd.read_csv("./structures_annotation/TCRdist_df.csv")
# NOTE(review): common_df is loaded but not used anywhere in this cell.
common_df=pd.read_csv("./structures_annotation/tcr_common.csv")

pdb_path="./pdb_files/5wkf.pdb"
# File name without directory and extension, e.g. "5wkf".
pdb_id=pdb_path.split("/")[-1].split(".")[0]
alpha_seq, beta_seq, epitope = extract_specific_sequences(pdb_path, seq_dict)
print(alpha_seq,beta_seq,epitope)
tcr_id=pdb_id
# NOTE(review): get_germlines returns a (V gene, J gene) pair (see
# add_tcr_to_dataframe), so `trab` holds the alpha V gene and `trvj` the beta
# J gene; the names look like typos for trav / trbj. They are left unchanged
# because other cells may read these notebook-level variables.
trab, traj = get_germlines(alpha_seq)
trbv, trvj = get_germlines(beta_seq)
# Alpha is annotated with chain label "D", beta with "E".
cdr3a, _ = parse_CDR3(run_anarci(alpha_seq, "D"))
cdr3b, _ = parse_CDR3(run_anarci(beta_seq, "E"))

# Printed order: id, beta CDR3, beta V, beta J, alpha CDR3, alpha V, alpha J.
print(pdb_id,cdr3b,trbv,trvj,cdr3a,trab,traj)

QPVQSPQAVILREGEDAIINCSSSKALYSVHWYRQKHGEAPIFLMILLKGGEQKGHDKISASFNEKKQQSSLYLTASQLSYSGTYFCGLGDAGNMLTFGGGTRLMVKPHIQNPDPAVYQLRDSKSSDKSVCLFTDFDSQTNVSQSKDSDVYITDKCVLDMRSMDFKSNSAVAWSNKSDFACANAFNNSIIPEDTFFPS AGVAQSPRYKIIEKRQSVAFWCNPISGHATLYWYQQILGQGPKLLIQFQNNGVVDDSQLPKDRFSAERLKGVDSTLKIQPAKLEDSAVYLCASSLGQGLLYGYTFGSGTRLTVLEDLNKVFPPEVAVFEPSEAEISHTQKATLVCLATGFYPDHVELSWWVNGKEVHSGVCTDPQPLKEQPALNDSRYALSSRLRVSATFWQNPRNHFRCQVQFYGLSENDEWTQDRAKPVTQIVSAEAWGRAD GTSGSPIVNR
5wkf ASSLGQGLLYGYT TRBV11-2*01 TRBJ1-2*01 GLGDAGNMLT TRAV30*05 TRAJ39*01


In [None]:
# Closest reference TCR for the manually annotated structure.
dataframe=pd.read_csv("./structures_annotation/TCRdist_df.csv")
# Use `tcr_id` (set together with alpha_seq / beta_seq in the manual
# annotation cell). `tcr_name` is only a leftover loop variable of the batch
# cells, so passing it would exclude the wrong entry from the search, or fail
# with a NameError on a fresh kernel.
closest_tcr = find_closest_tcr(dataframe, alpha_seq, beta_seq, tcr_id)

# FIND SIMILAR MHCs

In [7]:
import os
import csv

# Build pdb_id -> {role: chain id} from the annotation table.
general_df = pd.read_csv('./structures_annotation/general.txt', sep='\t')
chain_dict = {}
for pdb_id, group in general_df.groupby('pdb.id'):
    chains = {
        'tcra_chain': None,
        'tcrb_chain': None,
        'peptide_chain': None,
        'mhc_chain': None
    }

    for _, row in group.iterrows():
        if row['chain.component'] == 'TCR' and row['chain.type'] == 'TRA':
            chains['tcra_chain'] = row['chain.id']
        elif row['chain.component'] == 'TCR' and row['chain.type'] == 'TRB':
            chains['tcrb_chain'] = row['chain.id']
        elif row['chain.component'] == 'PEPTIDE':
            chains['peptide_chain'] = row['chain.id']
        elif row['chain.component'] == 'MHC' and row['chain.type'] == 'MHCa':
            chains['mhc_chain'] = row['chain.id']

    chain_dict[pdb_id] = chains


def _mhc_sequence(pdb_path, pdb_id):
    """Return the annotated MHC alpha chain sequence of a PDB file, or None
    when the structure has no MHC chain annotation or the chain is absent."""
    mhc_chain = chain_dict.get(pdb_id, {}).get('mhc_chain')
    if mhc_chain is None:
        return None
    sequences = extract_sequences(pdb_path)
    try:
        return sequences[mhc_chain]
    except KeyError:
        return None


# Initialize output file; already processed PDB ids are skipped on re-run.
output_file = "./closest_mhc_results.csv"
if os.path.exists(output_file) and os.stat(output_file).st_size > 0:
    # dtype=str keeps ids that look numeric (e.g. "1e12") as text.
    processed_df = pd.read_csv(output_file, dtype=str)
    processed_pdb_ids = set(processed_df['pdb_id'])  # Set of already processed PDB IDs
else:
    processed_pdb_ids = set()

# Parse every reference structure once, instead of once per query structure.
# Keyed by file name (insertion order = directory listing order) so the
# "not the same file" exclusion below works as before.
reference_mhc = {}
for pdb_file2 in os.listdir("./pdb_files"):
    if pdb_file2.endswith(".pdb"):
        pdb_id2 = pdb_file2.split(".")[0]
        reference_seq = _mhc_sequence(os.path.join("./pdb_files", pdb_file2), pdb_id2)
        if reference_seq is not None:
            reference_mhc[pdb_file2] = (pdb_id2, reference_seq)

# Open output CSV file in append mode
with open(output_file, mode="a", newline="") as csvfile:
    writer = csv.writer(csvfile)

    # Write header only if the file is empty
    if os.stat(output_file).st_size == 0:
        writer.writerow(["pdb_id", "closest_mhc"])  # Write header

    # Iterate through each PDB file in pdb_nr
    for pdb_file1 in os.listdir("./pdb_nr"):
        if not pdb_file1.endswith(".pdb"):
            continue

        pdb_id1 = pdb_file1.split(".")[0]

        # Skip processing if PDB is already processed
        if pdb_id1 in processed_pdb_ids:
            continue

        pdb_file_path1 = os.path.join("./pdb_nr", pdb_file1)
        query_seq = _mhc_sequence(pdb_file_path1, pdb_id1)
        if query_seq is None:
            print(f"Skipping {pdb_id1}: no annotated MHC chain sequence found.")
            continue

        # Alignment score of the query MHC against every other reference MHC
        scores = {}
        for pdb_file2, (pdb_id2, reference_seq) in reference_mhc.items():
            if pdb_file2 == pdb_file1:
                continue
            aligned_seq_pdb, aligned_seq_query, score = global_alignment(query_seq, reference_seq)
            scores[pdb_id2] = score

        if not scores:
            print(f"Skipping {pdb_id1}: no reference structure to compare with.")
            continue

        # Find the best match(es) with the highest score
        max_score = max(scores.values())
        max_pdb_ids = [pdb_id for pdb_id, score in scores.items() if score == max_score]

        # Format closest_mhc as a Python list with single quotes
        closest_mhc_list = "['" + "', '".join(max_pdb_ids) + "']"

        print(f"Closest MHC for {pdb_id1}: {closest_mhc_list}")

        # Write the result and flush so it survives an interrupted run
        writer.writerow([pdb_id1, closest_mhc_list])
        csvfile.flush()

print("Processing complete. Results saved to closest_mhc_results.csv.")

Closest TCR for 2ckb: ['1g6r']


KeyboardInterrupt: 

In [None]:
import os
import csv
#Sequence df
# TODO: `df` must be loaded before running this cell (columns used below:
# 'tcr_id' and 'mhc_seq'); the line below is the author's placeholder.
# df=pd.read_csv("./structures_annotation/X.csv")

# Reference MHC sequences, parsed once. `chain_dict` (pdb_id -> chain ids) is
# built by the structure-based MHC cell, which must have been run first.
reference_mhc = {}
for pdb_file2 in os.listdir("./pdb_files"):
    if pdb_file2.endswith(".pdb"):
        pdb_id2 = pdb_file2.split(".")[0]
        pdb_file_path2 = os.path.join("./pdb_files", pdb_file2)
        mhc_chain2 = chain_dict.get(pdb_id2, {}).get('mhc_chain')
        if mhc_chain2 is None:
            continue  # no MHC chain annotated for this structure
        seq_pdb2 = extract_sequences(pdb_file_path2)
        try:
            reference_mhc[pdb_id2] = seq_pdb2[mhc_chain2]
        except KeyError:
            continue  # annotated chain not present in the file

# NOTE(review): mode "w" truncates closest_mhc_results.csv, the same file the
# structure-based cell appends its resumable results to -- confirm that
# overwriting it is intended.
with open("closest_mhc_results.csv", mode="w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["pdb_id", "closest_mhc"])  # Write the header row

    for _, row_data in df.iterrows():
        scores={}
        tcr_name = row_data['tcr_id']
        mhc_seq = row_data['mhc_seq']

        print(f"Processing TCR: {tcr_name}")

        for pdb_id2, reference_seq in reference_mhc.items():
            # Perform global alignment
            aligned_seq_pdb, aligned_seq_query, score = global_alignment(mhc_seq, reference_seq)

            # Store the score in the dictionary
            scores[pdb_id2] = score
            print(f" Compared {tcr_name} with {pdb_id2}: Score = {score}")

        # The best match is picked once per TCR, after all references have
        # been scored (not after each reference file).
        if not scores:
            print(f"No reference MHC available for {tcr_name}; skipping.")
            continue

        max_score = max(scores.values())
        max_pdb_ids = [pdb_id for pdb_id, score in scores.items() if score == max_score]

        if len(max_pdb_ids) == 1:
            closest_mhc = max_pdb_ids[0]
            print(f"For PDB {tcr_name}, the best match is: {closest_mhc} with score {max_score}.")
        else:
            closest_mhc = ", ".join(max_pdb_ids)
            print(f"For PDB {tcr_name}, the best matches are: {closest_mhc} with score {max_score}.")

        # Write the results to the CSV file
        writer.writerow([tcr_name, closest_mhc])

print("Processing complete. Results saved to closest_mhc_results.csv.")