# Pipeline to clean and organise the data before training the model

In [None]:
import os

In [None]:
# Fail fast if any input file is missing, and report all of them at once
# (the files can be downloaded from the Zenodo repository linked below).
required_files = [
    "Data/Locibase.json",
    "Data/esm2_embeddings_rbp.csv",
    "Data/phage_host_interactions.csv",
    "Data/RBPbase.csv",
]
missing_files = [path for path in required_files if not os.path.isfile(path)]
if missing_files:
    # An explicit exception (instead of a bare assert) still fires under `python -O`
    # and tells the reader exactly which files to fetch.
    raise FileNotFoundError("Missing required input file(s): " + ", ".join(missing_files))

Zenodo repository:
https://zenodo.org/records/11061100

Files needed:

- _"Locibase.json"_
- _"esm2_embeddings_rbp.csv"_
- _"phage_host_interactions.csv"_
- _"RBPbase.csv"_


Files generated:
- _"esm2_embeddings_loci_per_protein.csv"_ <br>
Contains one ESM-2 embedding for each individual protein of each host locus
<br>
- _"all_interactions_no_embeddings.csv"_ <br>
Contains phage-host interactions, without ESM-2 embeddings (to make it lighter) <br>
- _"all_infections.csv"_ <br>
Contains the same interactions, with the protein sequences from "RBPbase.csv" added <br>
- _"kaptive_results.tsv"_ <br>
Contains K-loci information for each host, extracted using Kaptive <br>

# Obtaining individual host proteins

Generates "esm2_embeddings_loci_per_protein.csv" from "Locibase.json".

In [None]:
!pip install fair-esm

In [None]:
import torch

# Show the installed PyTorch version (bare last expression -> displayed as the cell output)
torch.__version__

In [None]:
import json
import pandas as pd
import numpy as np
import torch
import esm
from tqdm import tqdm

def compute_esm2_embeddings_loci_per_protein(general_path, data_suffix='', add=False):
    """
    Compute one ESM-2 embedding per individual locus protein listed in the Locibase json file.

    Every sequence is passed through the pretrained esm2_t33_650M_UR50D model on its own and
    its layer-33 token representations are averaged over the sequence positions.

    INPUTS:
    - general_path: path to the project data folder (holds 'Locibase<data_suffix>.json')
    - data_suffix: suffix to optionally add to the input and output file names (default='')
    - add: if True, read the existing 'esm2_embeddings_loci_per_protein<data_suffix>.csv',
      embed only the (accession, protein_index) pairs that are not in it yet and append the
      new rows to the old ones (default=False)
    OUTPUT: the embeddings DataFrame (columns 'accession', 'protein_index', then one column per
    embedding dimension); it is also saved as 'esm2_embeddings_loci_per_protein<data_suffix>.csv'
    in general_path
    """
    locibase_path = general_path + '/Locibase' + data_suffix + '.json'
    output_path = general_path + '/esm2_embeddings_loci_per_protein' + data_suffix + '.csv'

    # Load ESM-2 model
    model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
    batch_converter = alphabet.get_batch_converter()
    model.eval()  # disables dropout for deterministic results

    # Load json file
    with open(locibase_path) as dict_file:
        loci_dict = json.load(dict_file)

    # Pair every sequence with its ORIGINAL position inside its locus, so that the
    # protein_index stays correct even after already-processed proteins are filtered out.
    pending = {key: list(enumerate(sequences)) for key, sequences in loci_dict.items()}

    # If embeddings already exist, only the missing ones are computed and appended to them
    old_embeddings_df = None
    if add:
        # json keys are always strings, so accessions are read back as strings as well
        old_embeddings_df = pd.read_csv(output_path, dtype={'accession': str})
        processed_accession_proteins = set(zip(old_embeddings_df['accession'], old_embeddings_df['protein_index']))
        pending = {
            key: [(i, seq) for i, seq in indexed if (key, i) not in processed_accession_proteins]
            for key, indexed in pending.items()
        }
        print('Processing', sum(len(v) for v in pending.values()), 'more protein sequences (add=True)')

    # Compute embeddings per protein
    protein_representations = []
    accessions = []
    protein_indices = []

    for key in tqdm(pending.keys(), desc="Embedding loci proteins"):
        for idx, sequence in pending[key]:
            data = [(f"{key}_prot_{idx}", sequence)]
            _, _, batch_tokens = batch_converter(data)
            with torch.no_grad():
                # Contact maps are never used here, so they are not requested;
                # the layer-33 representations are the same either way.
                results = model(batch_tokens, repr_layers=[33], return_contacts=False)
            token_representations = results["representations"][33]
            # Skip the token at position 0 and average over the len(sequence) residue positions
            protein_embedding = token_representations[0, 1 : len(sequence) + 1].mean(0).numpy()

            accessions.append(key)
            protein_indices.append(idx)
            protein_representations.append(protein_embedding)

    # Save results
    embeddings_df = pd.concat([
        pd.DataFrame({'accession': accessions, 'protein_index': protein_indices}),
        pd.DataFrame(protein_representations)
    ], axis=1)

    if add:
        # Columns read back from the CSV are strings ('0', '1', ...) while the freshly built
        # ones are integers; align them so concat stacks the rows instead of creating a
        # second, NaN-filled set of embedding columns.
        embeddings_df.columns = embeddings_df.columns.map(str)
        embeddings_df = pd.concat([old_embeddings_df, embeddings_df], axis=0, ignore_index=True)

    embeddings_df.to_csv(output_path, index=False)
    print("Saved embeddings to:", output_path)

    return embeddings_df


# Folder that contains Locibase.json
loci_path = "Data"

compute_esm2_embeddings_loci_per_protein(general_path=loci_path)

# Obtaining confirmed infections-only dataset ('all_interactions_no_embeddings.csv')

Generates 'all_interactions_no_embeddings.csv' from 'esm2_embeddings_loci_per_protein.csv', 'esm2_embeddings_rbp.csv' and 'phage_host_interactions.csv'.

Then adds the protein sequences from "RBPbase.csv" to it, to generate "all_infections.csv".

In [None]:
!pip install xgboost
!pip install scikit-learn

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import LeaveOneGroupOut
from xgboost import XGBClassifier
from tqdm import tqdm
from sklearn.metrics import roc_auc_score, auc, roc_curve
import matplotlib.pyplot as plt
import pickle
import os.path



# Inputs: per-protein host embeddings, RBP embeddings and the phage-host interaction matrix
embeddings_loci_protein = pd.read_csv("Data/esm2_embeddings_loci_per_protein.csv") # generated above
embeddings_rbp = pd.read_csv("Data/esm2_embeddings_rbp.csv")
phage_host_interactions = pd.read_csv('Data/phage_host_interactions.csv')

# Create a single dataset that has host, phage, and interactions, but not embeddings

# Reshape the matrix (first column = host accession, one column per phage) into long
# (accession, phage_ID, label) rows
interactions_melted = phage_host_interactions.melt(
    id_vars=['Unnamed: 0'], var_name='phage_ID', value_name='label'
).rename(columns={'Unnamed: 0': 'accession'})

# Host/phage pairs without a recorded label are dropped
interactions_melted = interactions_melted.dropna(subset=['label'])

# Only the identifier columns end up in the output, so the merges are done on the
# de-duplicated keys instead of on the full embedding tables. Merging the full tables
# repeats every interaction once per locus protein and drags all embedding columns along,
# only for them to be discarded right after.
host_keys = embeddings_loci_protein[['accession']].drop_duplicates()
rbp_keys = embeddings_rbp[['phage_ID', 'protein_ID']].drop_duplicates()

merged = interactions_melted.merge(host_keys, on='accession', how='inner')
merged = merged.merge(rbp_keys, on='phage_ID', how='inner')

final_df = merged[['accession', 'phage_ID', 'protein_ID', "label"]]

print(len(final_df))
# Non in-place calls: final_df is a column slice of `merged`, and mutating a slice
# in place triggers pandas' SettingWithCopyWarning.
final_df = final_df.drop_duplicates().reset_index(drop=True)
print(len(final_df))

final_df.to_csv('Data/all_interactions_no_embeddings.csv', index=False)
print("Final per-protein dataframe saved as 'Data/all_interactions_no_embeddings.csv'.")

In [None]:
# Attach the Receptor-Binding Protein sequences to the interactions (no embeddings) file
interactions_no_embeddings = pd.read_csv("Data/all_interactions_no_embeddings.csv")

# Only the identifier and the sequence columns of RBPbase are kept
RBProteins = pd.read_csv("Data/RBPbase.csv")
RBProteins = RBProteins.loc[:, ["protein_ID", "protein_sequence"]]

# Left join: every interaction row is kept, with its sequence added when one is found
proteins_no_embeddings = interactions_no_embeddings.merge(RBProteins, how="left", on="protein_ID")

proteins_no_embeddings.to_csv("Data/all_infections.csv", index=False)

# Using Kaptive to determine K-loci

Requires downloading and unzipping "klebsiella_genomes.zip".

Generates "kaptive_results.tsv".

In [None]:
!pip install kaptive
!apt-get install minimap2

In [None]:
# K-LOCUS EXTRACTION:
# Runs Kaptive in "assembly" mode with the kpsc_k reference database on every .fasta file
# of the folder below, and writes the results table to Data/kaptive_results.tsv.
# Replace /path_to_fasta_files/ with the actual location of the unzipped genome assemblies.
# NOTE(review): -j and -p presumably also request the JSON and plot outputs; confirm with `kaptive assembly --help`.

!kaptive assembly kpsc_k /path_to_fasta_files/fasta_files/*.fasta -o Data/kaptive_results.tsv -j -p

# Takes about 8 minutes to run

# Downloading concatenated RBPs of phages that infect specific K-loci as fasta files

Requires "all_infections.csv" and "kaptive_results.tsv".

Generates a .fasta file that contains, for each phage infecting a host that belongs to a given K-locus, that phage's proteins, concatenated.

In [None]:
!pip install biopython

In [None]:
import pandas as pd
# Load the tab-separated Kaptive results and preview the first two rows
df_sero = pd.read_table("Data/kaptive_results.tsv")

df_sero.head(2)

In [None]:
# Number of distinct "Best match type" values (K-locus types) in the Kaptive results
df_sero["Best match type"].nunique()
# 1280 + 87
# NOTE(review): the meaning of the two counts above is not documented here; confirm and explain.

In [None]:
import pandas as pd

# Inputs of this section: the infections table and the (tab-separated) Kaptive results
all_infections = pd.read_csv("Data/all_infections.csv")
df_sero = pd.read_table("Data/kaptive_results.tsv")

In [None]:
# Preview the first two rows of the infections table
all_infections.head(2)

In [None]:
# Join every infection row with the Kaptive result of its host
# ("Best match type" is the K-locus serotype of the host with that "accession")
df_sero = df_sero.loc[:, ["Assembly", "Best match type", "Match confidence"]]

sero_phage = all_infections.merge(
    df_sero, how="left", left_on="accession", right_on="Assembly"
).drop(columns="Assembly")

# Keep only rows whose match is "Typeable" and whose type is not "Capsule null"
sero_phage = sero_phage[
    (sero_phage["Match confidence"] == "Typeable")
    & (sero_phage["Best match type"] != "Capsule null")
]

sero_phage.head(2)