In [1]:
import pandas as pd
import os
from transformers import AutoModel, AutoTokenizer
import torch
import numpy as np
import pickle
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [2]:
def seed_everything(seed: int):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed applied to every generator.

    Side effects:
        Sets ``PYTHONHASHSEED`` in ``os.environ`` and switches cuDNN to
        deterministic mode with autotuning disabled.
    """
    import random, os
    import numpy as np
    import torch

    random.seed(seed)
    # NOTE(review): hash randomisation is fixed at interpreter start-up, so this
    # presumably only influences processes spawned after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device, not just the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN pick kernels by timing them, which can vary
    # between runs; it must be off for reproducible results.
    torch.backends.cudnn.benchmark = False

seed_everything(42)

In [3]:
!git clone https://github.com/adaptyvbio/lemanic_2024.git
!git clone https://github.com/AliSaadatV/PPI_Lemanic_Hackathon.git

Cloning into 'lemanic_2024'...
remote: Enumerating objects: 41, done.[K
remote: Counting objects: 100% (41/41), done.[K
remote: Compressing objects: 100% (27/27), done.[K
remote: Total 41 (delta 16), reused 36 (delta 13), pack-reused 0[K
Receiving objects: 100% (41/41), 1.99 MiB | 5.43 MiB/s, done.
Resolving deltas: 100% (16/16), done.
Cloning into 'PPI_Lemanic_Hackathon'...
remote: Enumerating objects: 50, done.[K
remote: Counting objects: 100% (50/50), done.[K
remote: Compressing objects: 100% (34/34), done.[K
remote: Total 50 (delta 19), reused 43 (delta 12), pack-reused 0[K
Receiving objects: 100% (50/50), 26.22 MiB | 7.86 MiB/s, done.
Resolving deltas: 100% (19/19), done.


In [4]:
# Read the literature and experiment splits shipped with the challenge repository.
root = "lemanic_2024/data"
path_exp_train = root + "/experiment_train.csv"
path_exp_test =  root + "/experiment_test.csv"
path_lit_train = root + "/literature_train.csv"
path_lit_test =  root + "/literature_test.csv"

experiment_train_df, experiment_test_df, literature_train_df, literature_test_df = (
    pd.read_csv(csv_path)
    for csv_path in (path_exp_train, path_exp_test, path_lit_train, path_lit_test)
)

# Give every row a 1-based positional identifier ('id1', 'id2', ...); the
# heavy/light frames are joined on this column later in the notebook.
for frame in (experiment_train_df, experiment_test_df, literature_train_df, literature_test_df):
    frame['ID'] = [f'id{position}' for position in range(1, len(frame) + 1)]


In [5]:
def find_start_indices(df):
    """Return, per row, the offset of ``CDRH3`` inside ``VHorVHH``.

    Args:
        df: DataFrame with string columns ``CDRH3`` and ``VHorVHH``.

    Returns:
        A list with one entry per row: the 0-based position of the first
        occurrence of the CDRH3 string, or ``float('nan')`` when it does not
        occur in the sequence.
    """
    positions = []
    for _, record in df.iterrows():
        motif = record['CDRH3']
        chain = record['VHorVHH']
        offset = chain.find(motif)
        # str.find signals "not found" with -1; expose that as NaN instead.
        positions.append(offset if offset != -1 else float('nan'))
    return positions

# For every split: locate the CDRH3 inside the heavy-chain sequence, then derive
# the end offset as start + CDRH3 length (a NaN start yields a NaN stop).
for frame in (literature_train_df, experiment_train_df, experiment_test_df, literature_test_df):
    frame["start_index_CDRH3"] = find_start_indices(frame)
    frame['stop_index_CDRH3'] = frame['start_index_CDRH3'] + frame['CDRH3'].str.len()

In [14]:
# Tokenizer and model for the heavy chain, loaded from the Hugging Face
# checkpoint 'qilowoq/AbLang_heavy'.
# max_len is the number of residues kept per sequence: get_embeddings_heavy
# slices every sequence to this length before tokenizing.
max_len = 157

tokenizer_heavy = AutoTokenizer.from_pretrained('qilowoq/AbLang_heavy', truncation=True, max_length=max_len)
# trust_remote_code=True runs the model code shipped with the checkpoint.
# NOTE(review): consider pinning a `revision` so that code cannot change silently.
model_heavy = AutoModel.from_pretrained('qilowoq/AbLang_heavy', trust_remote_code=True)

In [18]:
def get_embeddings_heavy(df, model=model_heavy, tokenizer=tokenizer_heavy):
  """Embed each heavy chain and mean-pool the hidden states of its CDRH3 span.

  Args:
    df: DataFrame with columns ``VHorVHH``, ``start_index_CDRH3`` and
      ``stop_index_CDRH3`` (the index columns may be NaN).
    model: Model whose output exposes ``last_hidden_state``.
    tokenizer: Tokenizer fed the residues joined by single spaces.

  Returns:
    A NumPy array with one pooled embedding per row of ``df``.

  Rows without a usable CDRH3 span (NaN start index, or a span that lies
  entirely beyond the ``max_len`` truncation) are pooled over the whole
  truncated sequence instead.
  """
  X = []
  for index, row in df.iterrows():
    # Truncate first: the fallback span below depends on the residue count,
    # so the sequence must exist before the span is chosen.
    residues = row['VHorVHH'][0:max_len]
    last = len(residues) + 1

    # Token positions are residue positions shifted by one
    # (presumably to skip a leading special token — TODO confirm).
    if pd.isna(row['start_index_CDRH3']):
      start, end = 1, last
    else:
      start = int(row["start_index_CDRH3"]) + 1
      end = min(int(row["stop_index_CDRH3"]) + 1, last)
      if start >= end:
        # The CDRH3 was cut off by truncation; an empty slice would average
        # to NaN, so fall back to the full sequence.
        start, end = 1, last

    seq = ' '.join(residues)
    encoded_input = tokenizer(seq, return_tensors='pt')
    with torch.no_grad():
      model_output = model(**encoded_input).last_hidden_state

    model_output_sliced = model_output[:, start:end, :]

    embedding = model_output_sliced.mean(dim=1)
    X.append(embedding.squeeze())

  return torch.stack(X).numpy()

In [19]:
# Optional: recompute the heavy-chain embeddings (slow). The precomputed
# pickles from the cloned repository can be loaded instead.
# The 'heavey' spelling of the experiment variables is kept because the cell
# that pickles them refers to these exact names.
X_heavy_lit_test, X_heavy_lit_train, X_heavey_exp_train, X_heavey_exp_test = (
    get_embeddings_heavy(frame)
    for frame in (literature_test_df, literature_train_df, experiment_train_df, experiment_test_df)
)

In [12]:
def save_variable(var, name=None):
  """Pickle ``var`` to ``<name>.pkl`` in the current working directory.

  Args:
    var: Object to serialise.
    name: File stem to write to. When omitted, the stem is ``str(var)``,
      which is only sensible for short string-like values; pass ``name``
      explicitly for arrays and other large objects.
  """
  stem = var if name is None else name
  with open(f'{stem}.pkl', 'wb') as f:
    pickle.dump(var, f)

In [27]:
# Persist each heavy-chain embedding matrix next to the notebook.
for pkl_name, matrix in (
    ('X_heavy_lit_test.pkl', X_heavy_lit_test),
    ('X_heavy_lit_train.pkl', X_heavy_lit_train),
    ('X_heavey_exp_train.pkl', X_heavey_exp_train),
    ('X_heavey_exp_test.pkl', X_heavey_exp_test),
):
    with open(pkl_name, 'wb') as f:
        pickle.dump(matrix, f)

In [6]:
# Load the precomputed heavy-chain embeddings from the cloned
# PPI_Lemanic_Hackathon repository.
# The experiment files are spelled 'X_heavey_*' on disk but are bound to the
# 'X_heavy_exp_*' names here.
# NOTE(review): pickle.load can execute arbitrary code; only load these files
# if the repository they come from is trusted.
with open('PPI_Lemanic_Hackathon/data/processed/X_heavy_lit_test.pkl', 'rb') as f:
    X_heavy_lit_test = pickle.load(f)

with open('PPI_Lemanic_Hackathon/data/processed/X_heavy_lit_train.pkl', 'rb') as f:
    X_heavy_lit_train = pickle.load(f)

with open('PPI_Lemanic_Hackathon/data/processed/X_heavey_exp_train.pkl', 'rb') as f:
    X_heavy_exp_train = pickle.load(f)

with open('PPI_Lemanic_Hackathon/data/processed/X_heavey_exp_test.pkl', 'rb') as f:
    X_heavy_exp_test = pickle.load(f)

In [17]:
# Integer label vectors taken from the "Binds" column of each heavy-chain split.
y_heavy_lit_test = np.asarray(literature_test_df["Binds"].values).astype(int)
y_heavy_lit_train = np.asarray(literature_train_df["Binds"].values).astype(int)
y_heavy_exp_test = np.asarray(experiment_test_df["Binds"].values).astype(int)
y_heavy_exp_train = np.asarray(experiment_train_df["Binds"].values).astype(int)

In [35]:
### Literature dataset
# PCA and SMOTE are seeded explicitly so that re-running this cell on its own
# (not only as part of a fresh Run All) reproduces the same scores.
pca = PCA(n_components=128, random_state=42)
X_train = pca.fit_transform(X_heavy_lit_train)
X_test = pca.transform(X_heavy_lit_test)
y_train = y_heavy_lit_train
y_test = y_heavy_lit_test

# Oversample the training split only; the test split keeps its class balance.
oversample = SMOTE(random_state=42)
X_train, y_train = oversample.fit_resample(X_train, y_train)

random_f1 = []
non_random_f1 = []
shuffle_rng = np.random.RandomState(42)
for seed in range(5):
    clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=seed)
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    non_random_f1.append(f1_score(y_test, predictions))
    # Chance baseline: same predicted class ratio, random assignment to samples.
    # permutation() returns a copy, so `predictions` itself is left intact.
    random_f1_ = [f1_score(y_test, shuffle_rng.permutation(predictions)) for _ in range(100)]
    random_f1.append(np.mean(random_f1_))
print(f"Non-random F1: {np.mean(non_random_f1):.2f} ± {np.std(non_random_f1):.2f}")
print(f"Random F1: {np.mean(random_f1):.2f} ± {np.std(random_f1):.2f}")

Non-random F1: 0.37 ± 0.01
Random F1: 0.24 ± 0.00


In [38]:
### Experiment dataset
# PCA and SMOTE are seeded explicitly so that re-running this cell on its own
# (not only as part of a fresh Run All) reproduces the same scores.
pca = PCA(n_components=128, random_state=42)
X_train = pca.fit_transform(X_heavy_exp_train)
X_test = pca.transform(X_heavy_exp_test)
y_train = y_heavy_exp_train
y_test = y_heavy_exp_test

# Oversample the training split only; the test split keeps its class balance.
oversample = SMOTE(random_state=42)
X_train, y_train = oversample.fit_resample(X_train, y_train)

random_f1 = []
non_random_f1 = []
shuffle_rng = np.random.RandomState(42)
for seed in range(5):
    clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=seed)
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    non_random_f1.append(f1_score(y_test, predictions))
    # Chance baseline: same predicted class ratio, random assignment to samples.
    # permutation() returns a copy, so `predictions` itself is left intact.
    random_f1_ = [f1_score(y_test, shuffle_rng.permutation(predictions)) for _ in range(100)]
    random_f1.append(np.mean(random_f1_))
print(f"Non-random F1: {np.mean(non_random_f1):.2f} ± {np.std(non_random_f1):.2f}")
print(f"Random F1: {np.mean(random_f1):.2f} ± {np.std(random_f1):.2f}")

Non-random F1: 0.43 ± 0.02
Random F1: 0.20 ± 0.01


In [14]:
### combine X with df
def combine_X_and_df_H(X, df):
  """Append the heavy-chain embedding columns ``H1..Hn`` to ``df``.

  Args:
    X: 2-D array with one embedding per row.
    df: DataFrame to extend; rows are matched to ``X`` by index label.

  Returns:
    A new DataFrame holding the columns of ``df`` followed by ``H1..Hn``.
  """
  feature_names = ['H' + str(k) for k in range(1, X.shape[1] + 1)]
  features = pd.DataFrame(X, columns=feature_names)
  return pd.concat([df, features], axis=1)

# Attach the heavy-chain embeddings to each split, then drop the rows whose
# CDRH3 could not be located in the sequence (NaN start index).
literature_test_df_merged = (
    combine_X_and_df_H(X_heavy_lit_test, literature_test_df)
    .dropna(subset=['start_index_CDRH3'])
)
literature_train_df_merged = (
    combine_X_and_df_H(X_heavy_lit_train, literature_train_df)
    .dropna(subset=['start_index_CDRH3'])
)
experiment_test_df_merged = (
    combine_X_and_df_H(X_heavy_exp_test, experiment_test_df)
    .dropna(subset=['start_index_CDRH3'])
)
experiment_train_df_merged = (
    combine_X_and_df_H(X_heavy_exp_train, experiment_train_df)
    .dropna(subset=['start_index_CDRH3'])
)

### Light Chain

In [19]:
# Light-chain tables shipped with the cloned PPI_Lemanic_Hackathon repository.
root_l = "PPI_Lemanic_Hackathon/data/raw_light_chain"
path_exp_train_l = root_l + "/experiment_train_vl_cdr.csv"
path_exp_test_l =  root_l + "/experiment_test_vl_cdr.csv"
path_lit_train_l = root_l + "/literature_train_vl_cdr.csv"
path_lit_test_l =  root_l + "/literature_test_vl_cdr.csv"

experiment_train_df_l, experiment_test_df_l, literature_train_df_l, literature_test_df_l = (
    pd.read_csv(csv_path)
    for csv_path in (path_exp_train_l, path_exp_test_l, path_lit_train_l, path_lit_test_l)
)

In [5]:
# Tokenizer and model for the light chain, loaded from the Hugging Face
# checkpoint 'qilowoq/AbLang_light'.
# max_len is the number of residues kept per sequence: get_embeddings_light
# slices every sequence to this length before tokenizing.
max_len = 157
tokenizer_light = AutoTokenizer.from_pretrained('qilowoq/AbLang_light', truncation=True, max_length=max_len)
# trust_remote_code=True runs the model code shipped with the checkpoint.
# NOTE(review): consider pinning a `revision` so that code cannot change silently.
model_light = AutoModel.from_pretrained('qilowoq/AbLang_light', trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/367 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/71.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.02k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/848 [00:00<?, ?B/s]

AbLang_roberta_model.py:   0%|          | 0.00/1.77k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/qilowoq/AbLang_light:
- AbLang_roberta_model.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin:   0%|          | 0.00/343M [00:00<?, ?B/s]

In [6]:
def get_embeddings_light(df, model=model_light, tokenizer=tokenizer_light):
  """Embed the ``vl1``..``vl5`` sequences of each row and average their pooled spans.

  For every row and every ``vlJ`` column (J = 1..5) the sequence is truncated
  to ``max_len`` residues, passed through the model, and the hidden states in
  ``[vlJ_start, vlJ_stop)`` are mean-pooled. The five pooled vectors are then
  averaged.

  Args:
    df: DataFrame with columns ``vlJ``, ``vlJ_start`` and ``vlJ_stop``.
    model: Model whose output exposes ``last_hidden_state``.
    tokenizer: Tokenizer fed the residues joined by single spaces.

  Returns:
    A tuple ``(X, row_idx_to_exclude)``. ``X`` has one row per row of ``df``.
    ``row_idx_to_exclude`` lists (once each) the index labels of rows where any
    of the five sequences could not be embedded; their entry in ``X`` is an
    all-zero placeholder and callers are expected to drop those rows.
  """
  X = []
  row_idx_to_exclude = []
  N_repeat = 5
  embed_size = 768
  for index, row in df.iterrows():

    embedding = torch.zeros(1, embed_size)
    failed = False

    for j in range(1, N_repeat+1):
      column = "vl" + str(j)

      try:
        start = int(row[column + "_start"])
        end = int(row[column + "_stop"])
        seq = ' '.join(row[column][0:max_len])
        encoded_input = tokenizer(seq, return_tensors='pt')
        with torch.no_grad():
          model_output = model(**encoded_input).last_hidden_state
      except Exception:
        # Missing or malformed entry: stop here rather than pooling a stale
        # (or undefined) model_output left over from an earlier iteration.
        failed = True
        break

      # NOTE(review): unlike the heavy-chain routine, no +1 offset is applied
      # to start/end here — confirm the CSV indices are already token positions.
      model_output_sliced = model_output[:, start:end, :]

      embedding += model_output_sliced.mean(dim=1)

    if failed:
      row_idx_to_exclude.append(index)
      # Keep a placeholder so X stays row-aligned with df.
      embedding = torch.zeros(1, embed_size)
    else:
      embedding /= N_repeat
    X.append(embedding.squeeze())

  return torch.stack(X).numpy(), row_idx_to_exclude

In [7]:
# Run only if you want to calculate embeddings
X_light_lit_test, rm_row_light_lit_test = get_embeddings_light(literature_test_df_l)
X_light_lit_train, rm_row_light_lit_train = get_embeddings_light(literature_train_df_l)

# Each exclusion list is named after the split it was computed from
# (the train/test suffixes of the experiment lists were previously swapped).
X_light_exp_train, rm_row_light_exp_train = get_embeddings_light(experiment_train_df_l)
X_light_exp_test, rm_row_light_exp_test = get_embeddings_light(experiment_test_df_l)

In [18]:
def save_variable(variable, name):
  """Pickle ``variable`` into ``<name>.pkl`` in the current working directory.

  Args:
    variable: Object to serialise.
    name: File stem; ``.pkl`` is appended.
  """
  target = f'{name}.pkl'
  with open(target, 'wb') as handle:
    pickle.dump(variable, handle)

In [19]:
# Persist the light-chain embeddings and the literature exclusion lists.
for stem, payload in (
    ("X_light_lit_test", X_light_lit_test),
    ("X_light_lit_train", X_light_lit_train),
    ("X_light_exp_test", X_light_exp_test),
    ("X_light_exp_train", X_light_exp_train),
    ("rm_row_light_lit_test", rm_row_light_lit_test),
    ("rm_row_light_lit_train", rm_row_light_lit_train),
):
    save_variable(payload, stem)

In [20]:
# Load the precomputed light-chain embeddings and the literature exclusion
# lists from the cloned PPI_Lemanic_Hackathon repository.
# NOTE(review): pickle.load can execute arbitrary code; only load these files
# if the repository they come from is trusted.
with open('PPI_Lemanic_Hackathon/data/processed/X_light_lit_test.pkl', 'rb') as f:
    X_light_lit_test = pickle.load(f)

with open('PPI_Lemanic_Hackathon/data/processed/X_light_lit_train.pkl', 'rb') as f:
    X_light_lit_train = pickle.load(f)

with open('PPI_Lemanic_Hackathon/data/processed/X_light_exp_test.pkl', 'rb') as f:
    X_light_exp_test = pickle.load(f)

with open('PPI_Lemanic_Hackathon/data/processed/X_light_exp_train.pkl', 'rb') as f:
    X_light_exp_train = pickle.load(f)

with open('PPI_Lemanic_Hackathon/data/processed/rm_row_light_lit_test.pkl', 'rb') as f:
    rm_row_light_lit_test = pickle.load(f)

with open('PPI_Lemanic_Hackathon/data/processed/rm_row_light_lit_train.pkl', 'rb') as f:
    rm_row_light_lit_train = pickle.load(f)

In [21]:
# Boolean keep-masks: True everywhere except at the rows flagged for exclusion.
# The embedding matrices are overwritten with their filtered versions, so this
# cell must be run exactly once after the matrices are loaded.
mask_lit_test = np.full(len(X_light_lit_test), True)
mask_lit_test[rm_row_light_lit_test] = False
X_light_lit_test = X_light_lit_test[mask_lit_test]

mask_lit_train = np.full(len(X_light_lit_train), True)
mask_lit_train[rm_row_light_lit_train] = False
X_light_lit_train = X_light_lit_train[mask_lit_train]

In [12]:
# Integer label vectors from the "Binds" column; the literature splits are
# filtered with the same keep-masks as their embedding matrices.
y_light_lit_test = np.asarray(literature_test_df_l["Binds"].values).astype(int)[mask_lit_test]
y_light_lit_train = np.asarray(literature_train_df_l["Binds"].values).astype(int)[mask_lit_train]
y_light_exp_test = np.asarray(experiment_test_df_l["Binds"].values).astype(int)
y_light_exp_train = np.asarray(experiment_train_df_l["Binds"].values).astype(int)

In [21]:
### Literature dataset
# PCA and SMOTE are seeded explicitly so that re-running this cell on its own
# (not only as part of a fresh Run All) reproduces the same scores.
pca = PCA(n_components=128, random_state=42)
X_train = pca.fit_transform(X_light_lit_train)
X_test = pca.transform(X_light_lit_test)
y_train = y_light_lit_train
y_test = y_light_lit_test

# Oversample the training split only; the test split keeps its class balance.
oversample = SMOTE(random_state=42)
X_train, y_train = oversample.fit_resample(X_train, y_train)

random_f1 = []
non_random_f1 = []
shuffle_rng = np.random.RandomState(42)
for seed in range(5):
    clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=seed)
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    non_random_f1.append(f1_score(y_test, predictions))
    # Chance baseline: same predicted class ratio, random assignment to samples.
    # permutation() returns a copy, so `predictions` itself is left intact.
    random_f1_ = [f1_score(y_test, shuffle_rng.permutation(predictions)) for _ in range(100)]
    random_f1.append(np.mean(random_f1_))
print(f"Non-random F1: {np.mean(non_random_f1):.2f} ± {np.std(non_random_f1):.2f}")
print(f"Random F1: {np.mean(random_f1):.2f} ± {np.std(random_f1):.2f}")

Non-random F1: 0.28 ± 0.01
Random F1: 0.26 ± 0.00


In [26]:
### Experiment dataset
# PCA and SMOTE are seeded explicitly so that re-running this cell on its own
# (not only as part of a fresh Run All) reproduces the same scores.
pca = PCA(n_components=128, random_state=42)
X_train = pca.fit_transform(X_light_exp_train)
X_test = pca.transform(X_light_exp_test)
y_train = y_light_exp_train
y_test = y_light_exp_test

# Oversample the training split only; the test split keeps its class balance.
oversample = SMOTE(random_state=42)
X_train, y_train = oversample.fit_resample(X_train, y_train)

random_f1 = []
non_random_f1 = []
shuffle_rng = np.random.RandomState(42)
for seed in range(5):
    clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=seed)
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    non_random_f1.append(f1_score(y_test, predictions))
    # Chance baseline: same predicted class ratio, random assignment to samples.
    # permutation() returns a copy, so `predictions` itself is left intact.
    random_f1_ = [f1_score(y_test, shuffle_rng.permutation(predictions)) for _ in range(100)]
    random_f1.append(np.mean(random_f1_))
print(f"Non-random F1: {np.mean(non_random_f1):.2f} ± {np.std(non_random_f1):.2f}")
print(f"Random F1: {np.mean(random_f1):.2f} ± {np.std(random_f1):.2f}")

Non-random F1: 0.19 ± 0.01
Random F1: 0.17 ± 0.01


In [23]:
### combine X with df
def combine_X_and_df_L(X, df):
  """Append the light-chain embedding columns ``L1..Ln`` to ``df``.

  Args:
    X: 2-D array with one embedding per row.
    df: DataFrame to extend; rows are matched to ``X`` by index label.

  Returns:
    A new DataFrame holding the columns of ``df`` followed by ``L1..Ln``.
  """
  feature_names = ['L' + str(k) for k in range(1, X.shape[1] + 1)]
  features = pd.DataFrame(X, columns=feature_names)
  return pd.concat([df, features], axis=1)

# The literature embedding matrices had their excluded rows removed with
# mask_lit_test / mask_lit_train. The frames must be filtered with the same
# masks and re-indexed before the index-aligned concat; otherwise every
# embedding after the first removed row is attached to the wrong antibody.
assert len(X_light_lit_test) == mask_lit_test.sum(), "X_light_lit_test is not mask-filtered"
assert len(X_light_lit_train) == mask_lit_train.sum(), "X_light_lit_train is not mask-filtered"
literature_test_df_merged_l = combine_X_and_df_L(
    X_light_lit_test, literature_test_df_l[mask_lit_test].reset_index(drop=True))
literature_train_df_merged_l = combine_X_and_df_L(
    X_light_lit_train, literature_train_df_l[mask_lit_train].reset_index(drop=True))
# The experiment matrices are not filtered, so their frames are used as-is.
experiment_test_df_merged_l = combine_X_and_df_L(X_light_exp_test, experiment_test_df_l)
experiment_train_df_merged_l = combine_X_and_df_L(X_light_exp_train, experiment_train_df_l)

### Merge Heavy and Light Chains

In [32]:
# Inner-join the heavy- and light-chain frames on the shared 'ID' column, so
# only antibodies present in both frames are kept.
literature_test_df_HL = literature_test_df_merged.merge(
    literature_test_df_merged_l, how='inner', on='ID')
literature_train_df_HL = literature_train_df_merged.merge(
    literature_train_df_merged_l, how='inner', on='ID')
experiment_test_df_HL = experiment_test_df_merged.merge(
    experiment_test_df_merged_l, how='inner', on='ID')
experiment_train_df_HL = experiment_train_df_merged.merge(
    experiment_train_df_merged_l, how='inner', on='ID')


In [37]:
# Write each combined heavy+light frame to a CSV next to the notebook,
# without the index column.
for frame, csv_name in (
    (literature_test_df_HL, "literature_test_df_HL.csv"),
    (literature_train_df_HL, "literature_train_df_HL.csv"),
    (experiment_test_df_HL, "experiment_test_df_HL.csv"),
    (experiment_train_df_HL, "experiment_train_df_HL.csv"),
):
    frame.to_csv(csv_name, index=False)