# PROTAC Complex Encoding

Collection of ideas and unfinished work. To be ignored for now...

## Features Encoding

The Evoformer from AlphaFold might be used to predict molecular structures. The problem is that general molecules do not have a regular structure the way proteins do: proteins are polymers, and each individual amino acid has its own shape, which AlphaFold somehow leverages in its predictions.

In [None]:
# Hyperparameter search space. Each value is either ('categorical', [choices])
# or a (type, low, high) tuple.
# NOTE(review): the code that samples from this dict is not visible here, so
# whether the bounds are inclusive is an open question -- confirm.
hyperparameters = {
    # Fingerprint encoders that can be switched on or off
    'use_morgan_fp': ('categorical', [True, False]),
    'use_maccs_fp': ('categorical', [True, False]),
    'use_path_fp': ('categorical', [True, False]),
    # NOTE(review): the two path-length ranges overlap (min up to 32, max from
    # 1), so a sampled minimum can exceed the sampled maximum.
    'pathfp_min_path': (int, 1, 32),
    'pathfp_max_path': (int, 1, 64),
    # Fingerprint bit widths
    'morgan_bitwidth': (int, 1024, 2048),
    'pathfp_bitwidth': (int, 1024, 2048),
    # Hidden sizes of the per-fingerprint encoders
    'morgan_encoder_hidden_sz': (int, 256, 2048),
    'maccs_encoder_hidden_sz': (int, 256, 2048),
    'pathfp_encoder_hidden_sz': (int, 256, 2048),
    'learning_rate': (float, 1e-5, 1e-3),
    'gnn_layer_type': ('categorical', ['GraphConv', 'GCN', 'GAT']),
}

### Encoding SELFIES via ChemGPT (DEPRECATED)

> Christian: SELFIES are not useful, we have already investigated and studied them.

Let's start by installing and importing the required dependencies.

In [None]:
!pip install selfies sentencepiece transformers datasets wandb -qqq

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification
from datasets import Dataset, DatasetDict, load_dataset, load_from_disk
import selfies as sf
import wandb

Each entry in the dataset will consist of a pair of the SELFIES encoding and the degradation percentage. In order to get the SELFIES encoding, each SMILES entry (without stereochemistry information) is converted.

In [None]:
def _selfies_regression_dataset(frame):
    """Build a `Dataset` of (SELFIES string, degradation) pairs from `frame`.

    Args:
        frame: DataFrame with a 'Smiles_nostereo' column holding SMILES
            strings and a 'degradation' column holding the regression target.

    Returns:
        A `datasets.Dataset` with the columns 'text' (SELFIES encoding of each
        SMILES) and 'labels' (the degradation value), without the index.
    """
    pairs = pd.DataFrame({
        'text': [sf.encoder(smiles) for smiles in frame['Smiles_nostereo']],
        'labels': frame['degradation'].to_list(),
    })
    return Dataset.from_pandas(pairs, preserve_index=False)


# Both splits go through the same conversion, so it lives in one helper; this
# also avoids leaving generic `entries` / `df` globals behind in the notebook.
train_dataset = _selfies_regression_dataset(train_upsampled)
test_dataset = _selfies_regression_dataset(test)

dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset,
})
# dataset

We can now import the tokenizer and tokenize the SELFIES strings in the dataset.

In [None]:
# Load the tokenizer published with the ChemGPT checkpoint and register a
# padding token for it.
# NOTE(review): if '[PAD]' is not already in the vocabulary this adds a new
# token id, and the model's embedding matrix may then need resizing to
# match -- TODO confirm.
tokenizer = AutoTokenizer.from_pretrained('ncfrey/ChemGPT-4.7M')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

In [None]:
def tokenize_function(entries):
    """Tokenize the 'text' column of a batch, padded/truncated to 256 tokens."""
    return tokenizer(
        entries['text'],
        max_length=256,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )


# Tokenize every split in batches, then drop the raw SELFIES strings.
tokenized_datasets = (
    dataset
    .map(tokenize_function, batched=True)
    .remove_columns(['text'])
)

In [None]:
tokenized_datasets

Next, we can download the pretrained ChemGPT model.

In [None]:
%%capture
# model = AutoModelForCausalLM.from_pretrained('ncfrey/ChemGPT-4.7M', num_labels=1) # Original
# Load the checkpoint with a sequence-classification head instead of the
# language-modelling head; `num_labels=1` requests a single output, matching
# the single 'labels' column (degradation) of the dataset.
model = AutoModelForSequenceClassification.from_pretrained('ncfrey/ChemGPT-4.7M', num_labels=1)

Freeze all pretrained layers and train only the layers that were not initialized from the checkpoint, in order to avoid "catastrophic forgetting".

In [None]:
# Parameters that were not initialized from the checkpoint: the regression
# head plus the attention bias of every odd-numbered block (1, 3, ..., 23).
uninit_layers = ['score.weight'] + [
    f'transformer.h.{block_idx}.attn.attention.bias'
    for block_idx in range(1, 24, 2)
]

# Only the listed parameters stay trainable; everything else is frozen.
for name, param in model.named_parameters():
    param.requires_grad = name in uninit_layers
    if param.requires_grad:
        print(name, param.requires_grad)

In [None]:
from datasets import load_metric
from sklearn.metrics import mean_squared_error

def compute_metrics(eval_pred):
    """Compute the root-mean-squared error of an evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of array-likes holding the
            same number of values. Extra singleton dimensions (e.g.
            predictions of shape ``(n, 1)`` against labels of shape ``(n,)``)
            are flattened before comparison.

    Returns:
        A dict ``{'rmse': <float>}``.

    Raises:
        ValueError: If the inputs are empty or differ in number of elements.
    """
    import numpy as np  # local import keeps this cell self-contained

    predictions, labels = eval_pred
    y_pred = np.asarray(predictions, dtype=float).reshape(-1)
    y_true = np.asarray(labels, dtype=float).reshape(-1)
    if y_true.size == 0 or y_pred.shape != y_true.shape:
        raise ValueError(
            f'predictions and labels must be non-empty and of equal size, '
            f'got {y_pred.size} and {y_true.size} values')
    # Computed directly rather than via `mean_squared_error(..., squared=False)`:
    # the `squared` keyword is deprecated/removed in recent scikit-learn.
    rmse = float(np.sqrt(np.mean((y_true - y_pred) ** 2)))
    return {'rmse': rmse}

In [None]:
%%wandb
from transformers import TrainingArguments, Trainer

# Tell the model which token id is padding (taken from the tokenizer).
model.config.pad_token_id = tokenizer.pad_token_id

# Training configuration: log and evaluate once per epoch, report to W&B,
# and never write checkpoints (`save_strategy='no'`).
# NOTE(review): newer `transformers` releases rename `evaluation_strategy`
# to `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    # label_names='degradation',
    report_to='wandb',
    output_dir='test_trainer',
    logging_strategy='epoch',
    evaluation_strategy='epoch',
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=5e-5,
    num_train_epochs=15,
    # Presumably has no effect while checkpoint saving is disabled below.
    save_total_limit=2,
    save_strategy='no')

# Fine-tune on the tokenized train split and evaluate on the test split,
# reporting the RMSE returned by `compute_metrics`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics
)
trainer.train()

In [None]:
trainer.state.log_history

In [None]:
# Predict on the test split and plot predictions against the reference labels,
# one point per test entry. The `g = ...` assignments only keep the notebook
# from echoing each call's return value.
predictions = trainer.predict(tokenized_datasets['test'])
g = plt.plot(predictions.predictions, label='predictions')
g = plt.plot(predictions.label_ids, label='label_ids')
g = plt.legend()
g = plt.grid(alpha=0.8)
g = plt.xlabel('Test ID')
g = plt.ylabel('Degradation (%)')
plt.show()

In [None]:
# Reload the checkpoint with its original causal language-modelling head and
# switch it to evaluation mode.
# NOTE(review): `num_labels=1` is presumably irrelevant for a causal LM head --
# confirm and drop if so.
model = AutoModelForCausalLM.from_pretrained('ncfrey/ChemGPT-4.7M', num_labels=1) # Original
model.eval()

From this [post](https://github.com/huggingface/transformers/issues/7540):

> BERT (the base model without any heads on top) outputs 2 things: `last_hidden_state` and `pooler_output`.
> 
> * `last_hidden_state` contains the hidden representations for each token in each sequence of the batch. So the size is `(batch_size, seq_len, hidden_size)`.
> * `pooler_output` contains a "representation" of each sequence in the batch, and is of size `(batch_size, hidden_size)`. What it basically does is take the hidden representation of the [CLS] token of each sequence in the batch (which is a vector of size `hidden_size`), and then run that through the [BertPooler](https://github.com/huggingface/transformers/blob/de4d7b004a24e4bb087eb46d742ea7939bc74644/src/transformers/modeling_bert.py#L498) nn.Module. This consists of a linear layer followed by a Tanh activation function. The weights of this linear layer are already pretrained on the next sentence prediction task (note that BERT is pretrained on 2 tasks: masked language modeling and next sentence prediction). I assume that the authors of the Transformers library have taken the weights from the original TF implementation, and initialized the layer with them. In theory, they would come from [BertForPretraining](https://github.com/huggingface/transformers/blob/de4d7b004a24e4bb087eb46d742ea7939bc74644/src/transformers/modeling_bert.py#L862) - which is BERT with the 2 pretraining heads on top.

In [None]:
import torch
import selfies as sf

# Pick one PROTAC and convert its SMILES to SELFIES.
smi = protac_db_df.iloc[42]['Smiles_nostereo']
# The encoded string gets its own name: binding it to `sf` would shadow the
# `selfies` module and break every later `sf.encoder(...)` call.
selfies_str = sf.encoder(smi)
print(f'smi: {smi}')
print(f'sf: {selfies_str}')

# Run the string through the bare transformer and through the full model
# (transformer + head) to compare what each of them returns.
inputs = tokenizer(selfies_str, return_tensors='pt')
outputs_transformer = model.transformer(**inputs, output_hidden_states=True)
outputs = model(**inputs, output_hidden_states=True)
print(f'Model Transformer output keys: {outputs_transformer.keys()}')
print(f'Model Tranformer+Head output keys: {outputs.keys()}')

In [None]:
outputs['logits'].size()

In [None]:
outputs['hidden_states'] # A tuple

### Encoding POI Sequence (TODO)

#### Encoding POI Sequence via Protein Embeddings

Following this [implementation](https://huggingface.co/Rostlab/prot_bert). There are more models available at this [repository](https://github.com/agemagician/ProtTrans).

In [None]:
def rem_rare_amino_acids(seq):
    """Return `seq` with every U, Z, O or B residue code replaced by X."""
    return re.sub(r'[UZOB]', 'X', seq)


# Clean every POI sequence and show the result.
poi_seq = input_df['poi_seq'].apply(rem_rare_amino_acids)
print('POIs:', poi_seq.to_list())

In [None]:
# Split each POI sequence into consecutive chunks of at most `n` characters
# and join the chunks with a ' [SEP] ' marker.
n = 128
tmp = [' [SEP] '.join([seq[i:i+n] for i in range(0, len(seq), n)]) for seq in poi_seq]
# Inspect one arbitrary example (assumes at least 18 sequences).
tmp[17]

In [None]:
from transformers import T5Tokenizer, T5EncoderModel

# Use the first GPU when available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Load the tokenizer
tokenizer = T5Tokenizer.from_pretrained('Rostlab/prot_t5_xl_half_uniref50-enc', do_lower_case=False) #.to(device)

In [None]:
# Load the model
model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_half_uniref50-enc").to(device)

# Half precision is only used on GPU; on CPU fall back to full precision
# (not recommended, much slower).
# `device` is a `torch.device`, so its `.type` is compared instead of the
# object itself, and `float()` is used because modules have no `full()` method.
model = model.float() if device.type == 'cpu' else model.half()

In [None]:
# Toy protein sequences to run through the encoder.
sequence_examples = ['PRTEINO', 'SEQWENCE [SEP] SEQWENCE']

# Map the rare/ambiguous residues U, Z, O, B to X, then put a space between
# every character so that each residue becomes its own token.
# NOTE(review): this also splits the literal '[SEP]' of the second example
# into single characters -- confirm that is intended.
sequence_examples = [
    ' '.join(re.sub(r'[UZOB]', 'X', sequence))
    for sequence in sequence_examples
]

# Tokenize the batch, padding every sequence to the longest one.
ids = tokenizer.batch_encode_plus(sequence_examples, add_special_tokens=True, padding='longest')

# Move token ids and attention mask onto the target device as tensors.
input_ids = torch.tensor(ids['input_ids'], device=device)
attention_mask = torch.tensor(ids['attention_mask'], device=device)

In [None]:
input_ids.size()

In [None]:
# generate embeddings (inference only, so gradients are disabled)
with torch.no_grad():
    # Stored as `embedding_repr`, the name every line below reads.
    embedding_repr = model(input_ids=input_ids, attention_mask=attention_mask)

# extract residue embeddings for the first ([0,:]) sequence in the batch and remove padded & special tokens ([0,:7])
emb_0 = embedding_repr.last_hidden_state[0, :7] # shape (7 x 1024)
# same for the second ([1,:]) sequence but taking into account different sequence lengths ([1,:8])
emb_1 = embedding_repr.last_hidden_state[1, :8] # shape (8 x 1024)

# if you want to derive a single representation (per-protein embedding) for the whole protein
emb_0_per_protein = emb_0.mean(dim=0) # shape (1024)

print(emb_0)
print(emb_1)
print(emb_0_per_protein)

#### Encoding POI Sequence via ProtBERT

Following this [implementation](https://huggingface.co/Rostlab/prot_bert). There are more models available at this [repository](https://github.com/agemagician/ProtTrans).

In [None]:
from transformers import BertModel, BertTokenizer
import re

poi_tokenizer = BertTokenizer.from_pretrained('Rostlab/prot_bert', do_lower_case=False)

Get BERT [output](https://huggingface.co/docs/transformers/main_classes/output#transformers.modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions).

In [None]:
sequence_Example = 'AETCZAO'
sequence_Example = re.sub(r'[UZOB]', 'X', sequence_Example)
poi_tokenizer(sequence_Example, return_tensors='pt')

In [None]:
def rem_rare_amino_acids(seq):
    """Substitute X for each rare/ambiguous residue code (U, Z, O, B) in `seq`."""
    cleaned = re.sub(r'[UZOB]', 'X', seq)
    return cleaned


# Apply the substitution to the whole POI column and list the sequences.
poi_seq = input_df['poi_seq'].apply(rem_rare_amino_acids)
print('POIs:', poi_seq.to_list())

**TODO: The maximum sequence length currently requires too much RAM to handle. The sequences are truncated as a temporary workaround.**

In [None]:
import math

def nearest_pow2(x):
    """Return the smallest power of two that is greater than or equal to `x`.

    Despite the name, the value is always rounded *up* (e.g. 5 -> 8), which is
    what is needed to size a buffer that must hold `x` items.

    Args:
        x: Integer to round up.

    Returns:
        The power of two, as an int. Any `x` <= 1 yields 1.
    """
    if x <= 1:
        # 2**0 is the smallest power of two; the bit trick below would give 2
        # for x == 0 and ever larger values for negative x.
        return 1
    return 1 << (x - 1).bit_length()

# Length (in characters) of the longest POI sequence.
longest_seq = max([len(seq) for seq in poi_seq])
# Round that length up to a power of two.
seq_max_len = nearest_pow2(longest_seq)
# seq_max_len = 128
# NOTE(review): this stores the value in an attribute named `max_length` on
# the tokenizer; presumably it is read by dataset code not shown here --
# TODO confirm it has any effect on truncation/padding.
poi_tokenizer.max_length = seq_max_len
seq_max_len

In [None]:
BINARY_CLASSIFICATION = False


def del_aminoacids(seq):
    """Space-separate the residues of `seq` and replace U, Z, O, B with X."""
    return re.sub(r'[UZOB]', 'X', ' '.join(seq))


# Preprocess the POI column on copies so the source frames stay untouched.
train_upsampled_with_poi = train_upsampled.copy()
train_upsampled_with_poi['poi_seq'] = train_upsampled_with_poi['poi_seq'].apply(del_aminoacids)
test_with_poi = test.copy()
test_with_poi['poi_seq'] = test_with_poi['poi_seq'].apply(del_aminoacids)

train_dataset = ProtacDataset(train_upsampled_with_poi,
                              poi_tokenizer=poi_tokenizer,
                              binary_classification=BINARY_CLASSIFICATION)
# The test split must receive the same preprocessing as the train split, so
# it is built from `test_with_poi` rather than from the raw `test` frame.
test_dataset = ProtacDataset(test_with_poi,
                             poi_tokenizer=poi_tokenizer,
                             binary_classification=BINARY_CLASSIFICATION)

In [None]:
class POIEncoder(pl.LightningModule):
    """Regression model combining tabular features with a ProtBERT embedding.

    The "extra" inputs (concentrations, E3 ligase, cell type) are concatenated
    and encoded by an MLP; the tokenized POI sequence is encoded by the
    pretrained `Rostlab/prot_bert` model. Both embeddings are concatenated and
    mapped to a single output value by a linear head.

    Args:
        hidden_size: Width of every layer of the extra-features MLP.
        n_layers: Number of layers of the extra-features MLP.
        batch_size: Batch size used by the dataloader hooks.
        learning_rate: Learning rate of the Adam optimizer.

    NOTE(review): the dataloader hooks read the notebook-level names
    `train_dataset`, `test_dataset` and `custom_collate`.
    """

    def __init__(self,
                 hidden_size:int=64,
                 n_layers:int=3,
                 batch_size:int=64,
                 learning_rate:float=1e-3):
        super().__init__()
        # Save the arguments passed to init
        self.save_hyperparameters()
        # Expose the arguments as plain attributes. They are set explicitly
        # rather than via `self.__dict__.update(locals())`, which would also
        # store a reference to `self` on the instance.
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        # Define PyTorch models
        hidden_channels = [hidden_size] * n_layers
        # NOTE(review): in_channels=3 assumes the three extra features add up
        # to exactly three columns after concatenation -- TODO confirm.
        self.extra_features_encoder = MLP(in_channels=3,
                                          hidden_channels=hidden_channels,
                                          norm_layer=nn.BatchNorm1d,
                                          inplace=False,
                                          dropout=0.5)
        self.poi_encoder = BertModel.from_pretrained('Rostlab/prot_bert')
        # Width of the pooled BERT output, read from the loaded model's config
        # instead of relying on an externally defined `poi_embedding_size`.
        poi_embedding_size = self.poi_encoder.config.hidden_size
        self.head = nn.Linear(hidden_size + poi_embedding_size, 1)
        # Define loss metrics (currently not updated by any step)
        self.val_mse = MeanSquaredError()
        self.test_mse = MeanSquaredError()

    def forward(self, x_in):
        """Predict one value per sample.

        Args:
            x_in: Mapping with the tensors 'concentrations', 'e3_ligase' and
                'cell_type', plus 'poi_seq', itself a mapping with the
                tokenizer outputs 'input_ids', 'token_type_ids' and
                'attention_mask'.

        Returns:
            The output of the linear head (last dimension of size 1).
        """
        # Encode "extra" features
        concentrations = x_in['concentrations']
        e3_ligase = x_in['e3_ligase']
        cell_type = x_in['cell_type']
        x = torch.cat((concentrations, e3_ligase, cell_type), dim=-1)
        extra_features_embedding = self.extra_features_encoder(x)
        # Encode POI sequence
        input_ids = x_in['poi_seq']['input_ids'].squeeze(dim=1)
        token_type_ids = x_in['poi_seq']['token_type_ids'].squeeze(dim=1)
        attention_mask = x_in['poi_seq']['attention_mask'].squeeze(dim=1)
        # Keyword arguments on purpose: positionally, the second parameter of
        # `BertModel.forward` is `attention_mask`, not `token_type_ids`.
        poi_embedding = self.poi_encoder(input_ids=input_ids,
                                         attention_mask=attention_mask,
                                         token_type_ids=token_type_ids)['pooler_output']
        # Run linear head
        x = torch.cat((extra_features_embedding, poi_embedding), dim=-1)
        return self.head(x)

    def step(self, batch, phase='train'):
        """Compute and log the MSE loss of one batch as '<phase>_loss'."""
        y = batch['labels']
        preds = self.forward(batch)
        # The head emits a trailing dimension of size 1; align with the labels
        # so `mse_loss` cannot silently broadcast e.g. (B, 1) against (B,).
        if preds.shape != y.shape and preds.numel() == y.numel():
            preds = preds.reshape(y.shape)
        loss = F.mse_loss(preds, y)
        self.log(f'{phase}_loss', loss, prog_bar=True)
        return loss

    def training_step(self, batch, batch_idx):
        return self.step(batch, phase='train')

    def validation_step(self, batch, batch_idx):
        return self.step(batch, phase='val')

    def test_step(self, batch, batch_idx):
        return self.step(batch, phase='test')

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer

    ####################
    # DATA RELATED HOOKS
    ####################

    def train_dataloader(self):
        return DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True, collate_fn=custom_collate)

    def val_dataloader(self):
        # NOTE: validation reuses the test split; there is no separate
        # validation set here.
        return DataLoader(test_dataset, batch_size=self.batch_size, collate_fn=custom_collate)

    def test_dataloader(self):
        return DataLoader(test_dataset, batch_size=self.batch_size, collate_fn=custom_collate)

In [None]:
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

# Small MLP, tiny batches and a very low learning rate.
model = POIEncoder(hidden_size=32,
                   n_layers=3,
                   batch_size=4,
                   learning_rate=1e-6)

# Early stopping and checkpointing both monitor 'val_loss', which
# `POIEncoder.step` logs during validation.
callbacks = [
    TQDMProgressBar(refresh_rate=20),
    EarlyStopping(monitor='val_loss', mode='min'),
    ModelCheckpoint(save_weights_only=True, mode='min', monitor='val_loss'),
]

# NOTE(review): `devices=None` on CPU may be rejected by newer Lightning
# releases -- confirm against the installed version.
trainer = pl.Trainer(max_epochs=5,
                     gradient_clip_val=0.5,
                     gradient_clip_algorithm='norm',
                     accelerator='auto',
                     devices=1 if torch.cuda.is_available() else None,
                     log_every_n_steps=8,
                     callbacks=callbacks,
                     logger=CSVLogger(save_dir='logs/'))
# The dataloaders come from the hooks defined on the model itself.
trainer.fit(model)

In [None]:
trainer.test(ckpt_path='best')

In [None]:
# Load the metrics CSV from the trainer's logger directory, index it by epoch
# and plot every logged metric as a line.
metrics = pd.read_csv(f'{trainer.logger.log_dir}/metrics.csv')
del metrics['step']
metrics.set_index('epoch', inplace=True)
# Show only the columns that contain at least one value.
display(metrics.dropna(axis=1, how='all').head())
g = sns.relplot(data=metrics, kind='line')
g = plt.grid(alpha=0.7)
plt.show()

In [None]:
# Collect predictions and reference labels over the whole test dataloader.
predictions = []
y = []
# Make predictions
with torch.no_grad():
    _ = model.eval()
    for batch in model.test_dataloader():
        predictions.extend(model(batch).detach().tolist())
        y.extend(batch['labels'].detach().tolist())
predictions = np.array(predictions).flatten()
y = np.array(y).flatten()
# Order the samples by their reference value so both curves are comparable.
sorted_idx = np.argsort(y)
# Plot predictions (sorted)
g = plt.plot(predictions[sorted_idx], label='Predicted degradation (%)')
g = plt.plot(y[sorted_idx], label='Reference degradation (%)')
g = plt.legend()
g = plt.grid(alpha=0.8)
g = plt.xlabel('Test ID (sorted by degradation perc.)')
g = plt.ylabel('Degradation (%)')
plt.show()

### LightGBM

> LightGBM can use categorical features as input directly. It doesn’t need to convert to one-hot encoding, and is much faster than one-hot encoding (about 8x speed-up).
>
> Note: You should convert your categorical features to int type before you construct Dataset.

In [None]:
%%capture
!pip install optuna

In [None]:
import os, sys

src_dir = os.path.join('/content/drive/', 'MyDrive', 'Colab Notebooks', 'thesis', 'src')
sys.path.append(src_dir)

In [None]:
import optuna
import lightgbm as lgb
from binary_label_metrics import BinaryLabelMetrics

optuna.logging.set_verbosity(optuna.logging.WARN) #INFO, WARN

In [None]:
# Global settings for the LightGBM / Optuna experiments.
prm = {
  # Number of Optuna trials
  'NTRIALS': 20,
  # Number of boosted trees to be created
  'NBOOST': 50,
  # Number of classes in response variable
  'NCLASS': 2,
  # Morgan fingerprint bit length
  'FP_BITS': 1024,
}

# Print the settings with the keys right-aligned to the longest key.
offs = max(map(len, prm.keys()))
print('Parameters:')
for k, v in prm.items():
    print(f'\t{k:>{offs}}: {v}')

In [None]:
class LightGBMObjective(object):
    """Optuna objective scoring LightGBM hyperparameters by 5-fold CV F1.

    Calling the instance with an Optuna trial samples a parameter set, runs
    `lgb.cv` on the training data with a custom F1 metric and returns the mean
    F1 score of the last recorded boosting round.

    Args:
        X_train: Training features accepted by `lgb.Dataset`.
        y_train: Training labels.
        nclass: Number of classes; values above 2 select the multiclass
            objective, otherwise the binary one is used.

    NOTE(review): the number of boosting rounds is read from the
    notebook-level `prm['NBOOST']`.
    """

    def __init__(self, X_train, y_train, nclass=2):
        self.best_booster = None
        self._booster = None
        self.X = X_train
        self.y = y_train
        self.dtrain = lgb.Dataset(self.X, self.y)
        self.nclass = nclass
        self.prm_lgb = {
            'objective': 'multiclass' if self.nclass > 2 else 'binary',
            # The *string* 'None' disables LightGBM's built-in metrics so that
            # only the custom F1 drives evaluation and early stopping; a
            # Python None would leave the objective's default metric active.
            'metric': 'None',
            'verbosity': -1,
            'boosting_type': 'gbdt',
            'force_row_wise': True,
            'min_gain_to_split': .5,
        }
        if self.nclass > 2:
            # The multiclass objective needs the number of classes.
            self.prm_lgb['num_class'] = self.nclass

    def _multiclass_f1(self, preds, dataset):
        """Custom metric: micro F1 of the arg-max class (multiclass case)."""
        preds = np.asarray(preds)
        if preds.ndim == 1:
            # Flat predictions are grouped by class first, then by row, so
            # they are reshaped to (n_classes, n_rows) and transposed.
            preds = preds.reshape(self.nclass, -1).T
        y = preds.argmax(axis=1)
        f_score = f1_score(dataset.get_label(), y, average='micro')
        return 'f1_score', f_score, True

    def _binary_f1(self, preds, dataset):
        """Custom metric: rank-based micro F1 (binary case).

        With k positive labels, the k highest-scored samples are treated as
        predicted positives and compared against the labels sorted by score.
        """
        labels = dataset.get_label()
        n_pos = int(labels.sum())
        pred1 = np.zeros(labels.shape[0], dtype=int)
        if n_pos > 0:
            # Guarded because `pred1[-0:]` would select the whole array.
            pred1[-n_pos:] = 1
        f_score = f1_score(labels[preds.argsort()], pred1, average='micro')
        return 'f1_score', f_score, True

    def __call__(self, trial):
        """Sample parameters from `trial`, cross-validate, return mean F1.

        Raises:
            KeyError: If the CV history holds no mean F1 entry.
        """
        f1_eval = self._multiclass_f1 if self.nclass > 2 else self._binary_f1

        trial_prm = {
            'learning_rate': trial.suggest_float('learning_rate', .01, .3, log=True),
            'lambda_l1': trial.suggest_float('lambda_l1', 1E-3, 1., log=True),
            'lambda_l2': trial.suggest_float('lambda_l2', .5, 3., log=True),
            'num_leaves': trial.suggest_int('num_leaves', 8, 32),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 50, 100),
            'feature_fraction': trial.suggest_float('feature_fraction', .3, .6),
            'bagging_fraction': trial.suggest_float('bagging_fraction', .4, 1.),
            'bagging_freq': trial.suggest_int('bagging_freq', 2, 6),
            'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'rf'])
        }
        # Trial parameters override the fixed defaults (e.g. 'boosting_type').
        prm_lgb = dict(self.prm_lgb)
        prm_lgb.update(trial_prm)
        eval_hist = lgb.cv(prm_lgb, self.dtrain, nfold=5, seed=12345,
                           num_boost_round=prm['NBOOST'], feval=f1_eval,
                           callbacks=[lgb.early_stopping(20, verbose=False)])
        # The key is 'f1_score-mean' or carries a dataset prefix, depending on
        # the LightGBM version, so it is matched by suffix.
        for key, scores in eval_hist.items():
            if key.endswith('f1_score-mean'):
                return scores[-1]
        raise KeyError(f'no f1_score-mean entry in CV history: {list(eval_hist)}')

    def callback(self, study, trial):
        """Optuna callback printing progress; marks new best trials."""
        if study.best_trial == trial:
            print(f'{study.best_trial.number} ({study.best_trial.values[0]:.3f}) -> ', end=' ', flush=True)
            # NOTE(review): `_booster` is never assigned outside `__init__`,
            # so `best_booster` currently always stays None.
            self.best_booster = self._booster
            return
        if trial.number % 20 == 0:
            print(f'{trial.number}', end=' ', flush=True)

Only consider the features that the model can process, _e.g._, get rid of the SMILES, which are strings. Also, remove the degradation percentage, since we are trying to predict the "binarized" version of it.

TODO: From investigating the feature importance, we see that the _concentration_ is actually the most important one. Because of that, it is removed in the following experiments.

TODO: If we remove the _concentration_, however, we might have many different entries with the _same data_ in the remaining columns.

In [None]:
removed_features = [
    'degradation', # NOTE: Must be removed, it's the "regression version" of y
    # 'concentration',
    'Smiles',
    'Smiles_nostereo',
    'poi_seq',
]

Instantiate Optuna objective and start optimization, _i.e._, training.

In [None]:
# Keep only the columns the model can consume.
X = X_train_upsampled.drop(removed_features, axis=1)
objective = LightGBMObjective(X, y_train_upsampled)
print(f"Number of trials: {prm['NTRIALS']}")
print(f"Trial ID (F1 score): ", end='')
# Maximize the objective's CV F1 with a seeded TPE sampler.
# NOTE(review): the objective never reports intermediate values to the trial,
# so the median pruner presumably never prunes anything -- confirm.
study = optuna.create_study(pruner=optuna.pruners.MedianPruner(n_warmup_steps=10), 
                            sampler=optuna.samplers.TPESampler(seed=1234),
                            direction='maximize')
study.optimize(objective, n_trials=prm['NTRIALS'], callbacks=[objective.callback])

In [None]:
# Collect the trial results into a DataFrame.
attributes = ('params', 'user_attrs', 'value', 'duration')
trials_df = study.trials_dataframe(attrs=attributes)
# Strip the '<prefix>_' part from the parameter / user-attribute columns.
# The loop variable is deliberately not called `y`, so the notebook-level `y`
# (label array) is not overwritten.
for prefix in ['params', 'user_attrs']:
    trials_df.columns = [
        col[1 + len(prefix):] if col.startswith(prefix) else col
        for col in trials_df.columns
    ]
# Durations as plain seconds instead of timedeltas.
trials_df['duration'] = trials_df['duration'].apply(lambda x: x.total_seconds())

with pd.option_context('display.max_rows', 6, 'display.float_format', '{:.4f}'.format):
    display_html(trials_df.sort_values('value', ascending=False))

In [None]:
print(f"Training time: {trials_df['duration'].sum() / 60:.1f}min")

In [None]:
trials_df.groupby('boosting_type').agg(meanv=('value', 'mean'), sdv=('value', 'std'))

Identify best model and re-train it.

In [None]:
# Position of the highest-scoring trial (assumes the rows of `trials_df` are
# in the same order as `study.trials`).
best_idx = np.argmax(trials_df['value'].values)
lgb_prm = study.trials[best_idx].params
# Add the fixed settings on top of the tuned parameters.
# NOTE(review): the final model uses the 'multiclass' objective with two
# classes, whereas the search above ran with the 'binary' objective.
lgb_prm.update({
    'objective': 'multiclass',
    'metric': None,
    'num_class': 2,
    'force_row_wise': True, 
    'verbosity': -1,
    'min_gain_to_split': .5,
})

def f1_eval(preds, dtrain):
    """Custom LightGBM metric: micro-averaged F1 of the arg-max class.

    Args:
        preds: Class scores, either as a 2-D array of shape
            (n_rows, n_classes) or as a flat array grouped by class first
            and then by row.
        dtrain: Dataset providing the true labels via `get_label()`.

    Returns:
        The tuple ``('f1_score', value, True)``; the last item tells LightGBM
        that higher is better.
    """
    preds = np.asarray(preds)
    if preds.ndim == 1:
        # Flat layout: reshape to (n_classes, n_rows), then put rows first.
        preds = preds.reshape(prm['NCLASS'], -1).T
    preds = preds.argmax(axis=1)
    f_score = f1_score(dtrain.get_label(), preds, average='micro')
    return 'f1_score', f_score, True

# Get dataset
# Balance classes via class weighting (unnecessary, we are already upsampling)
# wt = class_weight.compute_sample_weight(class_weight='balanced', y=y)
# dtrain = lgb.Dataset(X, y, weight=wt)
X = X_train_upsampled.drop(removed_features, axis=1)
dtrain = lgb.Dataset(X, y_train_upsampled)
# Re-train on the full (upsampled) training set with the best parameters.
# NOTE(review): no validation set is passed, so `feval` presumably never runs
# during this call -- confirm.
model = lgb.train(lgb_prm, dtrain, feval=f1_eval, num_boost_round=prm['NBOOST'])
model

#### Evaluation

In [None]:
# Score the test set: keep the model's output for class 1 as the score.
X = X_test.drop(removed_features, axis=1)
class_scores = model.predict(X)
y_hat = np.array(class_scores[:, 1])
scores_df = pd.DataFrame({'label': list(y_test), 'score': list(y_hat)})

# Register the scores under the name 'binary_gbm' for the metric plots.
blm = BinaryLabelMetrics()
blm.add_model('binary_gbm', scores_df)

In [None]:
blm.plot_roc(params={'legloc': 4})

In [None]:
# blm.plot(chart_types=[2, 5], params={'legloc': 2, 'chart_thresh': 0.5})

In [None]:
X = X_test.drop(removed_features, axis=1)
# The model was trained with two classes, so `predict` yields one score per
# class and column 1 belongs to class 1 ("active"). These scores are
# probabilities, so the decision threshold is 0.5 (not 50), and a high
# class-1 score must map to label 1.
proba_active = model.predict(X)[:, 1]
y_pred = (proba_active >= 0.5).astype(int)
conf_mat = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_mat,
                              display_labels=['inactive', 'active'])
g = disp.plot(cmap=plt.cm.Blues)
g = plt.title(f'Confusion Matrix')
# plt.savefig(os.path.join(fig_dir, f'conf_mat_{model_type}.pdf'))
plt.show()

[Plot feature importance](https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.plot_importance.html#lightgbm.plot_importance).

In [None]:
g = lgb.plot_importance(model, max_num_features=20, figsize=(9, 7))

In [None]:
for c in train.columns:
    if 'dss' in c:
        print(c)

## Metrics on Hold-Out Test Set

In [None]:
# Class-1 score for every hold-out sample.
# NOTE(review): unlike the evaluation cells above, `X_test` is passed without
# dropping `removed_features` -- confirm that this is the intended input.
y_hat = np.array([val[1] for val in model.predict(X_test)])
print(y_hat.shape, y_test.shape)
scores_df = pd.DataFrame({'label': list(y_test), 'score': list(y_hat)})

In [None]:
blm = BinaryLabelMetrics()
blm.add_model('binary_gbm', scores_df)

In [None]:
blm.plot()

In [None]:
blm.plot_roc()

In [None]:
import math

# Baseline row for the comparison table: AUC, PR value and the best F1 over
# all entries of the first F1 series, ignoring NaN entries.
f1_values = [score for score in blm._f1[0] if not math.isnan(score)]
original = ['original', blm._auc, blm._prrec, max(f1_values)]

## Feature Importances

In [None]:
# Column-name predicates for each feature group understood by `shuffle`.
_SHUFFLE_GROUPS = {
    'cellType': lambda c: 'ct' in c,
    'e3': lambda c: 'e3' in c,
    'ligand': lambda c: 'sm' in c,
    'receptor': lambda c: len(c) in (2, 3) and 'sm' not in c,
}


def shuffle(var, df, random_state=None):
    """Return a copy of `df` with the columns of one feature group shuffled.

    The group's columns are selected by name from `df` itself: 'cellType'
    matches names containing 'ct', 'e3' names containing 'e3', 'ligand' names
    containing 'sm', and 'receptor' names of length 2 or 3 not containing 'sm'.
    Every selected column is permuted on its own; all other columns and `df`
    itself are left untouched.

    Args:
        var: Feature group to shuffle: 'cellType', 'e3', 'ligand' or
            'receptor'.
        df: DataFrame whose column names are strings.
        random_state: Optional seed forwarded to `Series.sample`. When given,
            the result is reproducible and every selected column receives the
            same permutation.

    Returns:
        A deep copy of `df` with the selected columns shuffled.

    Raises:
        ValueError: If `var` is not one of the known feature groups.
    """
    try:
        is_selected = _SHUFFLE_GROUPS[var]
    except KeyError:
        raise ValueError(
            f'unknown feature group {var!r}; '
            f'expected one of {sorted(_SHUFFLE_GROUPS)}') from None

    new_df = df.copy(deep=True)
    selected_cols = [c for c in df.columns if is_selected(c)]
    for col in selected_cols:
        # `.values` drops the shuffled index so the assignment is positional.
        new_df[col] = new_df[col].sample(frac=1, random_state=random_state).values
    return new_df

In [None]:
import math

# Permutation-style importance: shuffle one feature group at a time, score
# the shuffled test set and record AUC, PR value and best F1 per group.
metrics = [['shuffled_var', 'auc', 'pr_rec', 'f1']]
for var in ['cellType', 'e3', 'ligand', 'receptor']:
    new_df = shuffle(var, test)
    # A dedicated name is used so the notebook-wide `X_test` is not
    # overwritten with shuffled data.
    X_shuffled = new_df.drop(['resp_categorical', 'resp', 'Smiles'], axis=1).values
    y_hat = np.array([val[1] for val in model.predict(X_shuffled)])
    print(y_hat.shape, y_test.shape, y_hat)
    scores_df = pd.DataFrame({'label': list(y_test), 'score': list(y_hat)})
    blm = BinaryLabelMetrics()
    blm.add_model('binary_gbm', scores_df)
    lst = blm._f1[0]
    # Ignore NaN entries when picking the best F1.
    newlist = [x for x in lst if not math.isnan(x)]
    metrics.append([var, blm._auc, blm._prrec, max(newlist)])
# Un-shuffled baseline for comparison.
metrics.append(original)

In [None]:
metrics

## POI Encoding