## GATv2

In [None]:
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from hyperopt import STATUS_OK, Trials, fmin, hp, space_eval, tpe
from sklearn.metrics import root_mean_squared_error, r2_score, cohen_kappa_score
from sklearn.model_selection import KFold

from data.featurization.dgl_Graph import DGL_Graph
from model.dgl.GATv2 import GATv2

In [None]:
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

### Regression Problems

In [None]:
# Read the gzip-compressed training and test tables for the regression task.
trn = pd.read_csv("../data/trn.reg.csv.gz", compression='gzip', low_memory=False)
tst = pd.read_csv("../data/tst.reg.csv.gz", compression='gzip', low_memory=False)

# Inputs are the SMILES strings, targets are the LogS values.
trn_X, trn_y = trn["SMILES"], trn["LogS"]
tst_X, tst_y = tst["SMILES"], tst["LogS"]

In [None]:
# Featurizer that turns the SMILES column into the inputs expected by GATv2.
featurizer = DGL_Graph(
    graph_type="BI_GRAPH",
    featurize_type="Canonical",
    self_loop=True
)
# Convert from the raw SMILES columns rather than from trn_X / tst_X themselves:
# the cell can then be re-run without feeding already-converted inputs back
# into `convert`.
trn_X = featurizer.convert(trn["SMILES"])
tst_X = featurizer.convert(tst["SMILES"])

Hyper-parameter Tuning.

In [None]:
# Hyperopt search space for the GATv2 model; it is the `space` passed to `fmin`
# in both the regression and the classification tuning cells of this notebook.
tune_space = {
    # Integer-valued parameter sampled with hp.randint from the (2, 6) range.
    "num_heads": hp.randint("num_heads", 2, 6),
    # Candidate hidden-layer size lists; one of them is picked per trial.
    "hidden_feats": hp.choice("hidden_feats", [[64], [64, 32], [128, 64, 32], [64, 64], [128, 64]]),
    # Continuous parameters sampled uniformly between 0 and 1.
    "feat_drops": hp.uniform("feat_drops", 0, 1),
    "attn_drops": hp.uniform("attn_drops", 0, 1),
    "alphas": hp.uniform("alphas", 0, 1),
    # Categorical switches.
    "residuals": hp.choice("residuals", [True, False]),
    "agg_modes": hp.choice("agg_modes", ["flatten", "mean"]),
    "biases": hp.choice("biases", [True, False]),
    "allow_zero_in_degree": hp.choice("allow_zero_in_degree", [True, False]),
    "share_weights": hp.choice("share_weights", [True, False]),
    # Plain value, not searched.
    "get_attention": False,
    # Optimiser settings: discrete learning rates, uniform weight decay in [0, 1].
    "lr": hp.choice("lr", [0.1, 0.01, 0.001]),
    "weight_decay": hp.uniform("weight_decay", 0, 1),
    # Plain value, not searched.
    "batch_size": 128,
}

In [None]:
def tune_obj(space):
    """Hyperopt objective for the regression task.

    Trains one GATv2 model per fold of a shuffled 5-fold split of the training
    data and scores it on the held-out fold.

    Args:
        space: Mapping of sampled hyper-parameters (one draw from `tune_space`).

    Returns:
        dict with the mean validation RMSE over the folds under ``"loss"``
        (lower is better, as `fmin` minimises it) and ``"status"`` set to
        ``STATUS_OK``.
    """
    # The split is unseeded, so every call (i.e. every trial) uses different folds.
    kf = KFold(n_splits=5, shuffle=True)
    rmse = []

    for trn_idx, val_idx in kf.split(trn_X):
        tX, vX = trn_X[trn_idx], trn_X[val_idx]
        # KFold yields row positions, so select targets positionally; plain
        # `trn_y[idx]` is label-based and only agrees with positions when the
        # Series carries a default 0..n-1 index.
        ty, vy = trn_y.iloc[trn_idx], trn_y.iloc[val_idx]

        model = GATv2(
            n_tasks=1,
            in_feats=featurizer.get_node_feat_size(),
            hidden_feats=space["hidden_feats"],
            num_heads=space["num_heads"],
            feat_drops=space["feat_drops"],
            attn_drops=space["attn_drops"],
            alphas=space["alphas"],
            residuals=space["residuals"],
            biases=space["biases"],
            agg_modes=space["agg_modes"],
            allow_zero_in_degree=space["allow_zero_in_degree"],
            share_weights=space["share_weights"],
            predictor_out_feats=128,
            predictor_dropout=0,
            get_attention=space["get_attention"],
            lr=space["lr"],
            weight_decay=space["weight_decay"],
            # Honour the batch size declared in the space; fall back to the
            # previously hard-coded 128 when the key is absent.
            batch_size=space.get("batch_size", 128),
        )
        model.fit(tX, ty,
                  epochs=800, min_epochs=400, early_stop_epochs=20, verbose=False)
        rmse.append(root_mean_squared_error(vy, model.predict(vX).cpu()))

    return {"loss": np.mean(rmse), 'status': STATUS_OK}

In [None]:
# Run the TPE search over `tune_space`, minimising the loss returned by `tune_obj`.
trials = Trials()
# `fmin` reports hp.choice parameters as option indices and leaves out the
# constant entries of the space; `space_eval` maps its result back to the
# actual values so the dict can be used directly as model arguments.
best_params = space_eval(
    tune_space,
    fmin(
        fn=tune_obj,
        space=tune_space,
        algo=tpe.suggest,
        max_evals=500,
        trials=trials
    ),
)

In [ ]:
# Show the outcome of the hyper-parameter search.
best_params

In [None]:
# Hyper-parameters used by the cross-validation and prediction cells that follow.
# This literal replaces whatever the tuning cell assigned to `best_params`;
# presumably the values were copied from an earlier tuning run — TODO confirm.
best_params = {
    "num_heads": 5,
    "hidden_feats": [128, 64],
    "feat_drops": 0.028992495735341864,
    "attn_drops": 0.5312766550561073,
    "alphas": 0.9823528273902176,
    "residuals": True,
    "biases": True,
    "agg_modes": "flatten",
    "allow_zero_in_degree": False,
    "share_weights": False,
    "get_attention": False,
    "lr": 0.001,
    "weight_decay": 0.007911109145324904,
    "batch_size": 128,
}

In [None]:
def k_fold_CV(n_splits=5):
    """Cross-validate GATv2 with `best_params` on the training data.

    Args:
        n_splits: Number of folds of the shuffled (unseeded) KFold split.

    Returns:
        pandas.DataFrame with one row per fold and the columns ``rmse`` and
        ``r2``, both computed on that fold's held-out part.
    """
    kf = KFold(n_splits=n_splits, shuffle=True)
    rmse, r2 = [], []

    for trn_idx, val_idx in kf.split(trn_X):
        tX, vX = trn_X[trn_idx], trn_X[val_idx]
        # KFold yields row positions, so select targets positionally; plain
        # `trn_y[idx]` is label-based and only agrees with positions when the
        # Series carries a default 0..n-1 index.
        ty, vy = trn_y.iloc[trn_idx], trn_y.iloc[val_idx]

        model = GATv2(
            n_tasks=1,
            in_feats=featurizer.get_node_feat_size(),
            hidden_feats=best_params["hidden_feats"],
            num_heads=best_params["num_heads"],
            feat_drops=best_params["feat_drops"],
            attn_drops=best_params["attn_drops"],
            alphas=best_params["alphas"],
            residuals=best_params["residuals"],
            biases=best_params["biases"],
            agg_modes=best_params["agg_modes"],
            allow_zero_in_degree=best_params["allow_zero_in_degree"],
            share_weights=best_params["share_weights"],
            predictor_out_feats=128,
            predictor_dropout=0,
            get_attention=best_params["get_attention"],
            lr=best_params["lr"],
            weight_decay=best_params["weight_decay"],
            # Use the configured batch size; 128 was the hard-coded value.
            batch_size=best_params.get("batch_size", 128),
        )
        # NOTE(review): the held-out fold is passed to fit as validation data
        # and is also the fold scored below — confirm how fit uses val_X/val_y.
        model.fit(tX, ty, val_X=vX, val_y=vy, epochs=400)
        pred_val = model.predict(vX).cpu()

        rmse.append(root_mean_squared_error(vy, pred_val))
        r2.append(r2_score(vy, pred_val))

    return pd.DataFrame({"rmse": rmse, "r2": r2})

In [None]:
# Repeat the cross-validation 10 times (each call reshuffles the folds) and
# stack the per-fold scores into one table.
pd.concat([k_fold_CV() for _ in range(10)])

Prediction and evaluation on the test set.

In [None]:
def predict():
    """Predict the regression test set with a 5-fold ensemble.

    One GATv2 model is trained per fold of a shuffled (unseeded) 5-fold split
    of the training data, using `best_params`; each model predicts `tst_X`.

    Returns:
        list of float: for every test sample, the mean of the fold models'
        predictions.
    """
    kf = KFold(n_splits=5, shuffle=True)
    prediction = []

    for trn_idx, val_idx in kf.split(trn_X):
        tX, vX = trn_X[trn_idx], trn_X[val_idx]
        # KFold yields row positions, so select targets positionally; plain
        # `trn_y[idx]` is label-based and only agrees with positions when the
        # Series carries a default 0..n-1 index.
        ty, vy = trn_y.iloc[trn_idx], trn_y.iloc[val_idx]

        model = GATv2(
            n_tasks=1,
            in_feats=featurizer.get_node_feat_size(),
            hidden_feats=best_params["hidden_feats"],
            num_heads=best_params["num_heads"],
            feat_drops=best_params["feat_drops"],
            attn_drops=best_params["attn_drops"],
            alphas=best_params["alphas"],
            residuals=best_params["residuals"],
            biases=best_params["biases"],
            agg_modes=best_params["agg_modes"],
            allow_zero_in_degree=best_params["allow_zero_in_degree"],
            share_weights=best_params["share_weights"],
            predictor_out_feats=128,
            predictor_dropout=0,
            get_attention=best_params["get_attention"],
            lr=best_params["lr"],
            weight_decay=best_params["weight_decay"],
            # Use the configured batch size; 128 was the hard-coded value.
            batch_size=best_params.get("batch_size", 128),
        )
        model.fit(tX, ty, val_X=vX, val_y=vy, epochs=800, min_epochs=400, early_stop_epochs=20)
        prediction.append(model.predict(tst_X).cpu())

    # Concatenate the fold predictions along dim 1 (one column block per fold),
    # then average each row.
    # assumes model.predict returns a 2-D (n_samples, n_tasks) tensor — TODO confirm
    return [torch.mean(pred_i).item() for pred_i in torch.cat(prediction, 1)]

In [None]:
# Run the fold ensemble 10 times; every run becomes one column of `preds`.
preds = pd.concat(
    [pd.Series(predict()) for _ in range(10)],
    axis=1,
)

In [None]:
# Per test subset (the "set" column of `tst`): one RMSE and one R² per prediction run.
rmse, r2 = defaultdict(list), defaultdict(list)

for col in preds.columns:
    run_df = pd.DataFrame({"pred": preds[col], "set": tst["set"], "true": tst["LogS"]})
    for set_name in run_df["set"].unique():
        subset = run_df[run_df["set"] == set_name]
        y_true, y_pred = subset["true"], subset["pred"]
        rmse[set_name].append(root_mean_squared_error(y_true, y_pred))
        r2[set_name].append(r2_score(y_true, y_pred))

In [None]:
# Report mean ± standard deviation of RMSE and R² over the prediction runs,
# one line per test subset.
for s in rmse.keys():
    print(f"[{s}] rmse:{np.mean(rmse[s]):.2f}±{np.std(rmse[s]):.2f} r2:{np.mean(r2[s]):.2f}±{np.std(r2[s]):.2f}")

In [None]:
def subplot(x, y, ax):
    """Draw an experimental-vs-predicted scatter panel on ``ax``.

    The panel uses identical x and y limits (data range padded by 0.1), and
    shows the points, the identity line in black and a first-degree
    least-squares fit of ``y`` against ``x``.

    Args:
        x: Values plotted on the horizontal axis (labelled "Experimental").
        y: Values plotted on the vertical axis (labelled "Predicted").
        ax: Axes-like object to draw on.
    """
    ax.scatter(x, y)

    # Shared bounds covering both series, computed once.
    lo = min(min(x), min(y))
    hi = max(max(x), max(y))
    limits = (lo - 0.1, hi + 0.1)
    ax.set_xlim(limits)
    ax.set_ylim(limits)

    # Aspect is derived from the limits the axes report back.
    x_lo, x_hi = ax.get_xlim()
    y_lo, y_hi = ax.get_ylim()
    ax.set_aspect(abs(x_hi - x_lo) / abs(y_hi - y_lo))
    ax.grid(which='major', linestyle='--')

    # Identity line.
    ax.plot([lo, hi], [lo, hi], 'k')

    # Linear fit of y on x.
    slope, intercept = np.polyfit(x, y, 1)
    ax.plot(x, slope * x + intercept)

    ax.set_xlabel("log$S$ Experimental")
    ax.set_ylabel("log$S$ Predicted")


model_name = "GATv2"
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(5 * 3, 5))

# Pick the prediction run whose RMSE, averaged over the test subsets, is lowest.
mean_rmse_per_run = np.array(list(rmse.values())).mean(axis=0)
best_run = np.argmin(mean_rmse_per_run)
df = pd.DataFrame({"pred": preds.iloc[:, best_run],
                   "set": tst["set"], "true": tst["LogS"]})

# One panel per test subset, titled with that subset's RMSE and R².
for s, ax in zip(rmse.keys(), axs):
    idx = tst.index[tst["set"] == s]
    t = df["true"].loc[idx]
    p = df["pred"].loc[idx]
    subplot(t, p, ax)
    ax.grid(False)
    ax.set_title(f"{s} ({model_name})   "
                 f"RMSE: {root_mean_squared_error(t, p):.3f}, "
                 f"R$^2$: {r2_score(t, p):.3f}")

### Classification Problem

In [None]:
# The EUOS-SLAS data is split over several CSV parts. ignore_index=True gives
# each combined frame a unique 0..n-1 index; without it every part keeps its own
# row labels, and the label-based `trn_y[trn_idx]` lookups in the K-fold loops
# would hit duplicated or missing labels.
trn = pd.concat(
    [pd.read_csv(f"../data/trn.EUOS-SLAS.Part{i}.csv.gz") for i in range(1, 9)],
    ignore_index=True,
)
tst = pd.concat(
    [pd.read_csv(f"../data/tst.EUOS-SLAS.Part{i}.csv.gz") for i in range(1, 5)],
    ignore_index=True,
)

trn_X = trn["SMILES"]
tst_X = tst["SMILES"]
trn_y = trn["solubility"]

In [None]:
# Featurizer that turns the SMILES column into the inputs expected by GATv2.
featurizer = DGL_Graph(
    graph_type="BI_GRAPH",
    featurize_type="Canonical",
    self_loop=True
)
# Convert from the raw SMILES columns rather than from trn_X / tst_X themselves:
# the cell can then be re-run without feeding already-converted inputs back
# into `convert`.
trn_X = featurizer.convert(trn["SMILES"])
tst_X = featurizer.convert(tst["SMILES"])

Hyper-parameter Tuning.

In [None]:
def tune_obj(space):
    """Hyperopt objective for the classification task.

    Trains one GATv2 model per fold of a shuffled (unseeded) 5-fold split and
    collects the last ``"qck"`` entry of the score history returned by ``fit``.

    Args:
        space: Mapping of sampled hyper-parameters (one draw from `tune_space`).

    Returns:
        dict with ``"loss"`` set to the negated mean ``qck`` over the folds and
        ``"status"`` set to ``STATUS_OK``.
    """
    kf = KFold(n_splits=5, shuffle=True)
    qck = []

    for trn_idx, val_idx in kf.split(trn_X):
        tX, vX = trn_X[trn_idx], trn_X[val_idx]
        # KFold yields row positions, so select targets positionally; `trn` is
        # a concatenation of several files, so its index labels need not match
        # row positions.
        ty, vy = trn_y.iloc[trn_idx], trn_y.iloc[val_idx]

        model = GATv2(
            n_tasks=1,
            in_feats=featurizer.get_node_feat_size(),
            hidden_feats=space["hidden_feats"],
            num_heads=space["num_heads"],
            feat_drops=space["feat_drops"],
            attn_drops=space["attn_drops"],
            alphas=space["alphas"],
            residuals=space["residuals"],
            biases=space["biases"],
            agg_modes=space["agg_modes"],
            allow_zero_in_degree=space["allow_zero_in_degree"],
            share_weights=space["share_weights"],
            predictor_out_feats=128,
            predictor_dropout=0,
            get_attention=space["get_attention"],
            lr=space["lr"],
            weight_decay=space["weight_decay"],
            # Honour the batch size declared in the space; fall back to the
            # previously hard-coded 128 when the key is absent.
            batch_size=space.get("batch_size", 128),
        )
        scores = model.fit(tX, ty,
                           val_X=vX, val_y=vy,
                           epochs=800, min_epochs=500, early_stop_epochs=10, verbose=False)
        qck.append(scores["qck"][-1])

    # fmin minimises "loss", so the agreement score has to be negated to be
    # maximised.
    # NOTE(review): assumes scores["qck"] is quadratic-weighted Cohen's kappa
    # (higher is better) — confirm against GATv2.fit.
    return {"loss": -np.mean(qck), 'status': STATUS_OK}

In [None]:
# Run the TPE search over `tune_space`, minimising the loss returned by `tune_obj`.
trials = Trials()
# `fmin` reports hp.choice parameters as option indices and leaves out the
# constant entries of the space (e.g. "get_attention"). `predict()` reads
# `best_params` as real model arguments, so map the result back to actual
# values with `space_eval`.
best_params = space_eval(
    tune_space,
    fmin(
        fn=tune_obj,
        space=tune_space,
        algo=tpe.suggest,
        max_evals=500,
        trials=trials
    ),
)

In [ ]:
# Show the outcome of the hyper-parameter search.
best_params

Prediction and estimation.

In [22]:
def predict():
    """Predict class labels for the classification test set with a 5-fold ensemble.

    One GATv2 model is trained per fold of a shuffled (unseeded) 5-fold split
    of the training data, using `best_params`; each model predicts `tst_X`.

    Returns:
        torch.Tensor on the CPU holding, per test sample, the argmax over dim 1
        of the fold-averaged model outputs.
    """
    kf = KFold(n_splits=5, shuffle=True)
    prediction = []

    for trn_idx, val_idx in kf.split(trn_X):
        tX, vX = trn_X[trn_idx], trn_X[val_idx]
        # KFold yields row positions, so select targets positionally; `trn` is
        # a concatenation of several files, so its index labels need not match
        # row positions.
        ty, vy = trn_y.iloc[trn_idx], trn_y.iloc[val_idx]

        model = GATv2(
            n_tasks=1,
            in_feats=featurizer.get_node_feat_size(),
            hidden_feats=best_params["hidden_feats"],
            num_heads=best_params["num_heads"],
            feat_drops=best_params["feat_drops"],
            attn_drops=best_params["attn_drops"],
            alphas=best_params["alphas"],
            residuals=best_params["residuals"],
            biases=best_params["biases"],
            agg_modes=best_params["agg_modes"],
            allow_zero_in_degree=best_params["allow_zero_in_degree"],
            share_weights=best_params["share_weights"],
            predictor_out_feats=128,
            predictor_dropout=0,
            get_attention=best_params["get_attention"],
            lr=best_params["lr"],
            weight_decay=best_params["weight_decay"],
            # Use the configured batch size; 128 was the hard-coded value.
            batch_size=best_params.get("batch_size", 128),
        )
        model.fit(tX, ty, val_X=vX, val_y=vy, epochs=800, min_epochs=400, early_stop_epochs=20)
        prediction.append(model.predict(tst_X))

    # Average the fold outputs element-wise, then take the highest-scoring column.
    # NOTE(review): assumes model.predict returns (n_samples, n_classes) scores
    # even though n_tasks=1 — confirm, otherwise the argmax is always 0.
    return torch.argmax(torch.stack(prediction).mean(dim=0), dim=1).cpu()

In [None]:
# Run the fold ensemble 50 times; after transposing, each column holds one run
# and each row one test sample.
preds = pd.DataFrame([predict().numpy().tolist() for _ in range(50)]).T
preds

Upload the predictions to the challenge to get feedback on the model's performance.