In [1]:
# Enable IPython's autoreload extension in mode 2, so that edited modules are
# re-imported automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [13]:
import itertools
import os
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import Markdown
from matminer.datasets import load_dataset

# from modnet.models import MODNetModel
from modnet.featurizers import MODFeaturizer
from modnet.featurizers.presets import DeBreuck2020Featurizer
from modnet.models import MODNetModel
from modnet.preprocessing import MODData
from pymatgen.core import Composition

from gptchem.data import get_matbench_is_metal

# Expose no CUDA devices to this process (empty device list) — presumably to keep
# model training on the CPU; TODO confirm it still takes effect when set after the imports above.
os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [3]:
import modnet

# Display the installed MODNet version so the environment is recorded next to the results.
modnet.__version__

'0.1.12'

In [14]:
# Load the is_metal benchmark table through gptchem's helper.
# NOTE(review): the featurization cell further down reloads the dataset with matminer's
# load_dataset instead of reusing this frame — confirm both sources hold the same rows
# in the same order before mixing indices between them.
df = get_matbench_is_metal()

In [15]:
class CompositionOnlyFeaturizer(DeBreuck2020Featurizer):
    """DeBreuck2020 featurizer preset with three featurizer groups emptied.

    After the parent initializer has run, the oxidation-state composition,
    structure and site featurizer groups are replaced by empty tuples on the
    instance. Every other group configured by the parent preset is left as is.
    The inputs featurized in this notebook are ``Composition`` objects built
    from formula strings, so presumably only composition-level featurizers
    are applicable.
    """

    def __init__(self):
        super().__init__()
        # Blank out every group that this notebook does not want applied.
        for group_name in (
            "oxid_composition_featurizers",
            "structure_featurizers",
            "site_featurizers",
        ):
            setattr(self, group_name, ())

In [17]:
# Location of the cached, already featurized MODData object.
PRECOMPUTED_MODDATA = "./precomputed/expt_is_metal_benchmark_moddata.pkl.gz"

if os.path.isfile(PRECOMPUTED_MODDATA):
    # Cache hit: reuse the stored object instead of featurizing again.
    data = MODData.load(PRECOMPUTED_MODDATA)
else:
    # Create the cache directory *before* the expensive work starts, so the final
    # save cannot fail on a missing folder once featurization has already finished.
    os.makedirs(os.path.dirname(PRECOMPUTED_MODDATA) or ".", exist_ok=True)

    # Use a fresh copy of the dataset
    df = load_dataset("matbench_expt_is_metal")
    # Turn each formula into a pymatgen Composition; these objects are what gets
    # handed to MODData as the materials.
    df["structure"] = df["composition"].map(Composition)

    data = MODData(
        materials=df["structure"].tolist(),
        targets=df["is_metal"].tolist(),
        target_names=["is_metal"],
        featurizer=CompositionOnlyFeaturizer(),
        num_classes={"is_metal": 2},
    )
    data.featurize()
    # As this is a small data/feature set, order all features
    data.feature_selection(n=-1)
    data.save(PRECOMPUTED_MODDATA)

2023-02-07 17:27:49,179 - modnet - INFO - Loaded CompositionOnlyFeaturizer featurizer.
2023-02-07 17:27:49,188 - modnet - INFO - Computing features, this can take time...
2023-02-07 17:27:49,190 - modnet - INFO - Applying composition featurizers...
2023-02-07 17:27:49,231 - modnet - INFO - Applying featurizers (AtomicOrbitals(), AtomicPackingEfficiency(), BandCenter(), ElementFraction(), ElementProperty(data_source=<matminer.utils.data.MagpieData object at 0x1750c8460>,
                features=['Number', 'MendeleevNumber', 'AtomicWeight',
                          'MeltingT', 'Column', 'Row', 'CovalentRadius',
                          'Electronegativity', 'NsValence', 'NpValence',
                          'NdValence', 'NfValence', 'NValence', 'NsUnfilled',
                          'NpUnfilled', 'NdUnfilled', 'NfUnfilled', 'NUnfilled',
                          'GSvolume_pa', 'GSbandgap', 'GSmagmom',
                          'SpaceGroupNumber'],
                stats=['minimum', 'm

MultipleFeaturizer:   0%|          | 0/4921 [00:00<?, ?it/s]

  alp_a = np.multiply(1.5, np.power(v_a, 2 / 3)) / reduce(lambda x, y: 1 / x + 1 / y, np.power(n_ws, 1 / 3))
  alp_a = np.multiply(1.5, np.power(v_a, 2 / 3)) / reduce(lambda x, y: 1 / x + 1 / y, np.power(n_ws, 1 / 3))
  alp_a = np.multiply(1.5, np.power(v_a, 2 / 3)) / reduce(lambda x, y: 1 / x + 1 / y, np.power(n_ws, 1 / 3))
  alp_a = np.multiply(1.5, np.power(v_a, 2 / 3)) / reduce(lambda x, y: 1 / x + 1 / y, np.power(n_ws, 1 / 3))
  alp_a = np.multiply(1.5, np.power(v_a, 2 / 3)) / reduce(lambda x, y: 1 / x + 1 / y, np.power(n_ws, 1 / 3))
  alp_a = np.multiply(1.5, np.power(v_a, 2 / 3)) / reduce(lambda x, y: 1 / x + 1 / y, np.power(n_ws, 1 / 3))
  alp_a = np.multiply(1.5, np.power(v_a, 2 / 3)) / reduce(lambda x, y: 1 / x + 1 / y, np.power(n_ws, 1 / 3))
  alp_a = np.multiply(1.5, np.power(v_a, 2 / 3)) / reduce(lambda x, y: 1 / x + 1 / y, np.power(n_ws, 1 / 3))
  alp_a = np.multiply(1.5, np.power(v_a, 2 / 3)) / reduce(lambda x, y: 1 / x + 1 / y, np.power(n_ws, 1 / 3))
  alp_a = np.multip

2023-02-07 17:39:38,821 - modnet - INFO - Data has successfully been featurized!
2023-02-07 17:39:38,847 - modnet - INFO - Multiprocessing on 1 workers.
2023-02-07 17:39:38,849 - modnet - INFO - Computing "self" MI (i.e. information entropy) of features


100%|██████████| 270/270 [00:05<00:00, 51.90it/s]

2023-02-07 17:39:44,064 - modnet - INFO - Computing cross NMI between all features...



100%|██████████| 19110/19110 [05:00<00:00, 63.58it/s]


2023-02-07 17:44:45,335 - modnet - INFO - Starting target 1/1: is_metal ...
2023-02-07 17:44:45,353 - modnet - INFO - Computing mutual information between features and target...


  mutual_info.loc[:, target_name] = _mifun(df_feat, df_target[target_name], **kwargs)


2023-02-07 17:44:52,667 - modnet - INFO - Computing optimal features...
2023-02-07 17:44:54,164 - modnet - INFO - Selected 50/196 features...
2023-02-07 17:44:55,388 - modnet - INFO - Selected 100/196 features...
2023-02-07 17:44:56,304 - modnet - INFO - Selected 150/196 features...
2023-02-07 17:44:56,569 - modnet - INFO - Done with target 1/1: is_metal.
2023-02-07 17:44:56,570 - modnet - INFO - Merging all features...
2023-02-07 17:44:56,570 - modnet - INFO - Done.
2023-02-07 17:44:58,638 - modnet - INFO - Data successfully saved as ./precomputed/expt_is_metal_benchmark_moddata.pkl.gz!


In [16]:
# Hyperparameters used for the MODNet classifier in the cells below.
# No "out_act" or "xscale" entry is defined here.
best_settings = dict(
    increase_bs=False,
    num_neurons=[[128], [32], [32], [16]],
    n_feat=120,
    lr=0.005,
    epochs=100,
    verbose=0,
    act="elu",
    batch_size=64,
    num_classes={"is_metal": 2},
    loss="categorical_crossentropy",
)

In [8]:
# Bare attribute access: this only displays the bound MODData.split method.
# The method is not called, so no split is performed by this cell.
data.split

<bound method MODData.split of <modnet.preprocessing.MODData object at 0x1222f4070>>

In [22]:
# Build the MODNet classifier from the hyperparameters stored in `best_settings`.
model = MODNetModel(
    # Nested target grouping containing the single target "is_metal".
    [[["is_metal"]]],
    # Per-target weight — presumably the weight of this target in the loss; TODO confirm.
    {"is_metal": 1},
    num_neurons=best_settings["num_neurons"],
    num_classes=best_settings.get("num_classes"),
    act=best_settings.get("act"),
    # `best_settings` defines no "out_act" key, so this resolves to the "linear" fallback.
    out_act=best_settings.get("out_act", "linear"),
    n_feat=best_settings["n_feat"],
)

In [23]:
# Fit on the complete `data` object (no hold-out is made in this cell), with the
# training options taken from `best_settings`.
model.fit(
    data,
    epochs=best_settings["epochs"],
    batch_size=best_settings["batch_size"],
    verbose=best_settings["verbose"],
    lr=best_settings["lr"],
    loss=best_settings["loss"],
)

  super().__init__(name, **kwargs)


In [24]:
# Predict on the same `data` object the model was fitted on: these are in-sample
# predictions, useful as a sanity check rather than as a generalization estimate.
predictions = model.predict(data)



In [25]:
# Display the prediction table (rows keyed by material id, one `is_metal` column).
predictions

Unnamed: 0,is_metal
id0,1
id1,1
id2,0
id3,0
id4,1
...,...
id4916,0
id4917,1
id4918,1
id4919,1


In [28]:
# --- Configuration of the learning-curve baseline ---------------------------------

# Training-set sizes to sweep; each value is handed to `train_test` as `train_size`.
num_train_points = [10, 50, 100, 200, 500, 1000]

# How often the whole sweep is repeated.
NUM_REPEATS = 10

# Folder that receives the pickled metrics of every run.
OUTDIR = "out_baseline"

# Load the cached featurized dataset from disk; every run splits this one object.
DATA = MODData.load(PRECOMPUTED_MODDATA)

# Hyperparameters shared by every run of the sweep.
# No "out_act" or "xscale" entry is defined here.
BEST_SETTINGS = dict(
    increase_bs=False,
    num_neurons=[[128], [32], [32], [16]],
    n_feat=120,
    lr=0.005,
    epochs=100,
    verbose=0,
    act="elu",
    batch_size=64,
    num_classes={"is_metal": 2},
    loss="categorical_crossentropy",
)

import time
from pathlib import Path

from fastcore.xtras import save_pickle
from sklearn.model_selection import train_test_split

from gptchem.evaluator import evaluate_classification


def train_test(train_size, seed: int = 42):
    """Train MODNet on a stratified subset of ``DATA`` and score it on the remainder.

    Args:
        train_size: Size of the training subset, forwarded unchanged to
            ``train_test_split``.
        seed: Random state of the stratified split. It is only passed to the
            split and recorded in the result; model training itself is not
            seeded by this function.

    Returns:
        The object returned by ``evaluate_classification``, extended with the
        ``train_size``, ``seed``, ``pred`` and ``true`` entries. The same object
        is pickled into ``OUTDIR``.

    Raises:
        AssertionError: If the split or the predictions do not have the
            expected number of rows.
    """
    # Stratify on the labels stored in DATA itself: the positional indices drawn
    # here are applied to DATA, so taking the labels from the same object keeps
    # indices and labels aligned. It also avoids reloading the dataset per call.
    labels = DATA.df_targets["is_metal"].values
    train_idx, test_idx = train_test_split(
        np.arange(len(labels)), train_size=train_size, random_state=seed, stratify=labels
    )
    train_data, test_data = DATA.split((train_idx, test_idx))
    assert len(train_data.df_targets) == len(train_idx)
    assert len(test_data.df_targets) == len(test_idx)

    model = MODNetModel(
        [[["is_metal"]]],
        {"is_metal": 1},
        num_neurons=BEST_SETTINGS["num_neurons"],
        num_classes=BEST_SETTINGS.get("num_classes"),
        act=BEST_SETTINGS.get("act"),
        # BEST_SETTINGS defines no "out_act" key, so the "linear" fallback is used.
        out_act=BEST_SETTINGS.get("out_act", "linear"),
        n_feat=BEST_SETTINGS["n_feat"],
    )

    model.fit(
        train_data,
        epochs=BEST_SETTINGS["epochs"],
        batch_size=BEST_SETTINGS["batch_size"],
        verbose=BEST_SETTINGS["verbose"],
        lr=BEST_SETTINGS["lr"],
        loss=BEST_SETTINGS["loss"],
    )

    predictions = model.predict(test_data)
    true = test_data.df_targets["is_metal"].values.astype(int)
    pred = predictions["is_metal"].values.astype(int)
    assert len(predictions) == len(test_data.df_targets) == len(pred)

    metrics = evaluate_classification(true, pred)
    print(f"Train size {train_size} - {metrics['accuracy']}")
    metrics["train_size"] = train_size
    metrics["seed"] = seed
    metrics["pred"] = pred
    metrics["true"] = true

    # Create the output folder on demand so a fresh checkout does not fail on save.
    out_dir = Path(OUTDIR)
    out_dir.mkdir(parents=True, exist_ok=True)

    # The timestamp only has one-second resolution; adding train size and seed keeps
    # two runs that finish within the same second from overwriting each other.
    timestr = time.strftime("%Y%m%d-%H%M%S")
    save_pickle(out_dir / f"metrics_{timestr}_size{train_size}_seed{seed}.pkl", metrics)
    return metrics


# Run the full sweep: every repeat index is combined with every training-set size,
# repeats in the outer position, and the repeat index is used as the split seed.
for i, train_size in itertools.product(range(NUM_REPEATS), num_train_points):
    train_test(train_size, seed=i)

2023-02-07 17:54:06,089 - modnet - INFO - Loaded <modnet.preprocessing.MODData object at 0x291803be0> object, created with modnet version 0.1.12
Train size 10 - 0.6963958460598656
Train size 50 - 0.7624717717101211
Train size 100 - 0.8041900020742585
Train size 200 - 0.8360516839652616
Train size 500 - 0.8731056322099072
Train size 1000 - 0.8765621015047181
Train size 10 - 0.7613520667888414
Train size 50 - 0.8045575857113529
Train size 100 - 0.8216137730761253
Train size 200 - 0.8345689472569371
Train size 500 - 0.8599864284098621
Train size 1000 - 0.8709512879367508
Train size 10 - 0.7116676847892486
Train size 50 - 0.7947033463354547
Train size 100 - 0.8423563576021572
Train size 200 - 0.8421944503283203
Train size 500 - 0.8617959737615924
Train size 1000 - 0.8780923233868911
Train size 10 - 0.6919161066992466
Train size 50 - 0.8061999589406693
Train size 100 - 0.8027380211574362
Train size 200 - 0.8216479559415378
Train size 500 - 0.8620221669305587
Train size 1000 - 0.874521805661