In [1]:
from colabfit.tools.database import MongoDatabase, load_data
from colabfit.tools.property_settings import PropertySettings

# Database client shared by every later cell.
# NOTE(review): a `drop_database=True` argument was left disabled on this call;
# presumably it clears the database on connect - confirm before enabling.
client = MongoDatabase(
    'colabfit_database',
    nprocs=1,
)

In [2]:
# Directory holding the AlNiTi training files.
# NOTE(review): absolute, machine-specific path - adjust for your machine.
DATA_DIR = '/home/jvita/scripts/colabfit/data/gubaev/AlNiTi'

# Both training stages are loaded with identical settings; only the file stem
# differs, and that stem is also used as the default configuration name.
configurations = []
for stage_name in ('train_1st_stage', 'train_2nd_stage'):
    configurations.extend(load_data(
        file_path=f'{DATA_DIR}/{stage_name}.cfg',
        file_format='cfg',
        name_field=None,
        elements=['Al', 'Ni', 'Ti'],
        default_name=stage_name,
        verbose=True,
    ))

Loading data: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2393/2393 [00:00<00:00, 13279.17it/s]
Loading data: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 976/976 [00:00<00:00, 12402.12it/s]


In [3]:
def _optional_float_field(extent, description):
    """Build the spec of a non-required, unit-bearing float field."""
    return {
        'type': 'float',
        'has-unit': True,
        'extent': extent,
        'required': False,
        'description': description,
    }


# Definition of the 'energy-forces-stress' property: three optional float
# fields that differ only in their extent and description.
efs_definition = {
    'property-id': 'energy-forces-stress',
    'property-title': 'A default property for storing energies, forces, and stress',
    'property-description': 'Energies and forces computed using DFT',

    'energy': _optional_float_field([], 'Cohesive energy'),
    'forces': _optional_float_field([':', 3], 'Atomic forces'),
    'stress': _optional_float_field([':', 3], 'Stress'),
}

client.insert_property_definition(efs_definition)



In [4]:
# For each field of the 'energy-forces-stress' property: the ASE field name it
# is read from and the units string attached to it.
# NOTE(review): 'stress' is sourced from the 'virial' field with units 'GPa' -
# confirm the units match what the input files actually store.
property_map = {
    'energy-forces-stress': {
        colabfit_name: {'field': ase_field, 'units': units}
        for colabfit_name, ase_field, units in (
            ('energy', 'energy', 'eV'),
            ('forces', 'forces', 'eV/Ang'),
            ('stress', 'virial', 'GPa'),
        )
    }
}

In [5]:
# Settings object passed to insert_data for the 'energy-forces-stress' property.
pso_kwargs = dict(
    method='VASP',
    description='energies/forces/stresses',
    files=None,
    labels=None,
)
pso = PropertySettings(**pso_kwargs)

In [6]:
# Insert all loaded configurations and collect what insert_data yields.
ids = [
    id_pair
    for id_pair in client.insert_data(
        configurations,
        property_map=property_map,
        property_settings={'energy-forces-stress': pso},
        generator=False,
        verbose=True,
    )
]

# Transpose the collected entries into two parallel tuples; each entry is
# expected to be a (configuration ID, property ID) pair.
all_co_ids, all_pr_ids = zip(*ids)

Adding configurations to Database: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 3369/3369 [00:02<00:00, 1493.86it/s]


In [7]:
# Number of distinct configuration IDs (repeated IDs collapse in the set).
len({*all_co_ids})

2666

In [8]:
# Number of distinct property IDs (repeated IDs collapse in the set).
len({*all_pr_ids})

2684

In [9]:
# Regex applied to configuration names -> description of the configuration set
# built from the matching configurations. '.*' matches every name, so the
# first set contains all inserted configurations.
cs_regexes = {
    '.*': (
        'Configurations generated using active learning by iteratively '
        'fitting a MTP model, identifying configurations that required the '
        'MTP to extrapolate, re-computing the energies/forces/structures of '
        'those configurations with DFT, then retraining the MTP model.'
    ),
    'train_1st_stage': 'Configurations used in the first stage of training',
    'train_2nd_stage': 'Configurations used in the second stage of training',
}

cs_ids = []

for i, regex in enumerate(cs_regexes):
    desc = cs_regexes[regex]

    # Restrict to the configurations inserted by this notebook whose names
    # match the current regex.
    name_query = {'_id': {'$in': all_co_ids}, 'names': {'$regex': regex}}
    co_ids = client.get_data(
        'configurations',
        fields='_id',
        query=name_query,
        ravel=True,
    ).tolist()

    print(f'Configuration set {i}', f'({regex}):'.rjust(22), f'{len(co_ids)}'.rjust(7))

    cs_id = client.insert_configuration_set(co_ids, description=desc, verbose=True)

    cs_ids += [cs_id]

Configuration set 0                  (.*):    2666


Aggregating configuration info: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 2666/2666 [00:00<00:00, 10273.51it/s]


Configuration set 1     (train_1st_stage):    1706


Aggregating configuration info: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 1706/1706 [00:00<00:00, 9259.76it/s]


Configuration set 2     (train_2nd_stage):     963


Aggregating configuration info: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 963/963 [00:00<00:00, 12886.90it/s]


In [10]:
# Metadata for the dataset record, pulled out of the call for readability.
ds_authors = ['K. Gubaev', 'E. V. Podryabinkin', 'G. L. W. Hart', 'A. V. Shapeev']

ds_links = [
    'https://www.sciencedirect.com/science/article/pii/S0927025618306372?via%3Dihub',
    'https://gitlab.com/kgubaev/accelerating-high-throughput-searches-for-new-alloys-with-active-learning-data',
]

ds_description = (
    'This dataset was generated using the following active '
    'learning scheme: 1) candidate structures relaxed by a partially-trained '
    'MTP model, 2) structures for which the MTP had to perform extrapolation '
    'are passed to DFT to be re-computed, 3) the MTP is retrained included '
    'the structures that were re-computed with DFT, 4) steps 1-3 are repeated '
    'until the MTP does not extrapolate on any of the original candidate '
    'structures. The original candidate structures for this dataset included '
    'about 375,000 binary and ternary structures enumerating all possible '
    'unit cells with different symmetries (BCC, FCC, and HCP) and different '
    'number of atoms'
)

# Create the dataset from the configuration sets built above and every
# property ID returned by the insert.
ds_id = client.insert_dataset(
    cs_ids=cs_ids,
    pr_ids=all_pr_ids,
    name='AlNiTi_CMS2019',
    authors=ds_authors,
    links=ds_links,
    description=ds_description,
    resync=True,
    verbose=True,
)

Aggregating configuration info: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 2666/2666 [00:00<00:00, 9782.26it/s]
Aggregating property info:  80%|█████████████████████████████████████████████████████████████████████████████▎                   | 2684/3369 [00:00<00:00, 25903.45it/s]
Updating PR->DS relationships: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 3369/3369 [00:00<00:00, 187039.01it/s]


In [11]:
# Apply the 'active_learning' label to every configuration whose ID was
# returned by the insert.
label_query = {'_id': {'$in': all_co_ids}}

client.apply_labels(
    dataset_id=ds_id,
    collection_name='configurations',
    query=label_query,
    labels='active_learning',
    verbose=True,
)

Applying configuration labels: 2666it [00:01, 1987.40it/s]


In [12]:
# Fetch the dataset record (with resync) and keep its 'dataset' entry.
fetched = client.get_dataset(ds_id, resync=True, verbose=True)
dataset = fetched['dataset']

# Print each aggregated-info entry as "key value", one per line.
for entry in dataset.aggregated_info.items():
    print(*entry)

Aggregating configuration info: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 2666/2666 [00:00<00:00, 10932.58it/s]
Aggregating configuration info: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 1706/1706 [00:00<00:00, 14650.87it/s]
Aggregating configuration info: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 963/963 [00:00<00:00, 10140.81it/s]
Aggregating property info: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 2684/2684 [00:00<00:00, 46168.94it/s]

nconfigurations 2666
nsites 24851
nelements 3
chemical_systems ['AlNi', 'Ti', 'Ni', 'AlNiTi', 'NiTi', 'Al', 'AlTi']
elements ['Ni', 'Ti', 'Al']
individual_elements_ratios {'Ni': [0.8, 0.5, 0.67, 0.62, 0.29, 0.43, 0.4, 1.0, 0.75, 0.38, 0.25, 0.42, 0.92, 0.58, 0.18, 0.6, 0.93, 0.11, 0.27, 0.44, 0.91, 0.36, 0.86, 0.04, 0.7, 0.78, 0.2, 0.45, 0.12, 0.17, 0.1, 0.22, 0.71, 0.09, 0.21, 0.79, 0.54, 0.96, 0.88, 0.46, 0.3, 0.55, 0.14, 0.89, 0.64, 0.08, 0.56, 0.73, 0.57, 0.82, 0.9, 0.07, 0.24, 0.41, 0.83, 0.33], 'Ti': [0.2, 0.67, 0.12, 0.71, 0.73, 0.5, 0.75, 0.64, 0.33, 0.38, 0.25, 0.83, 0.92, 0.59, 0.42, 1.0, 0.43, 0.18, 0.6, 0.93, 0.11, 0.94, 0.27, 0.44, 0.91, 0.86, 0.36, 0.04, 0.7, 0.78, 0.45, 0.62, 0.29, 0.17, 0.1, 0.09, 0.22, 0.21, 0.46, 0.79, 0.96, 0.88, 0.3, 0.8, 0.55, 0.89, 0.14, 0.06, 0.08, 0.56, 0.57, 0.4, 0.82, 0.9, 0.07, 0.58], 'Al': [0.5, 0.38, 0.33, 0.57, 0.27, 0.25, 0.4, 0.36, 0.67, 0.62, 0.75, 0.42, 0.92, 0.83, 1.0, 0.18, 0.43, 0.6, 0.93, 0.11, 0.94, 0.44, 0.91, 0.86, 0.78, 0.7, 0.




In [13]:
# Display the 'property_fields' entry of the dataset's aggregated info
# (shown as this cell's output).
dataset.aggregated_info['property_fields']

['energy-forces-stress.energy',
 'energy-forces-stress.forces',
 'energy-forces-stress.stress']

In [14]:
# Plot histograms of the dataset's property fields, restricted to the
# properties that belong to this dataset.
hist_fields = dataset.aggregated_info['property_fields']
client.plot_histograms(hist_fields, ids=dataset.property_ids)

In [15]:
# Export the dataset summary as README.md into a folder named after the dataset.
# NOTE(review): absolute, machine-specific output location - adjust before re-running.
export_folder = '/home/jvita/scripts/colabfit-tools/colabfit/examples/' + dataset.name

client.dataset_to_markdown(
    ds_id=ds_id,
    base_folder=export_folder,
    html_file_name='README.md',
    data_format='mongo',
    data_file_name=None,
    yscale='log',
)

In [16]:
def _energy_per_atom(f, doc):
    """Divide the field value `f` by the site count of the document's configuration."""
    return f / doc['configuration']['nsites']


# Convert to per-atom energies.
# NOTE(review): this updates the stored property values; re-running the cell
# would presumably divide by nsites a second time - confirm before re-executing.
client.apply_transformation(
    dataset_id=ds_id,
    property_ids=all_pr_ids,
    update_map={'energy-forces-stress.energy': _energy_per_atom},
    configuration_ids=all_co_ids,
)


Modified 2684 properties



<pymongo.results.BulkWriteResult at 0x7f58729ec580>

In [17]:
# Plot the histograms again, now that the energy transformation has been applied.
# NOTE(review): `dataset` was fetched before the transformation; only its field
# names and property IDs are used here - confirm they are still current.
hist_fields = dataset.aggregated_info['property_fields']
client.plot_histograms(hist_fields, ids=dataset.property_ids)