In [1]:
from colabfit.tools.database import MongoDatabase, load_data
from colabfit.tools.property_settings import PropertySettings

# Connect to the ColabFit database used by every cell below, using a single process.
DATABASE_NAME = 'colabfit_database'
client = MongoDatabase(DATABASE_NAME, nprocs=1)

In [4]:
# Identifier of the OpenKIM property definition to register with the database.
bulk_modulus_property_tag = (
    'tag:staff@noreply.openkim.org,2014-04-15:property/'
    'bulk-modulus-isothermal-cubic-crystal-npt'
)
client.insert_property_definition(bulk_modulus_property_tag)

# Verifying dataset correctness

In [2]:
# Distinct values of the `name` field across the `datasets` collection.
dataset_names = client.datasets.distinct('name')
dataset_names

['Mo_PRM2019']

In [3]:
# Find the dataset document by name, keep its ID, then load the dataset through the client.
DATASET_NAME = 'Mo_PRM2019'
dataset_doc = client.datasets.find_one({'name': DATASET_NAME})
ds_id = dataset_doc['_id']
dataset = client.get_dataset(ds_id)['dataset']
(dataset.name, ds_id)

('Mo_PRM2019', '2669550080465425079')

In [4]:
# Export the dataset under /tmp: a file named README plus a data file named after the dataset.
first_export_options = {
    'ds_id': ds_id,
    'base_folder': '/tmp',
    'html_file_name': 'README',
    'data_file_name': dataset.name + '.xyz',
    'data_format': 'mongo',
    'yscale': 'log',
}
client.dataset_to_markdown(**first_export_options)

In [5]:
# Read the dataset back from the README written under /tmp.
# NOTE: this rebinds `dataset`; the cells below index the result with
# 'dataset' and '_id' instead of using it as the dataset object directly.
readme_path = '/tmp/README'
dataset = client.dataset_from_markdown(html_file_path=readme_path, verbose=True)

Aggregating configuration info: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 3785/3785 [00:00<00:00, 12803.98it/s]
Aggregating configuration info: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:00<00:00, 10702.79it/s]
Aggregating configuration info: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:00<00:00, 10655.58it/s]
Aggregating configuration info: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 210/210 [00:00<00:00, 4688.89it/s]
Aggregating configuration info: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 18036.14it/s]
Aggregating configuration info: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 547/547 [00:00<00:00, 

In [9]:
# Name of the dataset object stored under the 'dataset' key.
reloaded_name = dataset['dataset'].name
reloaded_name

'Mo_PRM2019'

In [10]:
# Export the re-loaded dataset again, with '-2' suffixes so the first export is not overwritten.
second_export_options = {
    'ds_id': dataset['_id'],
    'base_folder': '/tmp',
    'html_file_name': 'README-2',
    'data_file_name': dataset['dataset'].name + '-2.xyz',
    'data_format': 'mongo',
    'yscale': 'log',
}
client.dataset_to_markdown(**second_export_options)

In [6]:
# `dataset` is rebound earlier in the notebook to the result of dataset_from_markdown,
# which other cells index with ['dataset'] / ['_id']. Attribute access on that wrapper
# fails when the notebook is run top to bottom, so unwrap the dataset object first.
# If `dataset` is still the plain dataset object, it is used unchanged.
dataset_obj = dataset['dataset'] if isinstance(dataset, dict) else dataset

# Plot histograms of the dataset's aggregated property fields for its own properties.
client.plot_histograms(
    dataset_obj.aggregated_info['property_fields'],
    ids=dataset_obj.property_ids,
)

# Number of configurations

In [6]:
# An empty filter matches every document in the `configurations` collection.
n_configurations = client.configurations.count_documents({})
n_configurations

3785

# Number of properties

In [7]:
# An empty filter matches every document in the `properties` collection.
n_properties = client.properties.count_documents({})
n_properties

3785

# Number of datasets

In [8]:
# An empty filter matches every document in the `datasets` collection.
n_datasets = client.datasets.count_documents({})
n_datasets

1

# Get configurations

In [9]:
# Fetch all configurations with their properties attached, as an indexable
# collection rather than a generator, and display the first one.
configs = client.get_configurations('all', attach_properties=True, generator=False)
first_config = configs[0]
first_config

Configuration(symbols='Mo', pbc=True, cell=[[3.16433940431, 0.0, 0.0], [0.0130055598383, 3.1378626146, 0.0], [1.64561579728, 1.4436541415, 1.63718875269]], energy=..., forces=...)

In [11]:
# Metadata dictionary carried by the first configuration.
first_config_info = configs[0].info
first_config_info

{'_name': ['slice_sample'],
 '_labels': ['bcc', 'strain'],
 '_constraints': set(),
 '_id': '8823147868664253613',
 'stress': array([[ 3.10459376,  0.78835593,  3.54358143],
        [ 0.78835593,  1.45680226, -7.42855644],
        [ 3.54358143, -7.42855644, 11.9741297 ]])}

In [None]:
# Join each property document to the configuration(s) it references:
#   1. $unwind emits one document per entry of relationships.configurations
#   2. $lookup attaches the matching `configurations` documents as `linked_co`
# No $match stage is applied, so every property is included. To restrict the join
# to a set of configuration IDs, insert a $match with {'$in': ...} on
# relationships.configurations between the two stages.
unwind_configuration_ids = {'$unwind': '$relationships.configurations'}
attach_linked_configurations = {
    '$lookup': {
        'from': 'configurations',
        'localField': 'relationships.configurations',
        'foreignField': '_id',
        'as': 'linked_co',
    }
}

cursor = client.properties.aggregate([
    unwind_configuration_ids,
    attach_linked_configurations,
])

# Peek at the first joined document (this consumes it from the cursor).
next(cursor)

In [None]:
# Count the documents still left in the cursor. The document already taken by the
# preceding next(cursor) call is not included, and the cursor is exhausted afterwards.
sum(1 for _remaining_doc in cursor)

## Names of datasets

In [None]:
# Fetch only the `name` field of every dataset and sort case-insensitively by it.
dataset_name_docs = client.datasets.find({}, {'name'})
sorted(dataset_name_docs, key=lambda doc: doc['name'].lower())

# Number of configuration sets

In [None]:
# An empty filter matches every document in the `configuration_sets` collection.
n_configuration_sets = client.configuration_sets.count_documents({})
n_configuration_sets

## All configuration sets, and their linked datasets

In [None]:
# For every (configuration set, dataset ID) link, look up the dataset document
# and keep only its name(s) as `ds_name`.
dataset_link_pipeline = [
    {'$project': {'relationships.datasets': 1}},
    # One output document per linked dataset ID.
    {'$unwind': '$relationships.datasets'},
    {'$project': {'ds_id': '$relationships.datasets'}},
    {'$lookup': {
        'from': 'datasets',
        'localField': 'ds_id',
        'foreignField': '_id',
        'as': 'linked_ds'
    }},
    {'$project': {'ds_name': '$linked_ds.name'}},
]
cursor = client.configuration_sets.aggregate(dataset_link_pipeline)

# $lookup produces an empty array when no dataset matches the ID, which makes
# `ds_name` empty; fall back to '' so such entries sort first instead of raising
# IndexError on `[0]`.
sorted(cursor, key=lambda doc: (doc.get('ds_name') or [''])[0].lower())

## Configuration sets that are tied to more than one dataset

In [None]:
# An element at index 1 of relationships.datasets exists only when the
# configuration set is linked to at least two datasets.
multi_dataset_filter = {'relationships.datasets.1': {'$exists': True}}
client.configuration_sets.count_documents(multi_dataset_filter)

In [None]:
# Same dataset-name lookup as for all configuration sets, restricted to the
# sets whose relationships.datasets array has a second element.
keep_multi_dataset_sets = {'$match': {'relationships.datasets.1': {'$exists': True}}}
join_dataset_documents = {
    '$lookup': {
        'from': 'datasets',
        'localField': 'ds_id',
        'foreignField': '_id',
        'as': 'linked_ds',
    }
}

cursor = client.configuration_sets.aggregate([
    keep_multi_dataset_sets,
    {'$project': {'relationships.datasets': 1}},
    {'$unwind': '$relationships.datasets'},
    {'$project': {'ds_id': '$relationships.datasets'}},
    join_dataset_documents,
    {'$project': {'ds_name': '$linked_ds.name'}},
])

list(cursor)

# Total number of atoms

In [None]:
# Group every configuration into a single bucket and add up the `nsites` field.
total_sites_pipeline = [
    {'$group': {'_id': None, 'sum': {'$sum': '$nsites'}}},
]
site_totals = client.configurations.aggregate(total_sites_pipeline)
next(site_totals)

# Total number of configuration labels

In [None]:
# Collect the `labels` array of every configuration into one list of arrays,
# then fold those arrays together with $setUnion to get the unique labels.
# (A $match on "_id" could be placed first to limit the configurations considered.)
collect_label_arrays = {
    "$group": {"_id": None, "labels": {"$push": "$labels"}}
}
merge_into_unique_set = {
    "$project": {
        "labels": {
            "$reduce": {
                "input": "$labels",
                "initialValue": [],
                "in": {"$setUnion": ["$$value", "$$this"]},
            }
        }
    }
}

cursor = client.configurations.aggregate([
    collect_label_arrays,
    merge_into_unique_set,
])

labels = next(cursor)['labels']
len(labels)

## Counts for each label

In [None]:
# One document per (configuration, label) pair, then count the pairs per label.
label_count_pipeline = [
    {'$unwind': '$labels'},
    {'$group': {'_id': '$labels', 'count': {'$sum': 1}}},
]
cursor = client.configurations.aggregate(label_count_pipeline)

# Most frequent labels first.
sorted(cursor, key=lambda label_doc: label_doc['count'], reverse=True)

# Property distributions

In [None]:
# Show one arbitrary property document to inspect its structure.
example_property = client.properties.find_one({})
example_property

In [None]:
# Distinct values of the `type` field across all property documents.
property_types = client.properties.distinct('type')
property_types

In [None]:
# Energy fields of the two property types that are queried from the `properties` collection.
energy_fields = [
    'energy-forces-stress.energy',
    'energy-forces-virial.energy',
]
all_energies = client.get_data('properties', energy_fields, ravel=True, verbose=True)

In [None]:
import matplotlib.pyplot as plt

# Overlay one 100-bin histogram per energy field on a log-scaled count axis.
fig, ax = plt.subplots()

for field_name, values in all_energies.items():
    # Semi-transparent bars keep every series visible where they overlap.
    _ = ax.hist(values, bins=100, label=field_name, alpha=0.6)

# Label the figure so it can be read without the surrounding code.
ax.set_xlabel('energy')
ax.set_ylabel('count')
ax.set_title('Energy distribution')
ax.legend()
ax.set_yscale('log')

In [None]:
# Force fields of the two property types that are queried from the `properties` collection.
force_fields = [
    'energy-forces-stress.forces',
    'energy-forces-virial.forces',
]
all_forces = client.get_data('properties', force_fields, ravel=True, verbose=True)

In [None]:
import matplotlib.pyplot as plt

# Overlay one 100-bin histogram per force field on a log-scaled count axis.
fig, ax = plt.subplots()

for field_name, values in all_forces.items():
    # Semi-transparent bars keep every series visible where they overlap.
    _ = ax.hist(values, bins=100, label=field_name, alpha=0.6)

# Label the figure so it can be read without the surrounding code.
ax.set_xlabel('forces')
ax.set_ylabel('count')
ax.set_title('Force distribution')
ax.legend()
ax.set_yscale('log')

In [None]:
# Stress and virial fields that are queried from the `properties` collection.
stress_fields = [
    'energy-forces-stress.stress',
    'energy-forces-virial.virial',
]
all_stresses = client.get_data('properties', stress_fields, ravel=True, verbose=True)

In [None]:
import matplotlib.pyplot as plt

# Overlay one 100-bin histogram per stress/virial field on a log-scaled count axis.
fig, ax = plt.subplots()

for field_name, values in all_stresses.items():
    # Semi-transparent bars keep every series visible where they overlap.
    _ = ax.hist(values, bins=100, label=field_name, alpha=0.6)

# Label the figure so it can be read without the surrounding code.
ax.set_xlabel('stress / virial')
ax.set_ylabel('count')
ax.set_title('Stress and virial distribution')
ax.legend()
ax.set_yscale('log')

In [None]:
# Distinct source-unit values recorded for the stress field.
stress_source_units = client.properties.distinct('energy-forces-stress.stress.source-unit')
stress_source_units

In [None]:
# Distinct source-unit values recorded for the virial field.
virial_source_units = client.properties.distinct('energy-forces-virial.virial.source-unit')
virial_source_units

In [None]:
# Look up the dataset document whose name is 'WBe_PRB2019'.
wbe_dataset_doc = client.datasets.find_one({'name': 'WBe_PRB2019'})
wbe_dataset_doc

In [None]:
# Fetch one property linked to the given dataset ID.
# NOTE(review): the hard-coded ID is presumably the `_id` of the dataset looked up
# in the previous cell - confirm before relying on it.
target_dataset_id = '-6889328512840717042'
client.properties.find_one({'relationships.datasets': target_dataset_id})