In [21]:
import itertools
import collections
import random
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import tqdm
import pymatgen.core
import pymatgen.io.ase
import pymatgen.analysis.dimensionality
from pymatgen.analysis.local_env import JmolNN


RANDOM_SEED = 1234

pd.options.mode.chained_assignment = None
tqdm.tqdm.pandas()
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [2]:
raw_df = pd.read_pickle("../DigitalEcosystem/raw_data/2d_mat_dataset_raw.pkl")
cols_to_keep = ["bandgap (eV)", "atoms_object (unitless)"]
df = raw_df[cols_to_keep]

In [3]:
df["pymatgen_structure (unitless)"] = df["atoms_object (unitless)"].apply(pymatgen.io.ase.AseAtomsAdaptor.get_structure)

In [20]:
symbols_cols = collections.Counter()
bond_cols = collections.Counter()
angle_cols = collections.Counter()

neighbor_finder = JmolNN()

with tqdm.tqdm(total=len(df)) as pbar:
    for struct in df["pymatgen_structure (unitless)"]:
        symbols_cols.update(struct.symbol_set)
        
        for index, site in enumerate(struct.sites):
            connected = [i['site'] for i in neighbor_finder.get_nn_shell_info(struct, index, 1)]
            
            # Bond counts
            for vertex in connected:
                start, end = sorted([site.specie, vertex.specie])
                bond = f"{start}-{end}"
                bond_cols[bond] += 0.5
                
            # Angles
            for angle_start, angle_end in itertools.combinations(connected,2):
                angle = f"{angle_start.specie}-{site.specie}-{angle_end.specie}"
                angle_cols[angle] += 1
        pbar.update(1)

100%|██████████| 6351/6351 [44:22<00:00,  2.39it/s]  


In [26]:
for filename, obj in (("symbols.pkl", symbols_cols),
                      ("bonds.pkl", bond_cols),
                      ("angles.pkl", angle_cols)):
    with open(filename, "wb") as outp:
        pickle.dump(obj, outp)

In [39]:
print(len(angle_cols))

8728
