In [1]:
import sys

if "google.colab" in sys.modules:
    # Mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    dataset_path = '/content/drive/My Drive/dataset'

    # Install required packages
    !pip install pymatgen

else:
    dataset_path = 'dataset'

In [2]:
from pathlib import Path
import ast
import numpy as np
import pandas as pd
from pymatgen.core import Structure, PeriodicSite, DummySpecie
from pymatgen.analysis.local_env import MinimumDistanceNN


In [4]:
# Reference (pristine) structure: read the GaSe unit cell and expand it to a
# 6 x 6 x 1 supercell.
# NOTE(review): `reference_structure` is whatever `make_supercell` returns;
# depending on the pymatgen version this may be the same (mutated) object as
# `ref_unit_cell` - confirm before reusing `ref_unit_cell` as a unit cell.
ref_file_path = Path(f"{dataset_path}/high_GaSe/GaSe.cif")
ref_unit_cell = Structure.from_file(ref_file_path)
reference_structure = ref_unit_cell.make_supercell([6,6,1])

# Defective structure: one example CIF from the high_GaSe set.
defective_file_path = Path(f"{dataset_path}/high_GaSe/cifs/GaSe_Ga72Se69S1_90b1b09f-acf0-46e6-8173-a2e71c884054.cif")
defective_structure = Structure.from_file(defective_file_path)





In [4]:
def struct_to_dict(structure, decimals=3):
    """Map every site of ``structure`` to its rounded fractional coordinates.

    The coordinates are wrapped into the half-open interval [0, 1) before and
    after rounding, because fractional coordinates are periodic: a value that
    rounds to 1.0 (e.g. 0.9999) denotes the same lattice position as 0.0.
    Without the wrap such a site would never compare equal to its counterpart
    stored as 0.0 in another structure.

    Parameters
    ----------
    structure : object exposing ``sites`` and ``frac_coords``
        ``frac_coords`` must be array-like with one row per entry of ``sites``.
    decimals : int, optional
        Number of decimals kept when rounding (default 3).

    Returns
    -------
    dict
        ``{site: rounded_frac_coords_row}`` in the order of ``structure.sites``.
    """
    wrapped = np.mod(structure.frac_coords, 1.0)
    rounded = np.round(wrapped, decimals)
    # Rounding can push a value such as 0.9999 up to exactly 1.0; fold it back to 0.0.
    rounded[rounded >= 1.0] = 0.0
    return dict(zip(structure.sites, rounded))

def get_index(ref_struct, a_site):
    """Return the position in ``ref_struct.sites`` of the first site whose
    ``coords`` are exactly equal (``np.array_equal``) to ``a_site.coords``.

    Returns ``None`` when no site matches.
    """
    matching_positions = (
        position
        for position, candidate in enumerate(ref_struct.sites)
        if np.array_equal(candidate.coords, a_site.coords)
    )
    return next(matching_positions, None)


def get_defects_structure(defective_struct, reference_struct):
    """Build a structure that contains only the defect sites of ``defective_struct``.

    Every site of ``reference_struct`` is compared, by rounded fractional
    coordinates (see ``struct_to_dict``), with the sites of ``defective_struct``:

    * same position, different species -> substitution; the site of the
      defective structure is kept.
    * no site at that position           -> vacancy; a ``DummySpecie`` site is
      created at the reference position, on the defective structure's lattice.

    Each defect site carries these properties:
    ``original_an`` (Z in the reference), ``new_an`` (Z in the defective
    structure, 0 for a vacancy), ``an_change`` (``new_an - original_an``),
    ``vacancy_defect`` / ``substitution_defect`` (1.0 / 0.0 flags) and
    ``bonds_broken`` (0.0 for substitutions; for vacancies the coordination
    number of the removed site in the reference, from ``MinimumDistanceNN``).

    Only reference positions are scanned, so sites of ``defective_struct`` that
    have no counterpart in ``reference_struct`` are not reported.

    Parameters
    ----------
    defective_struct : Structure
        Structure containing the defects. It is copied; the input is not modified.
    reference_struct : Structure
        Pristine structure (same cell) to compare against.

    Returns
    -------
    Structure
        Structure made of the defect sites only, in reference-site order.
        NOTE(review): when no defect is found the site list is empty; confirm
        how ``Structure.from_sites`` behaves for an empty list.
    """
    copy_defective_struct = defective_struct.copy()
    mindnn = MinimumDistanceNN()

    # Site -> rounded fractional coordinates for both structures
    defective_dict = struct_to_dict(copy_defective_struct)
    reference_dict = struct_to_dict(reference_struct)

    # Vacancy sites are created on the lattice of the defective structure
    structure_lattice = copy_defective_struct.lattice

    # Index the defective sites by their rounded coordinates so that each
    # reference site is resolved with one dictionary lookup instead of a scan
    # over every defective site.
    defective_by_coords = {}
    for def_site, def_coords in defective_dict.items():
        coords_key = tuple(np.asarray(def_coords).tolist())
        defective_by_coords.setdefault(coords_key, []).append(def_site)

    # All defect sites, with their properties already attached
    defects_list = []

    for ref_site, ref_coords in reference_dict.items():
        coords_key = tuple(np.asarray(ref_coords).tolist())
        matching_sites = defective_by_coords.get(coords_key, [])

        for def_site in matching_sites:
            if ref_site.specie != def_site.specie:  # Substitution case
                # The site belongs to the private copy, so updating it in
                # place leaves the caller's structure untouched.
                def_site.properties.update({
                    "original_an": ref_site.specie.Z,
                    "new_an": def_site.specie.Z,
                    "an_change": def_site.specie.Z - ref_site.specie.Z,
                    "vacancy_defect": 0.0,
                    "substitution_defect": 1.0,
                    "bonds_broken": 0.0,
                })
                defects_list.append(def_site)

        if not matching_sites:  # Vacancy case
            vacancy_properties = {
                "original_an": ref_site.specie.Z,
                "new_an": 0,
                "an_change": 0 - ref_site.specie.Z,
                "vacancy_defect": 1.0,
                "substitution_defect": 0.0,
                # Coordination number of the removed atom in the pristine structure
                "bonds_broken": mindnn.get_cn(reference_struct, get_index(reference_struct, ref_site)),
            }
            vacant_site = PeriodicSite(
                species=DummySpecie(),
                coords=ref_coords,
                coords_are_cartesian=False,
                lattice=structure_lattice,
                properties=vacancy_properties,
            )
            defects_list.append(vacant_site)

    # The per-site properties set above are carried over by Structure.from_sites
    defects_struct = Structure.from_sites(defects_list)

    return defects_struct

# Defect-only structure for the example loaded above. This assignment must run:
# a later cell passes `defects_structure` to get_nodes_edges, and with the call
# commented out a fresh "Restart & Run All" stops there with a NameError.
defects_structure = get_defects_structure(defective_structure, reference_structure)
print(defects_structure)

In [None]:
# Walk over every row of the combined high_GaSe table and print the
# defect-only structure derived from the CIF that the row points to.
the_csv = pd.read_csv(f"{dataset_path}/combined/high_GaSe.csv")
for i in range(len(the_csv)):
    # Defective structure: <dataset_path>/<dataset_material>/cifs/<_id>.cif
    defective_file_path = Path(f"{dataset_path}/{the_csv.loc[i, 'dataset_material']}/cifs/{the_csv.loc[i, '_id']}.cif")
    defective_structure = Structure.from_file(defective_file_path)

    # Pristine unit cell: <dataset_path>/<dataset_material>/<base>.cif,
    # expanded by the scaling stored as text in the `cell` column.
    ref_file_path = Path(f"{dataset_path}/{the_csv.loc[i, 'dataset_material']}/{the_csv.loc[i, 'base']}.cif")
    ref_unit_cell = Structure.from_file(ref_file_path)
    supercell_scaling = ast.literal_eval(str(the_csv.loc[i, 'cell']))
    reference_structure = ref_unit_cell.make_supercell(supercell_scaling)

    def_struct = get_defects_structure(defective_structure, reference_structure)
    print(def_struct)
    

In [50]:
def get_nodes_edges(structure):
    """Turn a structure of defect sites into graph nodes, edges and edge features.

    Node features, one list per site, in this order:
        [site index, bonds_broken, original_an (Z before the defect),
         new_an (Z after the defect), an_change,
         vacancy_defect (1 for yes, 0 for no),
         substitution_defect (1 for yes, 0 for no)]
    All values except the index are read from ``site.properties``.

    Edges connect every ordered pair of distinct sites ``[i, j]``; the single
    feature of an edge is ``site_i.distance(site_j)``.

    Returns
    -------
    tuple
        ``(nodes, edges, edge_features)`` - three lists; ``edges`` and
        ``edge_features`` are aligned element by element.
    """
    all_sites = structure.sites

    # Property names in the order they appear in a node feature vector
    property_order = ("bonds_broken", "original_an", "new_an",
                      "an_change", "vacancy_defect", "substitution_defect")
    nodes = [
        [position] + [site.properties[name] for name in property_order]
        for position, site in enumerate(all_sites)
    ]

    # Every ordered pair of different sites (fully connected, both directions)
    ordered_pairs = [
        (src, src_site, dst, dst_site)
        for src, src_site in enumerate(all_sites)
        for dst, dst_site in enumerate(all_sites)
        if src != dst
    ]
    edges = [[src, dst] for src, _, dst, _ in ordered_pairs]
    edge_features = [[src_site.distance(dst_site)] for _, src_site, _, dst_site in ordered_pairs]

    return nodes, edges, edge_features

# Build the graph pieces for the example defect structure and show them.
# NOTE: `defects_structure` is not assigned in this cell; it has to be defined
# by a cell that ran earlier.
the_nodes, the_edges, the_edge_features = get_nodes_edges(defects_structure)
print(the_nodes)
print(the_edges)
print(the_edge_features)

[[0, 0.0, 34, 16, -18, 0.0, 1.0], [1, 3, 34, 0, -34, 1.0, 0.0], [2, 3, 34, 0, -34, 1.0, 0.0]]
[[0, 1], [0, 2], [1, 0], [1, 2], [2, 0], [2, 1]]
[[np.float64(13.218612878118082)], [np.float64(6.143837688715257)], [np.float64(13.218612878118082)], [np.float64(11.185157478479823)], [np.float64(6.143837688715257)], [np.float64(11.185157478479823)]]


In [None]:
def get_gf_tf(df, cif_id, global_attributes=None, target_attribute="band_gap_value"):
    """Collect the global features and the target value of one structure.

    The row is selected by position (first row whose ``_id`` equals
    ``cif_id``), so the result is a list of scalars even when the dataframe
    index contains repeated labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with an ``_id`` column plus the requested attribute columns.
    cif_id : str
        Value to look up in the ``_id`` column.
    global_attributes : sequence of str, optional
        Columns returned as global features. Defaults to energy, fermi_level,
        total_mag, formation_energy, energy_per_atom, E_1, vacancy_sites,
        substitution_sites and defect_sites.
    target_attribute : str, optional
        Column returned as the target (default ``"band_gap_value"``).

    Returns
    -------
    tuple
        ``(gf, tf)``: ``gf`` is the list of global feature values in the order
        of ``global_attributes``; ``tf`` is a one-element list with the target.

    Raises
    ------
    IndexError
        If no row has ``_id == cif_id``.
    """
    if global_attributes is None:
        # The global attributes
        global_attributes = ["energy","fermi_level","total_mag","formation_energy",
                             "energy_per_atom","E_1","vacancy_sites",
                             "substitution_sites","defect_sites"
                             ]

    # Position (not label) of the first row that carries this id
    id_matches = (df["_id"] == cif_id).to_numpy()
    if not id_matches.any():
        raise IndexError(f"No row with _id == {cif_id!r} found in the dataframe")
    row_position = int(id_matches.argmax())

    # Use the position to get the values of the global attributes
    gf = [df[name].iloc[row_position] for name in global_attributes]

    # Use the position to get the value of the target attribute
    tf = [df[target_attribute].iloc[row_position]]

    return gf,tf

# Look up the global features and the target for one example structure id.
# NOTE(review): `dataset_df` is not assigned in any cell visible here (the CSV
# loaded earlier is named `the_csv`), so on a fresh kernel this cell raises a
# NameError unless another cell defines it - confirm which table is meant.
# an_id = dataset_df['_id'][an_index]
an_id = "GaSe_Ga72Se69S1_90b1b09f-acf0-46e6-8173-a2e71c884054"
the_gf, the_tf = get_gf_tf(dataset_df, an_id)
print(the_gf, the_tf)

`nodes` = node features (matrix with one row per node, i.e. per defect site)

`edges` = connectivity (ordered pairs `[i, j]` of node indices, one per directed edge)

`edge_features` = edge features (distance between the two nodes of each edge)

`y` = target (band gap; returned as `tf` by `get_gf_tf`)

`u` = global features (formation energy, total_mag, etc.; returned as `gf` by `get_gf_tf`)