In [20]:
import sys

# Choose where the dataset lives depending on the runtime: Google Colab is
# detected by the presence of the "google.colab" module in sys.modules.
if "google.colab" in sys.modules:
    # Mount Google Drive so the dataset folder stored there becomes readable
    from google.colab import drive
    drive.mount('/content/drive')
    dataset_path = '/content/drive/My Drive/dataset'

    # Install pymatgen in the Colab runtime (IPython shell escape, unpinned version)
    !pip install pymatgen

else:
    # Outside Colab, read the dataset from a path relative to the working directory
    dataset_path = 'dataset'

In [21]:
import ast
from collections import Counter

import pandas as pd
from pymatgen.core import Structure

In [22]:
# Dataset to process; the material name is the second "_"-separated token
mat_dataset = "low_WSe2"
parts = mat_dataset.split("_")
the_material = parts[1]

# Tables shared by every dataset
structure_df = pd.read_csv(f"{dataset_path}/initial_structures.csv")
elements_df = pd.read_csv(f"{dataset_path}/elements.csv")

# Tables that belong to the selected dataset
defects_df = pd.read_csv(f"{dataset_path}/{mat_dataset}/defects.csv")
description_df = pd.read_csv(f"{dataset_path}/{mat_dataset}/descriptors.csv")

## Prepare the descriptor df

In [23]:
# Rename the id column so it matches the key used in the defects table, and
# tag every descriptor with its dataset name (the base for later stratification)
description_df = (
    description_df
    .rename(columns={"_id": "descriptor_id"})
    .assign(dataset_material=mat_dataset)
)

description_df.head()

Unnamed: 0,descriptor_id,description,base,cell,pbc,defects,dataset_material
0,619cdd9644389dee486ade34,X1,WSe2,"(8, 8, 1)","[True, True, False]","[{'type': 'vacancy', 'element': 'Se'}]",low_WSe2
1,619cdd9644389dee486ade35,X2,WSe2,"(8, 8, 1)","[True, True, False]","[{'type': 'vacancy', 'element': 'Se'}, {'type'...",low_WSe2
2,619cdd9644389dee486ade36,X3,WSe2,"(8, 8, 1)","[True, True, False]","[{'type': 'substitution', 'from': 'Se', 'to': ...",low_WSe2
3,619cdd9644389dee486ade37,X4,WSe2,"(8, 8, 1)","[True, True, False]","[{'type': 'substitution', 'from': 'Se', 'to': ...",low_WSe2
4,619cdd9644389dee486ade38,X5,WSe2,"(8, 8, 1)","[True, True, False]","[{'type': 'vacancy', 'element': 'Se'}, {'type'...",low_WSe2


## Clearly represent defects

In [24]:
# Clearly represent the defects in the description_df
def string_to_sites(a_column):
    """Count the defects encoded in one cell of the ``defects`` column.

    Parameters
    ----------
    a_column : str
        Text form of a list of defect dicts, for example
        ``"[{'type': 'vacancy', 'element': 'Se'}]"``. A single dict, or
        comma-separated dicts without the surrounding brackets, are accepted
        as well. An empty or whitespace-only string means "no defects".

    Returns
    -------
    dict
        Maps each defect label to the number of times it occurs, in order of
        first appearance. Labels are ``vacant_<element>`` for vacancies,
        ``sub_<from>_<to>`` for substitutions and ``"ubnormal"`` for any
        other defect type.

    Raises
    ------
    ValueError, SyntaxError
        If the text is not a valid Python literal.
    KeyError
        If a defect dict lacks the keys required for its type.
    """
    text = a_column.strip()
    if not text:
        return {}

    # literal_eval only accepts Python literals, so arbitrary expressions
    # stored in the CSV are rejected instead of being executed (as eval would).
    parsed = ast.literal_eval(text)

    # A lone dict is treated as a one-element list of defects
    if isinstance(parsed, dict):
        parsed = [parsed]

    list_of_defects = []
    for defect in parsed:
        defect_type = defect["type"]
        if defect_type == "vacancy":
            list_of_defects.append(f'vacant_{defect["element"]}')
        elif defect_type == "substitution":
            list_of_defects.append(f'sub_{defect["from"]}_{defect["to"]}')
        else:
            # Label kept as-is because it is used as a column name downstream
            list_of_defects.append("ubnormal")

    # defect label -> number of sites carrying that defect
    return dict(Counter(list_of_defects))


def string_to_columns(row):
    """Expand the ``defects`` entry of one row into per-defect count entries.

    Parameters
    ----------
    row : pandas.Series
        One row of the descriptor table; must contain a ``"defects"`` entry
        that ``string_to_sites`` can parse.

    Returns
    -------
    pandas.Series
        A copy of ``row`` with one extra entry per defect label (its count)
        and every missing value replaced by ``0.0``.
    """
    # Work on a copy: pandas does not support functions that mutate the
    # object handed to them by DataFrame.apply.
    expanded = row.copy()

    for defect_label, site_count in string_to_sites(row["defects"]).items():
        expanded[defect_label] = site_count

    return expanded.fillna(0.0)


# Expand the defects of every row into count columns; a defect label that a
# row does not have ends up as 0 after the final fillna
description_df = description_df.apply(string_to_columns, axis=1).fillna(0)
description_df.head()

Unnamed: 0,base,cell,dataset_material,defects,description,descriptor_id,pbc,sub_Se_S,sub_W_Mo,vacant_Se,vacant_W
0,WSe2,"(8, 8, 1)",low_WSe2,"[{'type': 'vacancy', 'element': 'Se'}]",X1,619cdd9644389dee486ade34,"[True, True, False]",0.0,0.0,1.0,0.0
1,WSe2,"(8, 8, 1)",low_WSe2,"[{'type': 'vacancy', 'element': 'Se'}, {'type'...",X2,619cdd9644389dee486ade35,"[True, True, False]",0.0,0.0,2.0,0.0
2,WSe2,"(8, 8, 1)",low_WSe2,"[{'type': 'substitution', 'from': 'Se', 'to': ...",X3,619cdd9644389dee486ade36,"[True, True, False]",1.0,0.0,0.0,0.0
3,WSe2,"(8, 8, 1)",low_WSe2,"[{'type': 'substitution', 'from': 'Se', 'to': ...",X4,619cdd9644389dee486ade37,"[True, True, False]",2.0,0.0,0.0,0.0
4,WSe2,"(8, 8, 1)",low_WSe2,"[{'type': 'vacancy', 'element': 'Se'}, {'type'...",X5,619cdd9644389dee486ade38,"[True, True, False]",1.0,0.0,1.0,0.0


## Merge `descriptors.csv` with `defects.csv` and modify the result

In [25]:
# Attach the descriptor information to every defect record; the left join
# keeps all rows of defects_df even when no descriptor matches
merged_df = pd.merge(defects_df, description_df, how="left", on="descriptor_id")

merged_df.head()

Unnamed: 0,_id,descriptor_id,energy,energy_per_atom,fermi_level,homo,lumo,band_gap,homo_lumo_gap,E_1,base,cell,dataset_material,defects,description,pbc,sub_Se_S,sub_W_Mo,vacant_Se,vacant_W
0,619cdd5145851b2ed2430ebf,619cdd9644389dee486ade3e,-1361.430774,-7.165425,-0.498392,-0.3863,-0.1141,0.2165,0.2722,-15.4865,WSe2,"(8, 8, 1)",low_WSe2,"[{'type': 'vacancy', 'element': 'W'}, {'type':...",V6,"[True, True, False]",1.0,0.0,1.0,1.0
1,619cdd5145851b2ed2430ec0,619cdd9644389dee486ade3d,-1368.593193,-7.165409,-0.471101,-0.321,-0.1073,0.17,0.2137,-15.4813,WSe2,"(8, 8, 1)",low_WSe2,"[{'type': 'vacancy', 'element': 'W'}, {'type':...",V5,"[True, True, False]",2.0,0.0,0.0,1.0
2,619cdd5145851b2ed2430ec1,619cdd9644389dee486ade44,-1378.063994,-7.214995,-0.326761,-0.9632,0.3281,1.2913,1.2913,-61.773,WSe2,"(8, 8, 1)",low_WSe2,"[{'type': 'substitution', 'from': 'W', 'to': '...",S6,"[True, True, False]",1.0,1.0,1.0,0.0
3,619cdd5145851b2ed2430ec2,619cdd9644389dee486ade3e,-1361.542339,-7.166012,-0.554392,-0.3612,-0.1448,0.2713,0.2164,-15.4841,WSe2,"(8, 8, 1)",low_WSe2,"[{'type': 'vacancy', 'element': 'W'}, {'type':...",V6,"[True, True, False]",1.0,0.0,1.0,1.0
4,619cdd5145851b2ed2430ec3,619cdd9644389dee486ade3e,-1361.539109,-7.165995,-0.51428,-0.3223,-0.1199,0.3755,0.2024,-15.4861,WSe2,"(8, 8, 1)",low_WSe2,"[{'type': 'vacancy', 'element': 'W'}, {'type':...",V6,"[True, True, False]",1.0,0.0,1.0,1.0


### Add `band_gap_value`

In [26]:
# Help us get the target band gap value
def remove_majmin(row):
    """Collapse the majority/minority values of a row into their mean.

    For each of ``homo``, ``lumo`` and ``E_1`` the ``*_majority`` and
    ``*_minority`` entries are averaged and stored under the plain name.
    The row is modified in place and also returned.
    """
    averaged_from = (
        ("homo", "homo_majority", "homo_minority"),
        ("lumo", "lumo_majority", "lumo_minority"),
        ("E_1", "E_1_majority", "E_1_minority"),
    )
    for target, majority_key, minority_key in averaged_from:
        row[target] = (row[majority_key] + row[minority_key])/2

    return row


def get_bgv(row, structure_df, base):
    """Add the ``band_gap_value`` entry to one row.

    Parameters
    ----------
    row : mapping-like (e.g. pandas.Series)
        Must provide ``"homo"``, ``"lumo"`` and ``"E_1"``.
    structure_df : pandas.DataFrame
        Table with ``"base"``, ``"E_1"`` and ``"E_VBM"`` columns; the first
        row whose ``"base"`` equals ``base`` supplies the pristine values.
    base : str
        Name of the base material to look up in ``structure_df``.

    Returns
    -------
    The same ``row``, with ``"band_gap_value"`` set to the difference between
    the normalised lumo and the normalised homo.
    """
    is_base = structure_df["base"] == base
    pristine_e1 = structure_df.loc[is_base, "E_1"].iloc[0]
    pristine_vbm = structure_df.loc[is_base, "E_VBM"].iloc[0]

    # Both levels are shifted by the row's E_1 and by the same pristine offset
    pristine_offset = pristine_vbm - pristine_e1
    normalised_homo = row["homo"] - row["E_1"] - pristine_offset
    normalised_lumo = row["lumo"] - row["E_1"] - pristine_offset

    row["band_gap_value"] = normalised_lumo - normalised_homo

    return row


# Datasets without a "2" in their name first need their majority/minority
# values averaged; every dataset then gets the band gap value computed
if "2" not in mat_dataset:
    merged_df = merged_df.apply(remove_majmin, axis=1)

merged_df = merged_df.apply(get_bgv, axis=1, args=(structure_df, the_material))

merged_df.head()

Unnamed: 0,_id,descriptor_id,energy,energy_per_atom,fermi_level,homo,lumo,band_gap,homo_lumo_gap,E_1,...,cell,dataset_material,defects,description,pbc,sub_Se_S,sub_W_Mo,vacant_Se,vacant_W,band_gap_value
0,619cdd5145851b2ed2430ebf,619cdd9644389dee486ade3e,-1361.430774,-7.165425,-0.498392,-0.3863,-0.1141,0.2165,0.2722,-15.4865,...,"(8, 8, 1)",low_WSe2,"[{'type': 'vacancy', 'element': 'W'}, {'type':...",V6,"[True, True, False]",1.0,0.0,1.0,1.0,0.2722
1,619cdd5145851b2ed2430ec0,619cdd9644389dee486ade3d,-1368.593193,-7.165409,-0.471101,-0.321,-0.1073,0.17,0.2137,-15.4813,...,"(8, 8, 1)",low_WSe2,"[{'type': 'vacancy', 'element': 'W'}, {'type':...",V5,"[True, True, False]",2.0,0.0,0.0,1.0,0.2137
2,619cdd5145851b2ed2430ec1,619cdd9644389dee486ade44,-1378.063994,-7.214995,-0.326761,-0.9632,0.3281,1.2913,1.2913,-61.773,...,"(8, 8, 1)",low_WSe2,"[{'type': 'substitution', 'from': 'W', 'to': '...",S6,"[True, True, False]",1.0,1.0,1.0,0.0,1.2913
3,619cdd5145851b2ed2430ec2,619cdd9644389dee486ade3e,-1361.542339,-7.166012,-0.554392,-0.3612,-0.1448,0.2713,0.2164,-15.4841,...,"(8, 8, 1)",low_WSe2,"[{'type': 'vacancy', 'element': 'W'}, {'type':...",V6,"[True, True, False]",1.0,0.0,1.0,1.0,0.2164
4,619cdd5145851b2ed2430ec3,619cdd9644389dee486ade3e,-1361.539109,-7.165995,-0.51428,-0.3223,-0.1199,0.3755,0.2024,-15.4861,...,"(8, 8, 1)",low_WSe2,"[{'type': 'vacancy', 'element': 'W'}, {'type':...",V6,"[True, True, False]",1.0,0.0,1.0,1.0,0.2024


### Add number of defect sites

In [27]:
# Load the reference structure of the dataset and record its number of sites,
# keyed by dataset name, for use when building the strata column
reference_structure = Structure.from_file(f"{dataset_path}/ref_cifs/{mat_dataset}.cif")

ref_sites_dict = {mat_dataset: reference_structure.num_sites}

# Get defect sites that will help in creating the strata column
def get_strata(row, ref_sites_dict):
    """Add the ``to_strata`` label (base material + defect concentration).

    Parameters
    ----------
    row : pandas.Series
        One row of the merged table. Entries whose label contains
        ``"vacant_"`` or ``"sub_"`` are read as defect site counts; the row
        must also provide ``"dataset_material"`` and ``"base"``.
    ref_sites_dict : dict
        Maps a dataset name to the number of sites of its reference structure.

    Returns
    -------
    pandas.Series
        The same ``row`` with ``"to_strata"`` set to
        ``"<base>_<defect concentration>"``, the concentration being the
        total number of defect sites divided by the number of reference
        sites, rounded to 5 decimals.

    Raises
    ------
    KeyError
        If the row's dataset is missing from ``ref_sites_dict``.
    """
    # Use the same "vacant_" / "sub_" patterns as the later column drop, so an
    # unrelated column that merely contains "sub" or "vacant" is never summed.
    vacants = sum(row[col] for col in row.index if "vacant_" in col)
    subs = sum(row[col] for col in row.index if "sub_" in col)

    # Total defect sites relative to the size of the reference structure
    defect_sites = vacants + subs
    total_num_sites = ref_sites_dict[row["dataset_material"]]
    defect_conc = round(defect_sites/total_num_sites,5)

    # The strata label has the form <material type>_<defect concentration>
    row["to_strata"] = f"{row['base']}_{defect_conc}"
    return row



# Label every row with its strata, then discard the per-defect count columns
# that were only needed to compute the defect concentration
merged_df = merged_df.apply(get_strata, axis=1, args=(ref_sites_dict,))

defect_count_columns = [
    col for col in merged_df.columns if "vacant_" in col or "sub_" in col
]
merged_df = merged_df.drop(columns=defect_count_columns)
merged_df.head()



Unnamed: 0,_id,descriptor_id,energy,energy_per_atom,fermi_level,homo,lumo,band_gap,homo_lumo_gap,E_1,base,cell,dataset_material,defects,description,pbc,band_gap_value,to_strata
0,619cdd5145851b2ed2430ebf,619cdd9644389dee486ade3e,-1361.430774,-7.165425,-0.498392,-0.3863,-0.1141,0.2165,0.2722,-15.4865,WSe2,"(8, 8, 1)",low_WSe2,"[{'type': 'vacancy', 'element': 'W'}, {'type':...",V6,"[True, True, False]",0.2722,WSe2_0.01562
1,619cdd5145851b2ed2430ec0,619cdd9644389dee486ade3d,-1368.593193,-7.165409,-0.471101,-0.321,-0.1073,0.17,0.2137,-15.4813,WSe2,"(8, 8, 1)",low_WSe2,"[{'type': 'vacancy', 'element': 'W'}, {'type':...",V5,"[True, True, False]",0.2137,WSe2_0.01562
2,619cdd5145851b2ed2430ec1,619cdd9644389dee486ade44,-1378.063994,-7.214995,-0.326761,-0.9632,0.3281,1.2913,1.2913,-61.773,WSe2,"(8, 8, 1)",low_WSe2,"[{'type': 'substitution', 'from': 'W', 'to': '...",S6,"[True, True, False]",1.2913,WSe2_0.01562
3,619cdd5145851b2ed2430ec2,619cdd9644389dee486ade3e,-1361.542339,-7.166012,-0.554392,-0.3612,-0.1448,0.2713,0.2164,-15.4841,WSe2,"(8, 8, 1)",low_WSe2,"[{'type': 'vacancy', 'element': 'W'}, {'type':...",V6,"[True, True, False]",0.2164,WSe2_0.01562
4,619cdd5145851b2ed2430ec3,619cdd9644389dee486ade3e,-1361.539109,-7.165995,-0.51428,-0.3223,-0.1199,0.3755,0.2024,-15.4861,WSe2,"(8, 8, 1)",low_WSe2,"[{'type': 'vacancy', 'element': 'W'}, {'type':...",V6,"[True, True, False]",0.2024,WSe2_0.01562


## Remove unneeded columns and add others for uniformity

In [28]:
# Choose the columns to discard from the dataset name; a name that matches
# none of the cases leaves merged_df untouched
if "2" not in mat_dataset:
    # These datasets carry the majority/minority columns
    columns_to_drop = ["defects", "descriptor_id", "homo_majority", "lumo_majority",
                       "homo_lumo_gap_majority","E_1_majority", "homo_minority",
                       "lumo_minority", "homo_lumo_gap_minority", "E_1_minority",
                       "homo", "lumo", "description", "energy", "fermi_level",
                       "E_1", "cell", "total_mag", "base"]
elif "high" in mat_dataset:
    columns_to_drop = ["defects", "descriptor_id", "homo_lumo_gap",
                       "homo", "lumo", "description", "energy",
                       "fermi_level", "E_1", "cell", "base"]
elif "low" in mat_dataset:
    columns_to_drop = ["defects", "descriptor_id", "homo_lumo_gap",
                       "band_gap", "homo", "lumo", "description",
                       "pbc", "energy", "fermi_level", "E_1", "cell",
                       "base", "energy_per_atom"]
else:
    columns_to_drop = None

if columns_to_drop is not None:
    merged_df = merged_df.drop(columns_to_drop, axis=1)

merged_df.head()

Unnamed: 0,_id,dataset_material,band_gap_value,to_strata
0,619cdd5145851b2ed2430ebf,low_WSe2,0.2722,WSe2_0.01562
1,619cdd5145851b2ed2430ec0,low_WSe2,0.2137,WSe2_0.01562
2,619cdd5145851b2ed2430ec1,low_WSe2,1.2913,WSe2_0.01562
3,619cdd5145851b2ed2430ec2,low_WSe2,0.2164,WSe2_0.01562
4,619cdd5145851b2ed2430ec3,low_WSe2,0.2024,WSe2_0.01562


In [30]:
# Encode each distinct strata label as an integer (numbered in order of first
# appearance) so the column can be used directly for stratified splitting
unique_values = pd.unique(merged_df["to_strata"])
mapping = dict(zip(unique_values, range(len(unique_values))))

merged_df = (
    merged_df
    .assign(strata=merged_df["to_strata"].map(mapping))
    .drop(columns=["to_strata"])
)
merged_df.head()

Unnamed: 0,_id,dataset_material,band_gap_value,strata
0,619cdd5145851b2ed2430ebf,low_WSe2,0.2722,0
1,619cdd5145851b2ed2430ec0,low_WSe2,0.2137,0
2,619cdd5145851b2ed2430ec1,low_WSe2,1.2913,0
3,619cdd5145851b2ed2430ec2,low_WSe2,0.2164,0
4,619cdd5145851b2ed2430ec3,low_WSe2,0.2024,0


The next step is to turn this code into a Python script so that the same process can be applied to all materials.

This is done through `combine.py`