In [1]:
import sys

if "google.colab" in sys.modules:
    # Mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    original_data = '/content/drive/My Drive/original_dataset'
    final_data = '/content/drive/My Drive/final_dataset'

    # Install required packages
    !pip install pymatgen

else:
    original_data = 'original_dataset'
    final_data = 'final_dataset'

In [2]:
import ast
from collections import Counter

import pandas as pd
from pymatgen.core import Structure

In [3]:
# Dataset to process; its name has the form "<prefix>_<material>"
mat_dataset = "high_InSe"
parts = mat_dataset.split("_")
the_material = parts[1]

# Per-dataset tables live in a sub-folder named after the dataset,
# while the shared tables sit directly in the original data folder
material_dir = f"{original_data}/{mat_dataset}"
defects_df = pd.read_csv(f"{material_dir}/defects.csv")
description_df = pd.read_csv(f"{material_dir}/descriptors.csv")

structure_df = pd.read_csv(f"{original_data}/initial_structures.csv")
elements_df = pd.read_csv(f"{original_data}/elements.csv")

## Prepare the descriptor df

In [4]:
# Rename the descriptor id column and tag every row with the dataset it
# comes from (the base for future stratification)
description_df = (
    description_df
    .rename(columns={"_id": "descriptor_id"})
    .assign(dataset_material=mat_dataset)
)

description_df.head()

Unnamed: 0,descriptor_id,description,base,cell,defects,dataset_material
0,7636e0f2-ebef-43f6-b535-a5aa4526cc10,In70Se71,InSe,"[6, 6, 1]","[{'type': 'vacancy', 'element': 'Se'}, {'type'...",high_InSe
1,3bc213d4-2cc7-4901-a5d9-d6193b4542b9,In70Ga2Se71S1,InSe,"[6, 6, 1]","[{'type': 'substitution', 'from': 'In', 'to': ...",high_InSe
2,c2119a7c-6d8d-48f8-9250-49d8cd503138,In71Ga1Se70S1,InSe,"[6, 6, 1]","[{'type': 'substitution', 'from': 'In', 'to': ...",high_InSe
3,b578b61e-bcab-4484-b001-b144462d9961,In70Ga1Se71,InSe,"[6, 6, 1]","[{'type': 'vacancy', 'element': 'Se'}, {'type'...",high_InSe
4,ec4a3ea8-1f91-4c6e-963a-f87c5cba6e49,In70Se71S1,InSe,"[6, 6, 1]","[{'type': 'vacancy', 'element': 'In'}, {'type'...",high_InSe


## Clearly represent defects

In [5]:
# Clearly represent the defects in the description_df
def string_to_sites(a_column):
    """Count the defects listed in one ``defects`` cell of the descriptor table.

    Parameters
    ----------
    a_column : str
        String form of a list of defect dicts, for example
        ``[{'type': 'vacancy', 'element': 'Se'}, {'type': 'substitution', 'from': 'In', 'to': 'Ga'}]``.

    Returns
    -------
    dict
        Maps each defect label to the number of times it occurs in the cell,
        in order of first appearance. Labels are ``vacant_<element>`` for a
        vacancy, ``sub_<from>_<to>`` for a substitution and ``ubnormal`` for
        any other defect type.

    Raises
    ------
    ValueError, SyntaxError
        If the cell is not a plain Python literal.
    """
    # A blank cell describes no defects at all
    if not a_column.strip():
        return {}

    # literal_eval parses the whole list in one go and, unlike eval, can only
    # build literals, so a malformed or hostile cell cannot execute code.
    list_of_dicts = ast.literal_eval(a_column)

    # A single dict written without the surrounding list is still one defect
    if isinstance(list_of_dicts, dict):
        list_of_dicts = [list_of_dicts]

    list_of_defects = []
    for defect_info in list_of_dicts:
        if defect_info["type"] == "vacancy":
            list_of_defects.append(f'vacant_{defect_info["element"]}')

        elif defect_info["type"] == "substitution":
            list_of_defects.append(f'sub_{defect_info["from"]}_{defect_info["to"]}')

        else:
            list_of_defects.append("ubnormal")

    # Dictionary of defect_type: number_of_sites
    return dict(Counter(list_of_defects))


def string_to_columns(row):
    """Expand the ``defects`` cell of one row into per-defect count entries.

    Parameters
    ----------
    row : pandas.Series
        One row of the descriptor table; its ``defects`` entry is parsed by
        ``string_to_sites``.

    Returns
    -------
    pandas.Series
        A copy of the row with one extra entry per defect label (value: the
        number of sites) and every missing value replaced by 0.0.
    """
    # Work on a copy: DataFrame.apply does not support functions that mutate
    # the object they are handed.
    expanded = row.copy()

    for defect_label, site_count in string_to_sites(expanded["defects"]).items():
        expanded[defect_label] = site_count

    return expanded.fillna(0.0)


# Expand the defects of every row into count columns; entries still missing
# after the row-wise expansion are filled with 0
description_df = description_df.apply(string_to_columns, axis=1).fillna(0)
description_df.head()

Unnamed: 0,base,cell,dataset_material,defects,description,descriptor_id,sub_In_Ga,sub_Se_S,vacant_In,vacant_Se
0,InSe,"[6, 6, 1]",high_InSe,"[{'type': 'vacancy', 'element': 'Se'}, {'type'...",In70Se71,7636e0f2-ebef-43f6-b535-a5aa4526cc10,0.0,0.0,2.0,1.0
1,InSe,"[6, 6, 1]",high_InSe,"[{'type': 'substitution', 'from': 'In', 'to': ...",In70Ga2Se71S1,3bc213d4-2cc7-4901-a5d9-d6193b4542b9,2.0,1.0,0.0,0.0
2,InSe,"[6, 6, 1]",high_InSe,"[{'type': 'substitution', 'from': 'In', 'to': ...",In71Ga1Se70S1,c2119a7c-6d8d-48f8-9250-49d8cd503138,1.0,1.0,0.0,1.0
3,InSe,"[6, 6, 1]",high_InSe,"[{'type': 'vacancy', 'element': 'Se'}, {'type'...",In70Ga1Se71,b578b61e-bcab-4484-b001-b144462d9961,1.0,0.0,1.0,1.0
4,InSe,"[6, 6, 1]",high_InSe,"[{'type': 'vacancy', 'element': 'In'}, {'type'...",In70Se71S1,ec4a3ea8-1f91-4c6e-963a-f87c5cba6e49,0.0,1.0,2.0,0.0


## Merge `descriptors.csv` and `defects.csv`, then modify the merged table

In [6]:
# Add description to defects df
merged_df = defects_df.merge(description_df, on="descriptor_id", how="left")

merged_df.head()

Unnamed: 0,_id,descriptor_id,energy,fermi_level,total_mag,homo_lumo_gap_majority,lumo_majority,homo_majority,E_1_majority,homo_lumo_gap_minority,...,E_1_minority,base,cell,dataset_material,defects,description,sub_In_Ga,sub_Se_S,vacant_In,vacant_Se
0,InSe_In70Se71_72e0e514-b756-42a9-b3f2-2b1e00c1...,7636e0f2-ebef-43f6-b535-a5aa4526cc10,-506.299074,-1.500321,0.000268,0.6107,-1.1958,-1.8065,-17.041,0.6107,...,-17.041,InSe,"[6, 6, 1]",high_InSe,"[{'type': 'vacancy', 'element': 'Se'}, {'type'...",In70Se71,0.0,0.0,2.0,1.0
1,InSe_In70Ga2Se71S1_26bbfd31-5bf6-4567-aa7a-1d2...,3bc213d4-2cc7-4901-a5d9-d6193b4542b9,-524.111201,-1.63243,0.002046,1.3139,-0.8437,-2.1576,-16.4969,1.3139,...,-16.4969,InSe,"[6, 6, 1]",high_InSe,"[{'type': 'substitution', 'from': 'In', 'to': ...",In70Ga2Se71S1,2.0,1.0,0.0,0.0
2,InSe_In71Ga1Se70S1_62c52d3d-e0c4-48ee-a9ed-6f6...,c2119a7c-6d8d-48f8-9250-49d8cd503138,-518.116026,-1.385209,0.002178,1.1354,-0.7748,-1.9102,-16.4856,1.1354,...,-16.4856,InSe,"[6, 6, 1]",high_InSe,"[{'type': 'substitution', 'from': 'In', 'to': ...",In71Ga1Se70S1,1.0,1.0,0.0,1.0
3,InSe_In70Ga1Se71_0cf6df90-f056-4dbf-a63a-6660e...,b578b61e-bcab-4484-b001-b144462d9961,-511.864936,-1.84245,0.202513,1.0517,-0.8206,-1.8723,-16.9629,1.0461,...,-16.963,InSe,"[6, 6, 1]",high_InSe,"[{'type': 'vacancy', 'element': 'Se'}, {'type'...",In70Ga1Se71,1.0,0.0,1.0,1.0
4,InSe_In70Se71S1_cf386f02-a025-440c-b521-e10b6c...,ec4a3ea8-1f91-4c6e-963a-f87c5cba6e49,-511.991292,-2.021991,1.770915,1.2453,-0.8978,-2.1431,-17.0228,1.1244,...,-17.023,InSe,"[6, 6, 1]",high_InSe,"[{'type': 'vacancy', 'element': 'In'}, {'type'...",In70Se71S1,0.0,1.0,2.0,0.0


### Add `band_gap_value`

In [7]:
# Help us get the target band gap value
def remove_majmin(row):
    """Collapse the majority/minority values of a row into their mean.

    Adds ``homo``, ``lumo`` and ``E_1`` to the row, each being the average of
    the matching ``*_majority`` and ``*_minority`` entries, and returns the
    same (modified) row.
    """
    spin_pairs = {
        "homo": ("homo_majority", "homo_minority"),
        "lumo": ("lumo_majority", "lumo_minority"),
        "E_1": ("E_1_majority", "E_1_minority"),
    }

    for averaged_key, (majority_key, minority_key) in spin_pairs.items():
        row[averaged_key] = (row[majority_key] + row[minority_key])/2

    return row


def get_bgv(row, structure_df, base):
    """Add the target ``band_gap_value`` to one row.

    Parameters
    ----------
    row : pandas.Series
        Row holding ``homo``, ``lumo`` and ``E_1``.
    structure_df : pandas.DataFrame
        Table with ``base``, ``E_1`` and ``E_VBM`` columns; the first row
        whose ``base`` equals ``base`` supplies the pristine values.
    base : str
        Value looked up in the ``base`` column of ``structure_df``.

    Returns
    -------
    pandas.Series
        The same row with ``band_gap_value`` added.
    """
    pristine_rows = structure_df.loc[structure_df["base"] == base]
    pristine_e1 = pristine_rows["E_1"].iloc[0]
    pristine_vbm = pristine_rows["E_VBM"].iloc[0]

    # Both levels are referenced to the row's E_1 and shifted by the pristine
    # VBM-to-E_1 offset; algebraically the shift cancels in the difference.
    pristine_offset = pristine_vbm - pristine_e1
    normalised = {
        level: row[level] - row["E_1"] - pristine_offset
        for level in ("homo", "lumo")
    }

    row["band_gap_value"] = normalised["lumo"] - normalised["homo"]

    return row


# Only datasets without a "2" in their name get their majority/minority
# columns averaged first; the band gap value is then computed for every dataset
if "2" not in mat_dataset:
    merged_df = merged_df.apply(remove_majmin, axis=1)

merged_df = merged_df.apply(get_bgv, axis=1, args=(structure_df, the_material))

merged_df.head()

Unnamed: 0,_id,descriptor_id,energy,fermi_level,total_mag,homo_lumo_gap_majority,lumo_majority,homo_majority,E_1_majority,homo_lumo_gap_minority,...,defects,description,sub_In_Ga,sub_Se_S,vacant_In,vacant_Se,homo,lumo,E_1,band_gap_value
0,InSe_In70Se71_72e0e514-b756-42a9-b3f2-2b1e00c1...,7636e0f2-ebef-43f6-b535-a5aa4526cc10,-506.299074,-1.500321,0.000268,0.6107,-1.1958,-1.8065,-17.041,0.6107,...,"[{'type': 'vacancy', 'element': 'Se'}, {'type'...",In70Se71,0.0,0.0,2.0,1.0,-1.8065,-1.1958,-17.041,0.6107
1,InSe_In70Ga2Se71S1_26bbfd31-5bf6-4567-aa7a-1d2...,3bc213d4-2cc7-4901-a5d9-d6193b4542b9,-524.111201,-1.63243,0.002046,1.3139,-0.8437,-2.1576,-16.4969,1.3139,...,"[{'type': 'substitution', 'from': 'In', 'to': ...",In70Ga2Se71S1,2.0,1.0,0.0,0.0,-2.1576,-0.8437,-16.4969,1.3139
2,InSe_In71Ga1Se70S1_62c52d3d-e0c4-48ee-a9ed-6f6...,c2119a7c-6d8d-48f8-9250-49d8cd503138,-518.116026,-1.385209,0.002178,1.1354,-0.7748,-1.9102,-16.4856,1.1354,...,"[{'type': 'substitution', 'from': 'In', 'to': ...",In71Ga1Se70S1,1.0,1.0,0.0,1.0,-1.9102,-0.7748,-16.4856,1.1354
3,InSe_In70Ga1Se71_0cf6df90-f056-4dbf-a63a-6660e...,b578b61e-bcab-4484-b001-b144462d9961,-511.864936,-1.84245,0.202513,1.0517,-0.8206,-1.8723,-16.9629,1.0461,...,"[{'type': 'vacancy', 'element': 'Se'}, {'type'...",In70Ga1Se71,1.0,0.0,1.0,1.0,-1.86915,-0.82025,-16.96295,1.0489
4,InSe_In70Se71S1_cf386f02-a025-440c-b521-e10b6c...,ec4a3ea8-1f91-4c6e-963a-f87c5cba6e49,-511.991292,-2.021991,1.770915,1.2453,-0.8978,-2.1431,-17.0228,1.1244,...,"[{'type': 'vacancy', 'element': 'In'}, {'type'...",In70Se71S1,0.0,1.0,2.0,0.0,-2.08,-0.89515,-17.0229,1.18485


### Add number of defect sites

In [8]:
# Add strata and ref column
ref_sites_dict = {}

reference_structure = Structure.from_file(f"{final_data}/ref_cifs/{mat_dataset}.cif")

# Add number of reference sites to the dictionary
ref_sites_dict[mat_dataset] = reference_structure.num_sites

# Get defect sites that will help in creating the strata column
def get_strata(row, ref_sites_dict):
    """Label one row with its stratification key ``<base>_<defect concentration>``.

    Parameters
    ----------
    row : pandas.Series
        Row holding the defect count entries (labels starting with
        ``vacant_`` or ``sub_``) together with ``dataset_material`` and
        ``base``.
    ref_sites_dict : dict
        Maps a ``dataset_material`` value to the number of sites of its
        reference structure.

    Returns
    -------
    pandas.Series
        The same row with a ``to_strata`` entry added.
    """
    # Match on the label prefix rather than on a substring, so an unrelated
    # column whose name merely contains "sub" or "vacant" is never summed as
    # a defect count.
    # NOTE(review): counts stored under the "ubnormal" label produced by
    # string_to_sites are not included here - confirm that is intended.
    vacants = sum(row[col] for col in row.index if str(col).startswith("vacant_"))
    subs = sum(row[col] for col in row.index if str(col).startswith("sub_"))

    # Get total defect sites
    defect_sites = vacants + subs

    total_num_sites = ref_sites_dict[row["dataset_material"]]

    # Get defect concentration
    defect_conc = round(defect_sites/total_num_sites,5)

    # The strata column will be in the form of material type_defect concentration
    row["to_strata"] = f"{row['base']}_{defect_conc}"
    return row



# Replace the specific defect sites with type of defect sites
# Label every row with its stratum, then drop the per-defect count columns
merged_df = merged_df.apply(get_strata, axis=1, args=(ref_sites_dict,))
defect_count_columns = [
    col for col in merged_df.columns if "vacant_" in col or "sub_" in col
]
merged_df = merged_df.drop(columns=defect_count_columns)
merged_df.head()

Unnamed: 0,_id,descriptor_id,energy,fermi_level,total_mag,homo_lumo_gap_majority,lumo_majority,homo_majority,E_1_majority,homo_lumo_gap_minority,...,base,cell,dataset_material,defects,description,homo,lumo,E_1,band_gap_value,to_strata
0,InSe_In70Se71_72e0e514-b756-42a9-b3f2-2b1e00c1...,7636e0f2-ebef-43f6-b535-a5aa4526cc10,-506.299074,-1.500321,0.000268,0.6107,-1.1958,-1.8065,-17.041,0.6107,...,InSe,"[6, 6, 1]",high_InSe,"[{'type': 'vacancy', 'element': 'Se'}, {'type'...",In70Se71,-1.8065,-1.1958,-17.041,0.6107,InSe_0.02083
1,InSe_In70Ga2Se71S1_26bbfd31-5bf6-4567-aa7a-1d2...,3bc213d4-2cc7-4901-a5d9-d6193b4542b9,-524.111201,-1.63243,0.002046,1.3139,-0.8437,-2.1576,-16.4969,1.3139,...,InSe,"[6, 6, 1]",high_InSe,"[{'type': 'substitution', 'from': 'In', 'to': ...",In70Ga2Se71S1,-2.1576,-0.8437,-16.4969,1.3139,InSe_0.02083
2,InSe_In71Ga1Se70S1_62c52d3d-e0c4-48ee-a9ed-6f6...,c2119a7c-6d8d-48f8-9250-49d8cd503138,-518.116026,-1.385209,0.002178,1.1354,-0.7748,-1.9102,-16.4856,1.1354,...,InSe,"[6, 6, 1]",high_InSe,"[{'type': 'substitution', 'from': 'In', 'to': ...",In71Ga1Se70S1,-1.9102,-0.7748,-16.4856,1.1354,InSe_0.02083
3,InSe_In70Ga1Se71_0cf6df90-f056-4dbf-a63a-6660e...,b578b61e-bcab-4484-b001-b144462d9961,-511.864936,-1.84245,0.202513,1.0517,-0.8206,-1.8723,-16.9629,1.0461,...,InSe,"[6, 6, 1]",high_InSe,"[{'type': 'vacancy', 'element': 'Se'}, {'type'...",In70Ga1Se71,-1.86915,-0.82025,-16.96295,1.0489,InSe_0.02083
4,InSe_In70Se71S1_cf386f02-a025-440c-b521-e10b6c...,ec4a3ea8-1f91-4c6e-963a-f87c5cba6e49,-511.991292,-2.021991,1.770915,1.2453,-0.8978,-2.1431,-17.0228,1.1244,...,InSe,"[6, 6, 1]",high_InSe,"[{'type': 'vacancy', 'element': 'In'}, {'type'...",In70Se71S1,-2.08,-0.89515,-17.0229,1.18485,InSe_0.02083


### Remove unneeded columns and add others for uniformity

In [9]:
# Pick the columns to discard from the dataset name: names without a "2" use
# the majority/minority layout, names with a "2" are split into "high" and
# "low". A name matching none of the cases leaves the table untouched.
if "2" not in mat_dataset:
    unwanted_columns = ["defects", "descriptor_id", "homo_majority", "lumo_majority",
                        "homo_lumo_gap_majority","E_1_majority", "homo_minority",
                        "lumo_minority", "homo_lumo_gap_minority", "E_1_minority",
                        "homo", "lumo", "description", "energy", "fermi_level",
                        "E_1", "cell", "total_mag", "base"]

elif "high" in mat_dataset:
    unwanted_columns = ["defects", "descriptor_id", "homo_lumo_gap",
                        "homo", "lumo", "description", "energy",
                        "fermi_level", "E_1", "cell", "base"]

elif "low" in mat_dataset:
    unwanted_columns = ["defects", "descriptor_id", "homo_lumo_gap",
                        "band_gap", "homo", "lumo", "description",
                        "pbc", "energy", "fermi_level", "E_1", "cell",
                        "base", "energy_per_atom"]

else:
    unwanted_columns = None

if unwanted_columns is not None:
    merged_df = merged_df.drop(unwanted_columns, axis=1)

merged_df.head()

Unnamed: 0,_id,dataset_material,band_gap_value,to_strata
0,InSe_In70Se71_72e0e514-b756-42a9-b3f2-2b1e00c1...,high_InSe,0.6107,InSe_0.02083
1,InSe_In70Ga2Se71S1_26bbfd31-5bf6-4567-aa7a-1d2...,high_InSe,1.3139,InSe_0.02083
2,InSe_In71Ga1Se70S1_62c52d3d-e0c4-48ee-a9ed-6f6...,high_InSe,1.1354,InSe_0.02083
3,InSe_In70Ga1Se71_0cf6df90-f056-4dbf-a63a-6660e...,high_InSe,1.0489,InSe_0.02083
4,InSe_In70Se71S1_cf386f02-a025-440c-b521-e10b6c...,high_InSe,1.18485,InSe_0.02083


In [10]:
# Ready the strata using a function
# Convert the values in the strata to integers to split effectively
def get_strata(merged_df):
    """Replace the ``to_strata`` labels with integer strata codes.

    NOTE: once this cell has run, this definition shadows the row-level
    ``get_strata(row, ref_sites_dict)`` defined earlier in the notebook.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Table with a ``to_strata`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New table in which ``to_strata`` is replaced by a ``strata`` column;
        each distinct label gets the integer position of its first appearance
        (0, 1, 2, ...).
    """
    unique_values = pd.unique(merged_df["to_strata"])
    mapping = {value: code for code, value in enumerate(unique_values)}

    # assign() builds a new frame, so the caller's DataFrame does not silently
    # gain a "strata" column as a side effect.
    return (
        merged_df
        .assign(strata=merged_df["to_strata"].map(mapping))
        .drop(columns=["to_strata"])
    )

# Swap the strata labels for their integer codes
new_df = merged_df.pipe(get_strata)
new_df.head()

Unnamed: 0,_id,dataset_material,band_gap_value,strata
0,InSe_In70Se71_72e0e514-b756-42a9-b3f2-2b1e00c1...,high_InSe,0.6107,0
1,InSe_In70Ga2Se71S1_26bbfd31-5bf6-4567-aa7a-1d2...,high_InSe,1.3139,0
2,InSe_In71Ga1Se70S1_62c52d3d-e0c4-48ee-a9ed-6f6...,high_InSe,1.1354,0
3,InSe_In70Ga1Se71_0cf6df90-f056-4dbf-a63a-6660e...,high_InSe,1.0489,0
4,InSe_In70Se71S1_cf386f02-a025-440c-b521-e10b6c...,high_InSe,1.18485,0


The next step is to turn this code into a Python script so that the same process can be applied to all materials.

This is done in `combine.py`.