In [1]:
import sys
# Pick the dataset locations depending on the runtime: Google Colab is detected by
# the presence of the "google.colab" module; any other runtime uses relative folders.
if "google.colab" in sys.modules:
    # Mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    # Input and output folders live under "My Drive" on the mounted drive
    original_data = '/content/drive/My Drive/original_dataset'
    final_data = '/content/drive/My Drive/Final_Dataset'

    # Install required packages
    # NOTE(review): the install is unpinned and uses `!pip`; consider `%pip install` with a pinned version.
    !pip install pymatgen

else:
    # Local run: folders are resolved relative to the current working directory
    original_data = 'original_dataset'
    final_data = 'Final_Dataset'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import ast
import os

import pandas as pd
from pymatgen.core import Structure

In [3]:
# Standard df
# Dataset-wide lookup tables read from the original dataset folder.

# init_structure_df is queried further down through its "base", "cell_size", "E_1" and "E_VBM" columns.
init_structure_df = pd.read_csv(f"{original_data}/initial_structures.csv")
# NOTE(review): elements_df is not referenced anywhere else in the visible notebook - confirm it is still needed.
elements_df = pd.read_csv(f"{original_data}/elements.csv")

# For testing
# mat_dataset = "low_MoS2"

## Get all the reference structures

In [4]:
def get_reference_structures(mat_dataset, testing=False):
    """Build the pristine reference supercell for one dataset.

    The unit cell is read from ``{original_data}/{mat_dataset}/{base}.cif`` and
    expanded with the ``cell_size`` entry that ``init_structure_df`` stores for
    the base material.

    Args:
        mat_dataset: Dataset folder name of the form ``<prefix>_<material>``,
            e.g. ``"low_MoS2"``; the second underscore-separated token is used
            as the base material.
        testing: When True, the reference structure is not written to disk.

    Returns:
        Tuple ``(ref_structure, ref_num_sites)``: the supercell and its number
        of sites.

    Raises:
        IndexError: If ``mat_dataset`` contains no underscore, or the base
            material has no row in ``init_structure_df``.
    """
    # The material
    base_material = mat_dataset.split("_")[1]

    # The unit cell structure
    unit_structure = Structure.from_file(f"{original_data}/{mat_dataset}/{base_material}.cif")

    # The cell matrix is stored as text in the CSV, so parse it as a Python literal
    is_base = init_structure_df["base"] == base_material
    cell_matrix = ast.literal_eval(init_structure_df.loc[is_base, "cell_size"].iloc[0])

    # make_supercell expands the structure in place. Its return value is not
    # relied upon because it differs between pymatgen releases (older ones
    # return None), whereas the in-place result is available in all of them.
    unit_structure.make_supercell(cell_matrix)
    ref_structure = unit_structure

    ref_num_sites = ref_structure.num_sites

    if not testing:
        # Make sure the output folder exists before writing the CIF
        ref_dir = f"{final_data}/ref_cifs"
        os.makedirs(ref_dir, exist_ok=True)
        # Pass the path by keyword: the positional order of Structure.to()
        # arguments changed between pymatgen releases.
        ref_structure.to(filename=f"{ref_dir}/{mat_dataset}.cif")

    return ref_structure, ref_num_sites

# Test
# ref_struct, ref_num = get_reference_structures(mat_dataset, testing=True)

## Prepare the required dataframes

In [5]:
def get_df(mat_dataset):
    """Load the defects and descriptors tables of one dataset.

    Args:
        mat_dataset: Dataset folder name such as ``"high_InSe"``; the second
            underscore-separated token is returned as the material.

    Returns:
        Tuple ``(defects_df, description_df, the_material)``. In
        ``description_df`` the ``_id`` column is renamed to ``descriptor_id``
        and a ``dataset_material`` column holding ``mat_dataset`` is added.
    """
    the_material = mat_dataset.split("_")[1]

    # Load the data
    defects_df = pd.read_csv(f"{original_data}/{mat_dataset}/defects.csv")
    raw_descriptors = pd.read_csv(f"{original_data}/{mat_dataset}/descriptors.csv")

    # Prepare description_df: expose the id under the name used in defects_df
    # and tag every row with its dataset for future stratification
    description_df = (
        raw_descriptors
        .rename(columns={"_id": "descriptor_id"})
        .assign(dataset_material=mat_dataset)
    )

    return defects_df, description_df, the_material

# Test
# defects_df, description_df, the_material = get_df(mat_dataset)
# description_df.head()

## Clearly represent defects

In [6]:
# Clearly represent the defects in the description_df
def string_to_dict(defects_string: str) -> dict:
    """Count the defects described by a ``defects`` cell.

    The cell is expected to hold the text form of a list of flat dicts, e.g.
    ``"[{'type': 'vacancy', 'element': 'S'}, {'type': 'substitution', 'from': 'Mo', 'to': 'W'}]"``.

    Args:
        defects_string: Text representation of the defect records.

    Returns:
        Dict mapping a defect label to the number of records carrying it, in
        order of first appearance. Labels are ``vacant_<element>`` for
        vacancies, ``sub_<from>_<to>`` for substitutions and ``"ubnormal"``
        for any other type.

    Raises:
        ValueError, SyntaxError: If a record is not a plain Python literal.
        KeyError: If a record lacks the keys required for its type.
    """
    # Remove the list brackets so only the "{...}" records remain
    for bracket in ("[", "]"):
        defects_string = defects_string.replace(bracket, "")

    # Every record ends with "}"; whatever follows the last "}" is discarded
    fragments = defects_string.split("}")[:-1]

    # Strip the " ," separator in front of each record and restore its "}".
    # literal_eval only accepts literals, so text coming from the CSV can
    # never execute code (unlike eval).
    records = [ast.literal_eval(fragment.lstrip(" ,") + "}") for fragment in fragments]

    # Create a dictionary of defect_type: number_of_sites in a single pass
    counts = {}
    for record in records:
        if record["type"] == "vacancy":
            label = f'vacant_{record["element"]}'
        elif record["type"] == "substitution":
            label = f'sub_{record["from"]}_{record["to"]}'
        else:
            label = "ubnormal"
        counts[label] = counts.get(label, 0) + 1

    return counts


def dict_to_columns(row):
    """Expand the ``defects`` text of one row into one count column per defect.

    Intended for ``DataFrame.apply(..., axis=1)``.

    Args:
        row: Series with a ``defects`` entry understood by ``string_to_dict``.

    Returns:
        A new Series holding the original entries plus one entry per defect
        label with its count. Every missing value in the row (not only defect
        counts) is replaced by ``0.0``.
    """
    # Work on a copy: pandas does not support functions that mutate the
    # object handed to them by apply.
    expanded = row.copy()

    for defect_label, site_count in string_to_dict(row["defects"]).items():
        expanded[defect_label] = site_count

    return expanded.fillna(0.0)


# Test
# description_df = description_df.apply(lambda row: dict_to_columns(row), axis= 1).fillna(0)
# description_df.head()

## Merge the `descriptors.csv` and the `defects.csv`

In [7]:
# Add description to defects df
# merged_df = defects_df.merge(description_df, on="descriptor_id", how="left")

# merged_df.head()

## Add strata and ref column

In [8]:
# Get defect sites that will help in creating the strata column
def get_to_strata(row, ref_num_sites):
    """Summarise the defect counts of one row and build its stratification label.

    Intended for ``DataFrame.apply(..., axis=1)``.

    Args:
        row: Series with a ``base`` entry and per-defect count entries whose
            names start with ``vacant_`` or ``sub_``.
        ref_num_sites: Number of sites of the reference structure, used as the
            denominator of the defect concentration.

    Returns:
        A new Series with the added entries ``vacancy_sites``,
        ``substitution_sites``, ``defect_sites`` and ``to_strata``
        (``"<base>_<defect concentration rounded to 5 decimals>"``).
    """
    # Work on a copy: pandas does not support functions that mutate the
    # object handed to them by apply.
    row = row.copy()

    # Match on the label prefixes rather than on substrings, so that summary
    # columns such as "substitution_sites" are never counted as defects.
    vacant_columns = [col for col in row.index if str(col).startswith("vacant_")]
    sub_columns = [col for col in row.index if str(col).startswith("sub_")]

    vacants = sum(row[col] for col in vacant_columns)
    subs = sum(row[col] for col in sub_columns)

    # Get total defect sites
    defect_sites = vacants + subs

    # Get defect concentration
    defect_conc = round(defect_sites / ref_num_sites, 5)

    # Other valuable columns
    row["vacancy_sites"] = vacants
    row["substitution_sites"] = subs
    row["defect_sites"] = defect_sites

    # The strata column will be in the form of material type_defect concentration
    row["to_strata"] = f"{row['base']}_{defect_conc}"
    return row


# Test
# Replace the specific defect sites with type of defect sites
# merged_df = merged_df.apply(lambda row: get_to_strata(row, ref_num), axis=1)
# merged_df = merged_df.drop(columns=[col for col in merged_df.columns if "vacant_" in col or "sub_" in col])
# merged_df.head()

### Add `band_gap_value`

In [9]:
# get the target band gap value
def remove_majmin(row):
    """Add ``homo``, ``lumo`` and ``E_1`` as the mean of their spin channels.

    Each value is the average of the matching ``*_majority`` and
    ``*_minority`` entries. The row is updated in place and returned.
    """
    for level in ("homo", "lumo", "E_1"):
        row[level] = (row[f"{level}_majority"] + row[f"{level}_minority"])/2

    return row


def get_bgv(row, base):
    """Add ``band_gap_value`` to a row from its aligned HOMO and LUMO levels.

    Both levels are referenced to the row's ``E_1`` and shifted by the
    pristine ``E_VBM - E_1`` offset that ``init_structure_df`` stores for
    ``base``. The same shift is applied to both levels, so it cancels in the
    difference and the result equals ``lumo - homo`` up to floating-point
    rounding.

    Args:
        row: Series with ``homo``, ``lumo`` and ``E_1`` entries; updated in place.
        base: Base material used to look up the pristine reference values.

    Returns:
        The same row with the ``band_gap_value`` entry set.
    """
    pristine_rows = init_structure_df["base"] == base
    E_1_pristine = init_structure_df.loc[pristine_rows, "E_1"].iloc[0]
    E_vbm_pristine = init_structure_df.loc[pristine_rows, "E_VBM"].iloc[0]
    pristine_offset = E_vbm_pristine - E_1_pristine

    aligned = {
        level: row[level] - row["E_1"] - pristine_offset
        for level in ("homo", "lumo")
    }
    row["band_gap_value"] = aligned["lumo"] - aligned["homo"]

    return row


'''if "2" not in mat_dataset:
    merged_df = merged_df.apply(remove_majmin, axis= 1)
    merged_df = merged_df.apply(lambda row: get_bgv(row, the_material), axis=1)

else:
    merged_df = merged_df.apply(lambda row: get_bgv(row, the_material),axis=1)

merged_df.head()'''

'if "2" not in mat_dataset:\n    merged_df = merged_df.apply(remove_majmin, axis= 1)\n    merged_df = merged_df.apply(lambda row: get_bgv(row, the_material), axis=1)\n\nelse:\n    merged_df = merged_df.apply(lambda row: get_bgv(row, the_material),axis=1)\n\nmerged_df.head()'

In [10]:
materials = ["high_BN", "high_P", "high_InSe", "high_GaSe", "high_MoS2", "high_WSe2", "low_MoS2", "low_WSe2"]

# to_csv does not create missing folders, so make sure the output folder exists
os.makedirs(f"{final_data}/combined", exist_ok=True)

for dataset_name in materials:
    # Get reference structure (also written to disk) and its number of sites
    reference_structure, ref_num_sites = get_reference_structures(dataset_name)

    defects_df, description_df, the_material = get_df(dataset_name)

    # Clearly represent the defects in the description_df
    description_df = description_df.apply(dict_to_columns, axis=1).fillna(0)

    # Add description to defects df
    merged_df = defects_df.merge(description_df, on="descriptor_id", how="left")

    # Replace the specific defect sites with type of defect sites
    merged_df = merged_df.apply(lambda row: get_to_strata(row, ref_num_sites), axis=1)
    merged_df = merged_df.drop(columns=[col for col in merged_df.columns if "vacant_" in col or "sub_" in col])

    # Clean the merged data
    # Target
    if "2" not in dataset_name:
        # Datasets without "2" in their name carry *_majority / *_minority
        # columns, which are averaged before the band gap is computed
        merged_df = merged_df.apply(remove_majmin, axis=1)
        merged_df = merged_df.apply(lambda row: get_bgv(row, the_material), axis=1)

        merged_df = merged_df.drop(["defects", "descriptor_id", "homo_majority", "lumo_majority",
                                    "homo_lumo_gap_majority", "E_1_majority", "homo_minority",
                                    "lumo_minority", "homo_lumo_gap_minority", "E_1_minority",
                                    "homo", "lumo", "description", "energy", "fermi_level",
                                    "E_1", "cell", "total_mag", "base"], axis=1)

    else:
        merged_df = merged_df.apply(lambda row: get_bgv(row, the_material), axis=1)
        if "high" in dataset_name:
            merged_df = merged_df.drop(["defects", "descriptor_id", "homo_lumo_gap",
                                        "homo", "lumo", "description", "energy",
                                        "fermi_level", "E_1", "cell", "base"], axis=1)

        elif "low" in dataset_name:
            # The "low" datasets have additional columns to discard
            merged_df = merged_df.drop(["defects", "descriptor_id", "homo_lumo_gap",
                                        "band_gap", "homo", "lumo", "description",
                                        "pbc", "energy", "fermi_level", "E_1", "cell",
                                        "base", "energy_per_atom"], axis=1)

    # Write the new df as csv
    new_csv_file = f"{final_data}/combined/{dataset_name}.csv"
    merged_df.to_csv(new_csv_file, index=False)

  struct = parser.parse_structures(primitive=primitive)[0]
  writer: Any = CifWriter(self, **kwargs)
  writer: Any = CifWriter(self, **kwargs)
  struct = parser.parse_structures(primitive=primitive)[0]
  writer: Any = CifWriter(self, **kwargs)
  struct = parser.parse_structures(primitive=primitive)[0]
  writer: Any = CifWriter(self, **kwargs)
  struct = parser.parse_structures(primitive=primitive)[0]
  writer: Any = CifWriter(self, **kwargs)
  struct = parser.parse_structures(primitive=primitive)[0]
  writer: Any = CifWriter(self, **kwargs)
  struct = parser.parse_structures(primitive=primitive)[0]
  writer: Any = CifWriter(self, **kwargs)
  struct = parser.parse_structures(primitive=primitive)[0]
  writer: Any = CifWriter(self, **kwargs)


In [11]:
# Convert the values in the strata to integers to split effectively
def get_strata(merged_df):
    """Replace the textual ``to_strata`` column by an integer ``strata`` column.

    Each distinct ``to_strata`` value receives an integer code, numbered from
    0 in order of first appearance.

    Args:
        merged_df: DataFrame with a ``to_strata`` column; it is left untouched.

    Returns:
        A new DataFrame without ``to_strata`` and with the ``strata`` column
        added.
    """
    strata_labels = pd.unique(merged_df["to_strata"])
    label_to_code = {label: code for code, label in enumerate(strata_labels)}

    # assign() builds a new frame, so the caller's DataFrame is not modified
    return (
        merged_df
        .assign(strata=merged_df["to_strata"].map(label_to_code))
        .drop(columns=["to_strata"])
    )


# Gather the per-material tables written by the processing loop into one frame
all_df = [pd.read_csv(f"{final_data}/combined/{material}.csv") for material in materials]
merged = pd.concat(all_df, ignore_index=True)

# Get strata
comb_df = get_strata(merged)

comb_df.to_csv(f"{final_data}/combined/combined_data.csv", index=False)

# Keep the preview as the last expression of the cell so the notebook displays it
comb_df.head()
