<a href="https://colab.research.google.com/github/AdamKimhub/Msproject1/blob/colab/forfinal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import sys

if "google.colab" in sys.modules:
    # Mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    dataset_path = '/content/drive/My Drive/dataset'

    # Install required packages
    !pip install pymatgen torch_geometric
    import torch
    from torch_geometric.data import Data

else:
    dataset_path = 'dataset'

In [None]:
from pathlib import Path
import ast
import numpy as np
import pandas as pd
from pymatgen.core import Structure, PeriodicSite, DummySpecie
from pymatgen.analysis.local_env import MinimumDistanceNN

import to_graph

## High-Concentration Defect Datasets

In [None]:
# Read the per-material CSV files and stack them into a single frame
# with a fresh 0..n-1 index.
high_dataset = ["high_BN", "high_GaSe", "high_InSe", "high_MoS2", "high_P", "high_WSe2"]
to_merge = []
for high_name in high_dataset:
    to_merge.append(pd.read_csv(f"{dataset_path}/combined/{high_name}.csv"))

high_df = pd.concat(to_merge, ignore_index=True)

# Work on a copy so high_df itself stays untouched.
high_copy = high_df.copy()

In [None]:
# Columns left out of the correlation analysis.
high_excluded = ["_id", "base", "cell", "dataset_material", "fermi_level", "total_mag"]

# Drop them, then compute the pairwise correlation of the remaining columns.
high_copy = high_copy.drop(columns=high_excluded)
high_copy = high_copy.corr()
high_copy

## Low-Concentration Defect Datasets

In [None]:
# Read the per-material CSV files and stack them into a single frame
# with a fresh 0..n-1 index.
low_dataset = ["low_MoS2", "low_WSe2"]
to_merge = []
for low_name in low_dataset:
    to_merge.append(pd.read_csv(f"{dataset_path}/combined/{low_name}.csv"))

low_df = pd.concat(to_merge, ignore_index=True)

# Work on a copy so low_df itself stays untouched.
low_copy = low_df.copy()

In [None]:
# Columns left out of the correlation analysis.
low_excluded = ["_id", "base", "cell", "dataset_material", "fermi_level", "total_mag"]

# Drop them, then compute the pairwise correlation of the remaining columns.
low_copy = low_copy.drop(columns=low_excluded)
low_copy = low_copy.corr()
low_copy

## Combined High- and Low-Concentration Defect Datasets

In [None]:
# Load the combined table from the dataset directory.
combined_csv = f"{dataset_path}/combined/combined.csv"
comb_df = pd.read_csv(combined_csv)

In [None]:
# Columns left out of the correlation analysis.
comb_excluded = ["_id", "base", "cell", "dataset_material", "fermi_level", "total_mag"]

# Work on a copy so comb_df keeps every column for the later cells.
comb_copy = comb_df.copy()
comb_copy = comb_copy.drop(columns=comb_excluded)
comb_copy = comb_copy.corr()
comb_copy

## Data to graphs

In [None]:
# Add strata
datsets = ["high_BN", "high_GaSe", "high_InSe", "high_MoS2", "high_P", "high_WSe2","low_MoS2", "low_WSe2"]

# The supercell sizes all come from the same CSV, so read it once here
# instead of once per dataset inside the loop.
cell_source = pd.read_csv(f"{dataset_path}/initial_structures.csv")

# Maps each dataset name to the number of sites in its reference supercell.
ref_sites_dict = {}

for datset in datsets:
    # The base material is the part after the underscore, e.g. "high_MoS2" -> "MoS2".
    mat_split = datset.split('_')
    the_base = mat_split[1]

    # Get reference structure: load the unit cell and expand it with the
    # cell size recorded for this base (stored as a Python literal string).
    ref_unit_cell = Structure.from_file(f"{dataset_path}/{datset}/{the_base}.cif")
    the_cell = cell_source.loc[cell_source["base"] == the_base, "cell_size"].iloc[0]
    reference_structure = ref_unit_cell.make_supercell(ast.literal_eval(the_cell))

    # Get number of ref sites
    ref_sites_dict[datset] = reference_structure.num_sites


def get_conc(row, ref_sites_dict):
    """Add the defect concentration and the stratification label to a row.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            ``dataset_material``, ``defect_sites`` and ``base``. It is
            modified in place.
        ref_sites_dict: Maps a ``dataset_material`` value to the number of
            sites in that dataset's reference structure.

    Returns:
        The same ``row`` with two extra entries: ``defect_concentration``
        (defect sites divided by reference sites, rounded to 5 decimals) and
        ``to_strata`` (``"<base>_<defect_concentration>"``).
    """
    site_count = ref_sites_dict[row["dataset_material"]]

    # Fraction of reference sites that are defective.
    row["defect_concentration"] = round(row["defect_sites"] / site_count, 5)

    # Label combining base material and concentration, used for stratifying.
    row["to_strata"] = f"{row['base']}_{row['defect_concentration']}"
    return row

# Attach the defect concentration and strata label to every row.
comb_df = comb_df.apply(get_conc, axis=1, args=(ref_sites_dict,))

# Number the distinct labels in order of first appearance.
unique_values = pd.unique(comb_df["to_strata"])
mapping = dict(zip(unique_values, range(len(unique_values))))

# Integer strata id per row, used for stratified splitting.
comb_df["strata"] = comb_df["to_strata"].map(mapping)

In [None]:
# Split the table before building graphs: 70 % for training, and the
# remaining 30 % halved into validation and test. Both splits are stratified
# on the `strata` column and share one seed.
from sklearn.model_selection import train_test_split

split_seed = 42

train_set, test_set = train_test_split(
    comb_df,
    test_size=0.3,
    random_state=split_seed,
    stratify=comb_df['strata'],
)

val_set, testing_set = train_test_split(
    test_set,
    test_size=0.5,
    random_state=split_seed,
    stratify=test_set['strata'],
)

In [None]:

def graphy(row):
    """Build a torch_geometric ``Data`` graph for one dataset row.

    Loads the row's defective structure and the matching reference unit cell
    from CIF files under ``dataset_path``, expands the reference with the
    supercell spec parsed from ``row['cell']``, and passes both to
    ``to_graph`` to obtain nodes, edges and edge features.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing
            ``dataset_material``, ``_id``, ``base``, ``cell``, the global
            attribute columns listed in the body and ``band_gap_value``.

    Returns:
        A ``Data`` object with ``x`` (nodes), ``edge_index``, ``edge_attr``,
        ``u`` (global features) and ``y`` (the ``band_gap_value`` target).
    """
    # Structure containing the defects for this sample.
    defective_structure = Structure.from_file(
        Path(f"{dataset_path}/{row['dataset_material']}/cifs/{row['_id']}.cif")
    )

    # Reference: the base unit cell expanded to the supercell given in the row.
    ref_unit_cell = Structure.from_file(
        Path(f"{dataset_path}/{row['dataset_material']}/{row['base']}.cif")
    )
    reference_structure = ref_unit_cell.make_supercell(ast.literal_eval(row['cell']))

    defects_structure = to_graph.get_defects_structure(defective_structure, reference_structure)
    nodes, edges, edge_features = to_graph.get_nodes_edges(defects_structure)

    # Row-level quantities stored on the graph as global features.
    global_columns = (
        "energy", "fermi_level", "total_mag", "formation_energy",
        "energy_per_atom", "E_1", "vacancy_sites", "substitution_sites",
        "defect_sites", "defect_concentration",
    )
    global_values = [row[column] for column in global_columns]

    # Prediction target.
    target_values = [row["band_gap_value"]]

    return Data(
        x=torch.tensor(nodes, dtype=torch.float),
        edge_index=torch.tensor(edges, dtype=torch.long),
        edge_attr=torch.tensor(edge_features, dtype=torch.float),
        u=torch.tensor(global_values, dtype=torch.float),
        y=torch.tensor(target_values, dtype=torch.float),
    )

# samplex = samplex_df.apply(lambda row: graphy(row), axis = 1).tolist()

# Save the data before splitting
# torch.save(samplex, f"{dataset_path}/combined/all_graphs.pt")

# After splitting, convert each split row by row into graph objects and
# save one file per split. Each split is written before the next is built.
training = train_set.apply(graphy, axis=1).tolist()
torch.save(training, f"{dataset_path}/combined/training.pt")

validating = val_set.apply(graphy, axis=1).tolist()
torch.save(validating, f"{dataset_path}/combined/validating.pt")

testing = testing_set.apply(graphy, axis=1).tolist()
torch.save(testing, f"{dataset_path}/combined/testing.pt")