<a href="https://colab.research.google.com/github/AdamKimhub/Msproject1/blob/main/forfinal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import sys

if "google.colab" in sys.modules:
    # Mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    original_data = '/content/drive/My Drive/original_dataset'
    final_data = '/content/drive/My Drive/final_dataset'

    # Install required packages
    !pip install pymatgen torch_geometric mp_api
    import torch
    from torch_geometric.data import Data

else:
    original_data = "original_dataset"
    final_data = "final_dataset"

    

In [2]:
import pandas as pd
from pymatgen.core import Structure
import to_graph
import combine

  from .autonotebook import tqdm as notebook_tqdm


## Data to graphs

In [None]:
# Combine all the dataframes
# Choose the materials you wish to train the model on.
materials = [
    "high_BN",
    "high_P",
    "high_InSe",
    "high_GaSe",
    "high_MoS2",
    "high_WSe2",
    "low_MoS2",
    "low_WSe2",
]

# One CSV per material, all read from the "combined" folder of final_data.
csv_paths = [f"{final_data}/combined/{material}.csv" for material in materials]
all_df = list(map(pd.read_csv, csv_paths))

# Stack the per-material frames; ignore_index=True renumbers the rows 0..n-1.
merged = pd.concat(all_df, ignore_index=True)

# Get strata (the split cell below stratifies on comb_df['strata']).
comb_df = combine.get_strata(merged)
comb_df.head()

In [None]:
# Split the data
from sklearn.model_selection import train_test_split

# First cut: 65% training / 35% hold-out, stratified on the 'strata' column.
train_set, holdout_set = train_test_split(
    comb_df,
    test_size=0.35,
    stratify=comb_df['strata'],
    random_state=42,
)

# Second cut: the hold-out is halved into test and validation sets
# (17.5% of the data each). This split is not stratified.
test_set, val_set = train_test_split(holdout_set, test_size=0.5, random_state=42)

In [None]:
# Create graph representation of the structures
def graphy(row):
    """Build a torch_geometric ``Data`` graph for one row of a dataset split.

    Args:
        row: A dataframe row providing ``dataset_material``, ``_id`` and
            ``band_gap_value``.

    Returns:
        A ``Data`` object with node features ``x``, ``edge_index``,
        ``edge_attr``, the extra attributes ``the_ids`` and ``the_ratios``,
        and the regression target ``y`` (this row's band gap, shape ``(1,)``).
    """
    # Pull the keys out first: reusing double quotes inside an f-string
    # replacement field is a SyntaxError before Python 3.12.
    material = row["dataset_material"]
    structure_id = row["_id"]

    defective_structure = Structure.from_file(f"{original_data}/{material}/cifs/{structure_id}.cif")
    reference_structure = Structure.from_file(f"{final_data}/ref_cifs/{material}.cif")

    defects_only_structure = to_graph.get_defects_structure(defective_structure, reference_structure)

    # NOTE(review): the array shapes returned by get_c_graph are not visible
    # here; edges is presumably laid out as (2, num_edges) -- confirm in to_graph.
    nodes, edges, edge_features, ids, ratios = to_graph.get_c_graph(defects_only_structure)

    # The label must come from this row. Reading the column from train_set
    # attached every training band gap to each graph, including the
    # validation and test graphs.
    target = float(row["band_gap_value"])

    the_data = Data(
        x=torch.tensor(nodes, dtype=torch.float),
        edge_index=torch.tensor(edges, dtype=torch.long),
        edge_attr=torch.tensor(edge_features, dtype=torch.float),
        the_ids=torch.tensor(ids, dtype=torch.long).unsqueeze(0),
        the_ratios=torch.tensor(ratios, dtype=torch.float).unsqueeze(0),
        y=torch.tensor(target, dtype=torch.float).unsqueeze(0),
    )
    return the_data

# Turn each dataset into graph data and save it
def _rows_to_graphs(frame):
    """Apply ``graphy`` to every row of ``frame`` and return the results as a list."""
    return frame.apply(graphy, axis=1).tolist()


training = _rows_to_graphs(train_set)
torch.save(training, f"{final_data}/combined/training.pt")

validating = _rows_to_graphs(val_set)
torch.save(validating, f"{final_data}/combined/validating.pt")

testing = _rows_to_graphs(test_set)
torch.save(testing, f"{final_data}/combined/testing.pt")