In [1]:
# Install required libraries
!pip install tqdm

# Import necessary libraries
import os
import json
import random
from tqdm import tqdm
from google.colab import drive

# Mount Google Drive at /content/drive. The source and target dataset paths
# configured further down in this cell all live under this mount point.
drive.mount('/content/drive')

# Symbols of the 118 chemical elements in atomic-number order (H first, Og last),
# written one periodic-table period per line. encode_composition walks this list
# to build the composition vector, so a symbol's position here is its slot there.
element_symbols = (
    "H He "
    "Li Be B C N O F Ne "
    "Na Mg Al Si P S Cl Ar "
    "K Ca Sc Ti V Cr Mn Fe Co Ni Cu Zn Ga Ge As Se Br Kr "
    "Rb Sr Y Zr Nb Mo Tc Ru Rh Pd Ag Cd In Sn Sb Te I Xe "
    "Cs Ba La Ce Pr Nd Pm Sm Eu Gd Tb Dy Ho Er Tm Yb Lu "
    "Hf Ta W Re Os Ir Pt Au Hg Tl Pb Bi Po At Rn "
    "Fr Ra Ac Th Pa U Np Pu Am Cm Bk Cf Es Fm Md No "
    "Lr Rf Db Sg Bh Hs Mt Ds Rg Cn Nh Fl Mc Lv Ts Og"
).split()

# Define ordering classes and their one-hot encoding
ordering_classes = ["FM", "NM", "FiM", "AFM"]
ordering_one_hot = {
    "FM": [1, 0, 0, 0],
    "NM": [0, 1, 0, 0],
    "FiM": [0, 0, 1, 0],
    "AFM": [0, 0, 0, 1]
}

# Input dataset and output location, both on the mounted Drive
SOURCE_FILE = '/content/drive/MyDrive/Colab Notebooks/Datasets/RawData/labeled_empirical_data_f2_p1.json'
TARGET_DIR = '/content/drive/MyDrive/Colab Notebooks/Datasets/additional_datasets'
TARGET_FILE = 'labeled_empirical_data_f2_p2.json'
TARGET_PATH = os.path.join(TARGET_DIR, TARGET_FILE)

# Make sure the output directory is in place before anything is written to it,
# and report whether it had to be created.
if os.path.exists(TARGET_DIR):
    print(f"✅ Target directory already exists: {TARGET_DIR}")
else:
    os.makedirs(TARGET_DIR)
    print(f"✅ Created target directory: {TARGET_DIR}")

# Define a function to load JSON data
def load_json(file_path):
    """Read a JSON file and return its parsed content.

    The file is decoded explicitly as UTF-8 instead of with the platform's
    locale-dependent default, so the same file parses identically on every
    machine.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The parsed JSON value. If the file is missing, is not valid JSON, or
        any other error occurs (including a parsed value that has no length,
        because the record count is reported with ``len``), a message is
        printed and an empty list is returned instead.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        print(f"✅ Loaded {len(data)} records from {file_path}")
        return data
    except FileNotFoundError:
        print(f"❌ File not found: {file_path}")
        return []
    except json.JSONDecodeError:
        print(f"❌ JSON decode error in file: {file_path}")
        return []
    except Exception as e:
        # Best effort by design: callers treat an empty result as "nothing to do".
        print(f"❌ Unexpected error loading {file_path}: {e}")
        return []

# Define a function to save JSON data
def save_json(data, file_path):
    """Write ``data`` to ``file_path`` as JSON indented by four spaces.

    Nothing is returned. Any exception raised while opening the file, writing
    it, or reporting the record count is caught and reported with a printed
    message rather than propagated to the caller.

    Args:
        data: JSON-serialisable object; its ``len`` is reported on success.
        file_path: Destination path; an existing file is overwritten.
    """
    try:
        with open(file_path, 'w') as out_file:
            json.dump(data, out_file, indent=4)
        print(f"✅ Saved {len(data)} records to {file_path}")
    except Exception as err:
        print(f"❌ Error saving to {file_path}: {err}")

def encode_composition(composition_reduced):
    """Turn a reduced composition into a fixed-length vector of element amounts.

    Args:
        composition_reduced: Mapping of element symbol to amount, for example
            ``{"Fe": 2, "O": 3}``. ``None`` (a JSON ``null``) is treated as an
            empty composition.

    Returns:
        A new list with one entry per symbol in ``element_symbols``, in the
        same order. Elements that are absent from the mapping, or whose value
        is not a finite real number (strings, ``None``, booleans, NaN,
        infinities), are encoded as 0. Symbols in the mapping that are not in
        ``element_symbols`` are ignored.

    Raises:
        AssertionError: If the vector does not have exactly 118 entries, i.e.
            ``element_symbols`` does not hold 118 symbols.
    """
    import math  # local import: the notebook's import cell does not provide it

    if composition_reduced is None:
        composition_reduced = {}

    composition_encoded = []
    for element in element_symbols:
        count = composition_reduced.get(element, 0)
        # bool is a subclass of int, so it has to be ruled out explicitly.
        is_number = isinstance(count, (int, float)) and not isinstance(count, bool)
        # NaN / inf are floats too, but would poison every downstream computation.
        if not is_number or (isinstance(count, float) and not math.isfinite(count)):
            count = 0
        composition_encoded.append(count)

    # Assert that we have exactly 118 elements after encoding.
    assert len(composition_encoded) == 118, (
        f"composition_encoded length is {len(composition_encoded)}, "
        f"expected 118. Please verify your element_symbols list and data processing."
    )
    return composition_encoded

def encode_ordering(ordering):
    """Return the one-hot vector for an ordering label.

    Args:
        ordering: Label to encode; expected to be a key of ``ordering_one_hot``.

    Returns:
        A new list equal to the label's entry in ``ordering_one_hot``. A copy
        is returned so that callers can never modify the shared lookup table
        through the result. Labels that are not a key of the table, including
        unhashable values such as lists, produce ``[0, 0, 0, 0]`` and a printed
        warning.
    """
    try:
        known = ordering_one_hot.get(ordering)
    except TypeError:
        # An unhashable label (e.g. a list from malformed JSON) cannot be a key.
        known = None

    if known is None:
        print(f"⚠️ Unrecognized ordering class: {ordering}. Encoding as all zeros.")
        return [0, 0, 0, 0]
    return list(known)

# Process the single source file: load the raw records, attach the encoded
# composition and ordering to each one, and write the result to TARGET_PATH.
print(f"\n🔄 Processing file: {SOURCE_FILE}")
data = load_json(SOURCE_FILE)

if not data:
    # load_json returns an empty list on every failure, so nothing was
    # transformed and no completion banner is shown.
    print(f"❌ No data loaded from {SOURCE_FILE}, cannot process.")
else:
    processed_data = []
    for record in tqdm(data, desc=f"Processing {SOURCE_FILE}"):
        # Extract required fields
        formula_pretty = record.get('formula_pretty', "")
        composition_reduced = record.get('composition_reduced', {})
        density = record.get('density', None)
        ordering = record.get('ordering', "")

        # Encode composition. A record whose composition is null is encoded as
        # an empty composition instead of aborting the whole run; the original
        # value is still what gets stored in the output record.
        composition_encoded = encode_composition(composition_reduced or {})

        # Encode ordering
        ordering_encoded = encode_ordering(ordering)

        new_record = {
            "formula_pretty": formula_pretty,
            "composition_reduced": composition_reduced,
            "composition_encoded": composition_encoded,
            "density": density,
            "ordering": ordering,
            "ordering_encoded": ordering_encoded
        }

        processed_data.append(new_record)

    # Save the processed data to the target file (save_json reports its own
    # failures with a printed message).
    save_json(processed_data, TARGET_PATH)

    # Only announce completion when records were actually processed.
    print("\n🎉 One-Hot Encoding and Data Transformation Complete!")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Target directory already exists: /content/drive/MyDrive/Colab Notebooks/Datasets/additional_datasets

🔄 Processing file: /content/drive/MyDrive/Colab Notebooks/Datasets/RawData/labeled_empirical_data_f2_p1.json
✅ Loaded 48649 records from /content/drive/MyDrive/Colab Notebooks/Datasets/RawData/labeled_empirical_data_f2_p1.json


Processing /content/drive/MyDrive/Colab Notebooks/Datasets/RawData/labeled_empirical_data_f2_p1.json: 100%|██████████| 48649/48649 [00:01<00:00, 29620.04it/s]


✅ Saved 48649 records to /content/drive/MyDrive/Colab Notebooks/Datasets/additional_datasets/labeled_empirical_data_f2_p2.json

🎉 One-Hot Encoding and Data Transformation Complete!


In [2]:
# Install required libraries
!pip install tqdm

# Import necessary libraries
import os
import json
import numpy as np
from tqdm import tqdm
from google.colab import drive

# Mount Google Drive at /content/drive. The input file and the output
# directory configured further down in this cell live under this mount point.
drive.mount('/content/drive')

# Symbols of the 118 chemical elements in atomic-number order (H first, Og last),
# written one periodic-table period per line. encode_composition walks this list
# to build the composition vector, so a symbol's position here is its slot there.
element_symbols = (
    "H He "
    "Li Be B C N O F Ne "
    "Na Mg Al Si P S Cl Ar "
    "K Ca Sc Ti V Cr Mn Fe Co Ni Cu Zn Ga Ge As Se Br Kr "
    "Rb Sr Y Zr Nb Mo Tc Ru Rh Pd Ag Cd In Sn Sb Te I Xe "
    "Cs Ba La Ce Pr Nd Pm Sm Eu Gd Tb Dy Ho Er Tm Yb Lu "
    "Hf Ta W Re Os Ir Pt Au Hg Tl Pb Bi Po At Rn "
    "Fr Ra Ac Th Pa U Np Pu Am Cm Bk Cf Es Fm Md No "
    "Lr Rf Db Sg Bh Hs Mt Ds Rg Cn Nh Fl Mc Lv Ts Og"
).split()

# One-hot table for the ordering labels: the label at position i of the tuple
# below maps to a 4-element vector with a 1 in column i and 0 everywhere else.
ordering_one_hot = {
    label: [1 if column == row else 0 for column in range(4)]
    for row, label in enumerate(("FM", "NM", "FiM", "AFM"))
}

# Input dataset and the directory / files the NumPy arrays are written to
SOURCE_FILE = '/content/drive/MyDrive/Colab Notebooks/Datasets/RawData/labeled_empirical_data_f2_p1.json'
PHASE_3_DIR = '/content/drive/MyDrive/Colab Notebooks/Datasets/Phase 3 Data'
X_OUTPUT_FILE = os.path.join(PHASE_3_DIR, 'X_train_empirical.npy')
Y_OUTPUT_FILE = os.path.join(PHASE_3_DIR, 'Y_train_empirical.npy')

# Make sure the output directory is in place before anything is written to it,
# and report whether it had to be created.
if os.path.exists(PHASE_3_DIR):
    print(f"✅ Directory already exists: {PHASE_3_DIR}")
else:
    os.makedirs(PHASE_3_DIR)
    print(f"✅ Created directory: {PHASE_3_DIR}")

# Define a function to load JSON data
def load_json(file_path):
    """Read a JSON file and return its parsed content.

    The file is decoded explicitly as UTF-8 instead of with the platform's
    locale-dependent default, so the same file parses identically on every
    machine.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The parsed JSON value. If the file is missing, is not valid JSON, or
        any other error occurs (including a parsed value that has no length,
        because the record count is reported with ``len``), a message is
        printed and an empty list is returned instead.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        print(f"✅ Loaded {len(data)} records from {file_path}")
        return data
    except FileNotFoundError:
        print(f"❌ File not found: {file_path}")
        return []
    except json.JSONDecodeError:
        print(f"❌ JSON decode error in file: {file_path}")
        return []
    except Exception as e:
        # Best effort by design: callers treat an empty result as "nothing to do".
        print(f"❌ Unexpected error loading {file_path}: {e}")
        return []

def encode_composition(composition_reduced):
    """Turn a reduced composition into a fixed-length vector of element amounts.

    Args:
        composition_reduced: Mapping of element symbol to amount, for example
            ``{"Fe": 2, "O": 3}``. ``None`` (a JSON ``null``) is treated as an
            empty composition.

    Returns:
        A new list with one entry per symbol in ``element_symbols``, in the
        same order. Elements that are absent from the mapping, or whose value
        is not a finite real number (strings, ``None``, booleans, NaN,
        infinities), are encoded as 0. Symbols in the mapping that are not in
        ``element_symbols`` are ignored.

    Raises:
        AssertionError: If the vector does not have exactly 118 entries, i.e.
            ``element_symbols`` does not hold 118 symbols.
    """
    import math  # local import: the notebook's import cell does not provide it

    if composition_reduced is None:
        composition_reduced = {}

    composition_encoded = []
    for element in element_symbols:
        count = composition_reduced.get(element, 0)
        # bool is a subclass of int, so it has to be ruled out explicitly.
        is_number = isinstance(count, (int, float)) and not isinstance(count, bool)
        # NaN / inf are floats too, but would poison the saved feature matrix.
        if not is_number or (isinstance(count, float) and not math.isfinite(count)):
            count = 0
        composition_encoded.append(count)

    # Verify length is correct
    assert len(composition_encoded) == 118, (
        f"composition_encoded length is {len(composition_encoded)}, expected 118."
    )
    return composition_encoded

def encode_ordering(ordering):
    """Return the one-hot vector for an ordering label.

    Args:
        ordering: Label to encode; expected to be a key of ``ordering_one_hot``.

    Returns:
        A new list equal to the label's entry in ``ordering_one_hot``. A copy
        is returned so that callers can never modify the shared lookup table
        through the result. Labels that are not a key of the table, including
        unhashable values such as lists, produce ``[0, 0, 0, 0]``.
    """
    try:
        encoded = ordering_one_hot.get(ordering)
    except TypeError:
        # An unhashable label (e.g. a list from malformed JSON) cannot be a key.
        encoded = None
    return [0, 0, 0, 0] if encoded is None else list(encoded)

# Load the raw records and turn them into a feature matrix X (118 element
# amounts followed by the density, 119 columns) and a label matrix Y (one-hot
# ordering, 4 columns), then save both as .npy files.
data = load_json(SOURCE_FILE)

if not data:
    print(f"❌ No data loaded from {SOURCE_FILE}, cannot process.")
else:
    X_list = []
    Y_list = []
    imputed_density_count = 0   # records whose density was replaced by 0.0
    unlabeled_count = 0         # records whose label row is all zeros

    for record in tqdm(data, desc=f"Processing {SOURCE_FILE}"):
        composition_reduced = record.get('composition_reduced', {})
        ordering = record.get('ordering', "")
        density = record.get('density')  # None when missing; imputed below

        # Encode composition and ordering. A null composition is encoded as an
        # empty one instead of aborting the whole run.
        composition_encoded = encode_composition(composition_reduced or {})
        ordering_encoded = encode_ordering(ordering)

        # A usable density is a finite real number. bool is a subclass of int
        # and NaN/inf are floats, so both have to be ruled out explicitly.
        density_is_valid = (
            isinstance(density, (int, float))
            and not isinstance(density, bool)
            and (isinstance(density, int) or bool(np.isfinite(density)))
        )
        if not density_is_valid:
            # Missing or invalid densities are imputed with 0.0
            density = 0.0
            imputed_density_count += 1

        # 118 element amounts + density
        feature_vector = composition_encoded + [density]
        assert len(feature_vector) == 119, (
            f"Feature vector length is {len(feature_vector)}, expected 119."
        )

        if not any(ordering_encoded):
            unlabeled_count += 1

        # Append to lists
        X_list.append(feature_vector)
        Y_list.append(ordering_encoded)

    # Surface data-quality problems that would otherwise go unnoticed
    if imputed_density_count:
        print(f"⚠️ {imputed_density_count} records had a missing or invalid density; 0.0 was used.")
    if unlabeled_count:
        print(f"⚠️ {unlabeled_count} records had an unrecognized ordering; their label row is all zeros.")

    # Convert to NumPy arrays
    X_array = np.array(X_list, dtype=np.float32)
    Y_array = np.array(Y_list, dtype=np.float32)

    # Save the arrays
    np.save(X_OUTPUT_FILE, X_array)
    np.save(Y_OUTPUT_FILE, Y_array)

    print(f"✅ Saved X to {X_OUTPUT_FILE} with shape {X_array.shape}")
    print(f"✅ Saved Y to {Y_OUTPUT_FILE} with shape {Y_array.shape}")
    print("\n🎉 Conversion to NumPy arrays and saving complete!")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Directory already exists: /content/drive/MyDrive/Colab Notebooks/Datasets/Phase 3 Data
✅ Loaded 48649 records from /content/drive/MyDrive/Colab Notebooks/Datasets/RawData/labeled_empirical_data_f2_p1.json


Processing /content/drive/MyDrive/Colab Notebooks/Datasets/RawData/labeled_empirical_data_f2_p1.json: 100%|██████████| 48649/48649 [00:03<00:00, 13210.56it/s]


✅ Saved X to /content/drive/MyDrive/Colab Notebooks/Datasets/Phase 3 Data/X_train_empirical.npy with shape (48649, 119)
✅ Saved Y to /content/drive/MyDrive/Colab Notebooks/Datasets/Phase 3 Data/Y_train_empirical.npy with shape (48649, 4)

🎉 Conversion to NumPy arrays and saving complete!
