# Importing The Data from The Material Project Database using their API

In [None]:
# Install required libraries
!pip install tqdm

import requests
import json
from tqdm import tqdm
from google.colab import files

# Your Materials Project API key
API_KEY = 'UUNzWnc9qbnzxZxJ3UwzXN4On33FSMDF'

# Base URL for the Materials Project API
BASE_URL = 'https://api.materialsproject.org/materials/summary/'

# Headers with the API key
headers = {
    'X-API-KEY': API_KEY
}

def fetch_data(theoretical, max_pages, output_filename):
    """
    Fetch data from the Materials Project API.

    Parameters:
    - theoretical (bool): Whether to fetch theoretical data.
    - max_pages (int): Number of pages to fetch.
    - output_filename (str): Filename to save the JSON data.
    """
    all_data = []
    for page in tqdm(range(1, max_pages + 1), desc=f'Fetching {"theoretical" if theoretical else "empirical"} data'):
        params = {
            'theoretical': str(theoretical).lower(),
            'deprecated': 'false',
            '_page': page,
            '_per_page': 1000,
            '_skip': 0,
            '_limit': 1000,
            '_fields': 'formula_pretty,composition_reduced,density,ordering',
            '_all_fields': 'false',
            'license': 'BY-C'
        }

        try:
            response = requests.get(BASE_URL, headers=headers, params=params)
            response.raise_for_status()  # Raise an error for bad status codes
            json_response = response.json()

            # Ensure that 'data' is in the response
            if 'data' not in json_response:
                print(f"No 'data' field found in response on page {page}. Response content:")
                print(json_response)
                break

            data = json_response['data']

            if not data:
                print(f"No data found on page {page}. Ending fetch.")
                break

            all_data.extend(data)
        except requests.exceptions.HTTPError as http_err:
            print(f"HTTP error occurred on page {page}: {http_err}")
            break
        except Exception as err:
            print(f"An error occurred on page {page}: {err}")
            break

    # Save the collected data to a JSON file
    with open(output_filename, 'w') as f:
        json.dump(all_data, f, indent=4)
    print(f"Saved {len(all_data)} records to {output_filename}")

# Fetch empirical data (theoretical=false) up to page 49
fetch_data(theoretical=False, max_pages=49, output_filename='empirical_data_f2_p1.json')

# Fetch theoretical data (theoretical=true) up to page 105
fetch_data(theoretical=True, max_pages=105, output_filename='theoretical_data_f2_p1.json')

# Optional: Download the JSON files to your local machine
# Commented out to prevent automatic downloads during execution
# files.download('empirical_data_f2_p1.json')
# files.download('theoretical_data_f2_p1.json')




Fetching empirical data: 100%|██████████| 49/49 [00:57<00:00,  1.17s/it]


Saved 48884 records to empirical_data_f2_p1.json


Fetching theoretical data: 100%|██████████| 105/105 [01:45<00:00,  1.00s/it]


Saved 104351 records to theoretical_data_f2_p1.json


# Erasing unlabeled data or data labeled unknown

In [None]:
# Install required libraries (if not already installed)
!pip install tqdm

import json
from tqdm import tqdm
from google.colab import files




In [None]:
def process_json_file(input_filename, labeled_output_filename, unlabeled_list):
    """
    Processes a JSON file to separate labeled and unlabeled data points based on the 'ordering' field.

    Parameters:
    - input_filename (str): Path to the input JSON file.
    - labeled_output_filename (str): Path to save the labeled data.
    - unlabeled_list (list): A list to append unlabeled data points.

    Returns:
    - int: Number of data points processed.
    - int: Number of labeled data points.
    - int: Number of unlabeled data points.
    """
    with open(input_filename, 'r') as f:
        data = json.load(f)

    labeled_data = []
    unlabeled_data = []

    for entry in tqdm(data, desc=f'Processing {input_filename}'):
        ordering = entry.get('ordering', None)

        # Check if 'ordering' is "unknown", "None", or None (null)
        if ordering is None or ordering == "unknown" or ordering == "None":
            # Set 'ordering' to "unknown"
            entry['ordering'] = "unknown"
            unlabeled_data.append(entry)
        else:
            labeled_data.append(entry)

    # Append unlabeled data to the main unlabeled_list
    unlabeled_list.extend(unlabeled_data)

    # Save the labeled data to the labeled_output_filename
    with open(labeled_output_filename, 'w') as f:
        json.dump(labeled_data, f, indent=4)

    print(f"Processed {len(data)} records from {input_filename}:")
    print(f" - Labeled records: {len(labeled_data)}")
    print(f" - Unlabeled records: {len(unlabeled_data)}\n")

    return len(data), len(labeled_data), len(unlabeled_data)


In [None]:
# Initialize an empty list to hold all unlabeled data
unlabeled_f2_p1 = []


In [None]:
# Define file names
empirical_input = 'empirical_data_f2_p1.json'
empirical_labeled_output = 'labeled_empirical_data_f2_p1.json'

# Process the empirical data
process_json_file(empirical_input, empirical_labeled_output, unlabeled_f2_p1)


FileNotFoundError: [Errno 2] No such file or directory: 'empirical_data_f2_p1.json'

# Dividing the Data into Training, Training_Dev, Dev, and Testing datasets

In [None]:
# Install required libraries (if not already installed)
!pip install tqdm

# Import necessary libraries
import os
import json
import random
import shutil
from tqdm import tqdm
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Define source and target directories
SOURCE_DIR = '/content/drive/MyDrive/Colab Notebooks/Datasets/RawData'
TARGET_DIR = '/content/drive/MyDrive/Colab Notebooks/Datasets/Phase 1 Data'

# Create the target directory if it doesn't exist
if not os.path.exists(TARGET_DIR):
    os.makedirs(TARGET_DIR)
    print(f"✅ Created target directory: {TARGET_DIR}")
else:
    print(f"✅ Target directory already exists: {TARGET_DIR}")

# Define filenames
# Source files
LABELED_EMPIRICAL = 'labeled_empirical_data_f2_p1.json'
LABELED_THEORETICAL = 'labeled_theoretical_data_f2_p1.json'

# Intermediate files
TRAINING_LABELED_EMPIRICAL = 'training_labeled_empirical_data_f2_p1.json'
LABELED_EMPIRICAL_THEORETICAL = 'labeled_empirical_theoretical_data_f2_p1.json'

# Output files
DEV_FILE = 'dev_f2_p1.json'
TEST_FILE = 'test_f2_p1.json'
TRAINING_DEV_FILE = 'training_dev_f2_p1.json'
TRAINING_FILE = 'training_f2_p1.json'

# Full paths for source files
empirical_path = os.path.join(SOURCE_DIR, LABELED_EMPIRICAL)
theoretical_path = os.path.join(SOURCE_DIR, LABELED_THEORETICAL)

# Full paths for intermediate and output files in target directory
training_labeled_empirical_path = os.path.join(TARGET_DIR, TRAINING_LABELED_EMPIRICAL)
labeled_empirical_theoretical_path = os.path.join(TARGET_DIR, LABELED_EMPIRICAL_THEORETICAL)

dev_path = os.path.join(TARGET_DIR, DEV_FILE)
test_path = os.path.join(TARGET_DIR, TEST_FILE)
training_dev_path = os.path.join(TARGET_DIR, TRAINING_DEV_FILE)
training_path = os.path.join(TARGET_DIR, TRAINING_FILE)

# Function to load JSON data
def load_json(file_path):
    try:
        with open(file_path, 'r') as f:
            data = json.load(f)
        print(f"✅ Loaded {len(data)} records from {file_path}")
        return data
    except FileNotFoundError:
        print(f"❌ File not found: {file_path}")
        return []
    except json.JSONDecodeError:
        print(f"❌ JSON decode error in file: {file_path}")
        return []
    except Exception as e:
        print(f"❌ Unexpected error loading {file_path}: {e}")
        return []

# Function to save JSON data
def save_json(data, file_path):
    try:
        with open(file_path, 'w') as f:
            json.dump(data, f, indent=4)
        print(f"✅ Saved {len(data)} records to {file_path}")
    except Exception as e:
        print(f"❌ Error saving to {file_path}: {e}")

# Function to randomly sample data
def sample_data(data, sample_size):
    return random.sample(data, sample_size)

# Function to split data into subsets
def split_data(data, dev_size, test_size):
    random.shuffle(data)
    dev_data = data[:dev_size]
    test_data = data[dev_size:dev_size + test_size]
    training_data = data[dev_size + test_size:]
    return dev_data, test_data, training_data

# Function to combine and shuffle datasets
def combine_and_shuffle(data1, data2):
    combined = data1 + data2
    random.shuffle(combined)
    return combined

# Seed for reproducibility
random.seed(42)

# Step 1: Load labeled_empirical_data_f2_p1.json
empirical_data = load_json(empirical_path)
if not empirical_data:
    raise ValueError("Empirical data could not be loaded. Please check the file and try again.")

# Check if there are at least 15,000 records
if len(empirical_data) < 15000:
    raise ValueError(f"Insufficient data in {LABELED_EMPIRICAL}. Required: 15,000, Available: {len(empirical_data)}")

# Step 2: Split empirical data into Dev, Test, and Training
print("\n🔄 Splitting empirical data into Dev, Test, and Training sets...")
dev_data, test_data, remaining_empirical = split_data(empirical_data, 7500, 7500)

# Save Dev and Test sets
save_json(dev_data, dev_path)
save_json(test_data, test_path)

# Save the remaining data to training_labeled_empirical_data_f2_p1.json
save_json(remaining_empirical, training_labeled_empirical_path)

# Step 3: Load labeled_theoretical_data_f2_p1.json
theoretical_data = load_json(theoretical_path)
if not theoretical_data:
    raise ValueError("Theoretical data could not be loaded. Please check the file and try again.")

# Step 4: Combine remaining empirical data with theoretical data
print("\n🔄 Combining remaining empirical data with theoretical data...")
combined_empirical_theoretical = combine_and_shuffle(remaining_empirical, theoretical_data)

# Save the combined data
save_json(combined_empirical_theoretical, labeled_empirical_theoretical_path)

# Step 5: Split combined data into Training Dev and Training sets
print("\n🔄 Splitting combined data into Training Dev and Training sets...")
if len(combined_empirical_theoretical) < 7500:
    raise ValueError(f"Insufficient combined data for Training Dev set. Required: 7,500, Available: {len(combined_empirical_theoretical)}")

training_dev_data = sample_data(combined_empirical_theoretical, 7500)
training_f2_p1_data = [record for record in combined_empirical_theoretical if record not in training_dev_data]

# Save Training Dev and Training sets
save_json(training_dev_data, training_dev_path)
save_json(training_f2_p1_data, training_path)

print("\n🎉 Data segmentation and organization complete!")

# Optional: Verify the results
def verify_counts():
    print("\n📊 Verification of dataset sizes:")
    print(f"Dev set: {len(dev_data)} records")
    print(f"Test set: {len(test_data)} records")
    print(f"Training (Empirical) set: {len(remaining_empirical)} records")
    print(f"Combined Empirical & Theoretical set: {len(combined_empirical_theoretical)} records")
    print(f"Training Dev set: {len(training_dev_data)} records")
    print(f"Training set: {len(training_f2_p1_data)} records")

verify_counts()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Target directory already exists: /content/drive/MyDrive/Colab Notebooks/Datasets/Phase 1 Data
✅ Loaded 48649 records from /content/drive/MyDrive/Colab Notebooks/Datasets/RawData/labeled_empirical_data_f2_p1.json

🔄 Splitting empirical data into Dev, Test, and Training sets...
✅ Saved 7500 records to /content/drive/MyDrive/Colab Notebooks/Datasets/Phase 1 Data/dev_f2_p1.json
✅ Saved 7500 records to /content/drive/MyDrive/Colab Notebooks/Datasets/Phase 1 Data/test_f2_p1.json
✅ Saved 33649 records to /content/drive/MyDrive/Colab Notebooks/Datasets/Phase 1 Data/training_labeled_empirical_data_f2_p1.json
✅ Loaded 103533 records from /content/drive/MyDrive/Colab Notebooks/Datasets/RawData/labeled_theoretical_data_f2_p1.json

🔄 Combining remaining empirical data with theoretical data...
✅ Saved 137182 records to /content/drive/MyDrive/Colab Notebooks/Datasets/Phas

# Using One_Hot encoding

In [3]:
# Install required libraries
!pip install tqdm

# Import necessary libraries
import os
import json
import random
from tqdm import tqdm
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

element_symbols = [
    "H", "He",
    "Li", "Be", "B", "C", "N", "O", "F", "Ne",
    "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar",
    "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni",
    "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr",
    "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd",
    "Ag", "Cd", "In", "Sn", "Sb", "Te", "I", "Xe",
    "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu",
    "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu",
    "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg",
    "Tl", "Pb", "Bi", "Po", "At", "Rn",
    "Fr", "Ra", "Ac", "Th", "Pa", "U", "Np", "Pu",
    "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No",
    "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds",
    "Rg", "Cn", "Nh", "Fl", "Mc", "Lv", "Ts", "Og"
]


# Define ordering classes and their one-hot encoding
ordering_classes = ["FM", "NM", "FiM", "AFM"]
ordering_one_hot = {
    "FM": [1, 0, 0, 0],
    "NM": [0, 1, 0, 0],
    "FiM": [0, 0, 1, 0],
    "AFM": [0, 0, 0, 1]
}

# Define source and target directories
SOURCE_DIR = '/content/drive/MyDrive/Colab Notebooks/Datasets/Phase 1 Data'
TARGET_DIR = '/content/drive/MyDrive/Colab Notebooks/Datasets/Phase 2 Data'

# Create the target directory if it doesn't exist
if not os.path.exists(TARGET_DIR):
    os.makedirs(TARGET_DIR)
    print(f"✅ Created target directory: {TARGET_DIR}")
else:
    print(f"✅ Target directory already exists: {TARGET_DIR}")

# Define the four source files
source_files = [
    'dev_f2_p1.json',
    'test_f2_p1.json',
    'training_f2_p1.json',
    'training_dev_f2_p1.json'
]

# Define a function to load JSON data
def load_json(file_path):
    try:
        with open(file_path, 'r') as f:
            data = json.load(f)
        print(f"✅ Loaded {len(data)} records from {file_path}")
        return data
    except FileNotFoundError:
        print(f"❌ File not found: {file_path}")
        return []
    except json.JSONDecodeError:
        print(f"❌ JSON decode error in file: {file_path}")
        return []
    except Exception as e:
        print(f"❌ Unexpected error loading {file_path}: {e}")
        return []

# Define a function to save JSON data
def save_json(data, file_path):
    try:
        with open(file_path, 'w') as f:
            json.dump(data, f, indent=4)
        print(f"✅ Saved {len(data)} records to {file_path}")
    except Exception as e:
        print(f"❌ Error saving to {file_path}: {e}")

def encode_composition(composition_reduced):
    composition_encoded = []
    for element in element_symbols:
        count = composition_reduced.get(element, 0)
        # If count is not an integer or float, treat it as 0.
        # Removed the print warning to avoid console clutter.
        if not isinstance(count, (int, float)):
            count = 0
        composition_encoded.append(count)

    # Assert that we have exactly 118 elements after encoding.
    assert len(composition_encoded) == 118, (
        f"composition_encoded length is {len(composition_encoded)}, "
        f"expected 118. Please verify your element_symbols list and data processing."
    )

    return composition_encoded

# Define a function to encode ordering
def encode_ordering(ordering):
    encoded = ordering_one_hot.get(ordering, [0, 0, 0, 0])
    if encoded == [0, 0, 0, 0]:
        print(f"⚠️ Unrecognized ordering class: {ordering}. Encoding as all zeros.")
    return encoded

# Process each source file
for source_file in source_files:
    source_path = os.path.join(SOURCE_DIR, source_file)
    # Create the target file name by replacing 'p1' with 'p2'
    target_file = source_file.replace('p1', 'p2')
    target_path = os.path.join(TARGET_DIR, target_file)

    print(f"\n🔄 Processing file: {source_file}")

    # Load the data
    data = load_json(source_path)
    if not data:
        print(f"❌ Skipping file due to loading issues: {source_file}")
        continue

    processed_data = []

    # Process each record
    for record in tqdm(data, desc=f"Processing {source_file}"):
        # Extract required fields
        formula_pretty = record.get('formula_pretty', "")
        composition_reduced = record.get('composition_reduced', {})
        density = record.get('density', None)
        ordering = record.get('ordering', "")

        # Encode composition
        composition_encoded = encode_composition(composition_reduced)

        # Encode ordering
        ordering_encoded = encode_ordering(ordering)

        # Reconstruct the record with the specified key order
        new_record = {
            "formula_pretty": formula_pretty,
            "composition_reduced": composition_reduced,
            "composition_encoded": composition_encoded,
            "density": density,
            "ordering": ordering,
            "ordering_encoded": ordering_encoded
        }

        processed_data.append(new_record)

    # Save the processed data to the target file
    save_json(processed_data, target_path)

print("\n🎉 One-Hot Encoding and Data Transformation Complete!")

# Optional: Verify the new files
def inspect_new_files():
    print("\n🔍 Inspecting the newly created Phase 2 Data files:")
    new_files = [
        'dev_f2_p2.json',
        'test_f2_p2.json',
        'training_f2_p2.json',
        'training_dev_f2_p2.json'
    ]
    for file in new_files:
        file_path = os.path.join(TARGET_DIR, file)
        if os.path.exists(file_path):
            with open(file_path, 'r') as f:
                data = json.load(f)
            print(f"📄 {file}: {len(data)} records")
            if len(data) > 0:
                sample = data[0]
                print(f"📝 Sample record from {file}:")
                print(json.dumps(sample, indent=4))
        else:
            print(f"❌ File not found: {file_path}")

inspect_new_files()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Target directory already exists: /content/drive/MyDrive/Colab Notebooks/Datasets/Phase 2 Data

🔄 Processing file: dev_f2_p1.json
✅ Loaded 7500 records from /content/drive/MyDrive/Colab Notebooks/Datasets/Phase 1 Data/dev_f2_p1.json


Processing dev_f2_p1.json: 100%|██████████| 7500/7500 [00:00<00:00, 31228.38it/s]


✅ Saved 7500 records to /content/drive/MyDrive/Colab Notebooks/Datasets/Phase 2 Data/dev_f2_p2.json

🔄 Processing file: test_f2_p1.json
✅ Loaded 7500 records from /content/drive/MyDrive/Colab Notebooks/Datasets/Phase 1 Data/test_f2_p1.json


Processing test_f2_p1.json: 100%|██████████| 7500/7500 [00:00<00:00, 29523.80it/s]


✅ Saved 7500 records to /content/drive/MyDrive/Colab Notebooks/Datasets/Phase 2 Data/test_f2_p2.json

🔄 Processing file: training_f2_p1.json
✅ Loaded 129670 records from /content/drive/MyDrive/Colab Notebooks/Datasets/Phase 1 Data/training_f2_p1.json


Processing training_f2_p1.json: 100%|██████████| 129670/129670 [00:05<00:00, 23004.67it/s]


✅ Saved 129670 records to /content/drive/MyDrive/Colab Notebooks/Datasets/Phase 2 Data/training_f2_p2.json

🔄 Processing file: training_dev_f2_p1.json
✅ Loaded 7500 records from /content/drive/MyDrive/Colab Notebooks/Datasets/Phase 1 Data/training_dev_f2_p1.json


Processing training_dev_f2_p1.json: 100%|██████████| 7500/7500 [00:00<00:00, 22861.05it/s]


✅ Saved 7500 records to /content/drive/MyDrive/Colab Notebooks/Datasets/Phase 2 Data/training_dev_f2_p2.json

🎉 One-Hot Encoding and Data Transformation Complete!

🔍 Inspecting the newly created Phase 2 Data files:
📄 dev_f2_p2.json: 7500 records
📝 Sample record from dev_f2_p2.json:
{
    "formula_pretty": "Cs2MnH12(SeO7)2",
    "composition_reduced": {
        "Cs": 2.0,
        "Mn": 1.0,
        "H": 12.0,
        "Se": 2.0,
        "O": 14.0
    },
    "composition_encoded": [
        12.0,
        0,
        0,
        0,
        0,
        0,
        0,
        14.0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        1.0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        2.0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,

# Turning the Data into Numpy Arrays

In [4]:
# Install required libraries
!pip install tqdm

# Import necessary libraries
import os
import json
import numpy as np
from tqdm import tqdm
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Define source and target directories
PHASE_2_DIR = '/content/drive/MyDrive/Colab Notebooks/Datasets/Phase 2 Data'
PHASE_3_DIR = '/content/drive/MyDrive/Colab Notebooks/Datasets/Phase 3 Data'
DICTIONARIES_DIR = os.path.join(PHASE_3_DIR, 'dictionaries')
DATA_TENSORFLOW_DIR = os.path.join(PHASE_3_DIR, 'Data_Tensorflow')

# Create Phase 3 Data directory if it doesn't exist
if not os.path.exists(PHASE_3_DIR):
    os.makedirs(PHASE_3_DIR)
    print(f"✅ Created directory: {PHASE_3_DIR}")
else:
    print(f"✅ Directory already exists: {PHASE_3_DIR}")

# Create 'dictionaries' and 'Data_Tensorflow' subdirectories
for subdir in [DICTIONARIES_DIR, DATA_TENSORFLOW_DIR]:
    if not os.path.exists(subdir):
        os.makedirs(subdir)
        print(f"✅ Created subdirectory: {subdir}")
    else:
        print(f"✅ Subdirectory already exists: {subdir}")

# Define the four source files
source_files = {
    'dev_f2_p2.json': 'dev_f2_p2_dict.json',
    'test_f2_p2.json': 'test_f2_p2_dict.json',
    'training_f2_p2.json': 'training_f2_p2_dict.json',
    'training_dev_f2_p2.json': 'training_dev_f2_p2_dict.json'
}

# Define ordering classes
ordering_classes = ["FM", "NM", "FiM", "AFM"]

# Define a function to load JSON data
def load_json(file_path):
    try:
        with open(file_path, 'r') as f:
            data = json.load(f)
        print(f"✅ Loaded {len(data)} records from {file_path}")
        return data
    except FileNotFoundError:
        print(f"❌ File not found: {file_path}")
        return []
    except json.JSONDecodeError:
        print(f"❌ JSON decode error in file: {file_path}")
        return []
    except Exception as e:
        print(f"❌ Unexpected error loading {file_path}: {e}")
        return []

# Define a function to save JSON data
def save_json(data, file_path):
    try:
        with open(file_path, 'w') as f:
            json.dump(data, f, indent=4)
        print(f"✅ Saved {len(data)} records to {file_path}")
    except Exception as e:
        print(f"❌ Error saving to {file_path}: {e}")

# Define a function to encode ordering
def encode_ordering(ordering):
    if ordering in ordering_classes:
        encoding = [1 if ordering == cls else 0 for cls in ordering_classes]
    else:
        encoding = [0] * len(ordering_classes)
    return encoding

# Function to process a single JSON file
def process_json_file(source_file, dict_filename):
    source_path = os.path.join(PHASE_2_DIR, source_file)
    dict_path = os.path.join(DICTIONARIES_DIR, dict_filename)

    # Load data
    data = load_json(source_path)
    if not data:
        print(f"❌ Skipping processing for {source_file} due to loading issues.")
        return

    # Create dictionary with unique keys
    formula_dict = {}
    formula_count = {}

    for record in tqdm(data, desc=f"Processing {source_file}"):
        formula = record.get('formula_pretty', 'Unknown')
        composition_encoded = record.get('composition_encoded', [])
        density = record.get('density', 0)
        ordering_encoded = record.get('ordering_encoded', [0]*len(ordering_classes))

        # Combine composition_encoded and density
        feature_vector = composition_encoded + [density]

        # Prepare the value
        value = [feature_vector, ordering_encoded]

        # Handle duplicate keys
        if formula in formula_dict:
            formula_count[formula] += 1
            new_key = f"{formula}_{formula_count[formula]}"
        else:
            formula_count[formula] = 1
            new_key = formula

        formula_dict[new_key] = value

    # Save the dictionary
    save_json(formula_dict, dict_path)

# Process all source files to create dictionaries
for source_file, dict_filename in source_files.items():
    process_json_file(source_file, dict_filename)

# Function to convert dictionaries to NumPy arrays
def convert_dict_to_numpy(dict_file, X_filename, Y_filename):
    dict_path = os.path.join(DICTIONARIES_DIR, dict_file)
    X_path = os.path.join(DATA_TENSORFLOW_DIR, X_filename)
    Y_path = os.path.join(DATA_TENSORFLOW_DIR, Y_filename)

    # Load the dictionary
    formula_dict = load_json(dict_path)
    if not formula_dict:
        print(f"❌ Skipping conversion for {dict_file} due to loading issues.")
        return

    # Initialize lists
    X_list = []
    Y_list = []

    # Iterate through the dictionary
    for key, value in formula_dict.items():
        feature_vector, ordering_encoded = value
        X_list.append(feature_vector)
        Y_list.append(ordering_encoded)

    # Convert to NumPy arrays
    X_array = np.array(X_list, dtype=np.float32)
    Y_array = np.array(Y_list, dtype=np.float32)

    # Save the arrays
    try:
        np.save(X_path, X_array)
        print(f"✅ Saved feature array to {X_path}")
    except Exception as e:
        print(f"❌ Error saving feature array to {X_path}: {e}")

    try:
        np.save(Y_path, Y_array)
        print(f"✅ Saved label array to {Y_path}")
    except Exception as e:
        print(f"❌ Error saving label array to {Y_path}: {e}")

# Define mapping from source dictionary files to NumPy filenames
numpy_files_mapping = {
    'dev_f2_p2_dict.json': ('X_dev_f2_p2.npy', 'Y_dev_f2_p2.npy'),
    'test_f2_p2_dict.json': ('X_test_f2_p2.npy', 'Y_test_f2_p2.npy'),
    'training_f2_p2_dict.json': ('X_training_f2_p2.npy', 'Y_training_f2_p2.npy'),
    'training_dev_f2_p2_dict.json': ('X_training_dev_f2_p2.npy', 'Y_training_dev_f2_p2.npy')
}

# Convert all dictionaries to NumPy arrays
for dict_file, (X_filename, Y_filename) in numpy_files_mapping.items():
    convert_dict_to_numpy(dict_file, X_filename, Y_filename)

print("\n🎉 All processing complete! The 'Phase 3 Data' folder now contains the 'dictionaries' and 'Data_Tensorflow' subfolders with the respective files.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Created directory: /content/drive/MyDrive/Colab Notebooks/Datasets/Phase 3 Data
✅ Created subdirectory: /content/drive/MyDrive/Colab Notebooks/Datasets/Phase 3 Data/dictionaries
✅ Created subdirectory: /content/drive/MyDrive/Colab Notebooks/Datasets/Phase 3 Data/Data_Tensorflow
✅ Loaded 7500 records from /content/drive/MyDrive/Colab Notebooks/Datasets/Phase 2 Data/dev_f2_p2.json


Processing dev_f2_p2.json: 100%|██████████| 7500/7500 [00:00<00:00, 83817.24it/s]


✅ Saved 7500 records to /content/drive/MyDrive/Colab Notebooks/Datasets/Phase 3 Data/dictionaries/dev_f2_p2_dict.json
✅ Loaded 7500 records from /content/drive/MyDrive/Colab Notebooks/Datasets/Phase 2 Data/test_f2_p2.json


Processing test_f2_p2.json: 100%|██████████| 7500/7500 [00:00<00:00, 136864.21it/s]


✅ Saved 7500 records to /content/drive/MyDrive/Colab Notebooks/Datasets/Phase 3 Data/dictionaries/test_f2_p2_dict.json
✅ Loaded 129670 records from /content/drive/MyDrive/Colab Notebooks/Datasets/Phase 2 Data/training_f2_p2.json


Processing training_f2_p2.json: 100%|██████████| 129670/129670 [00:01<00:00, 70012.25it/s]


✅ Saved 129670 records to /content/drive/MyDrive/Colab Notebooks/Datasets/Phase 3 Data/dictionaries/training_f2_p2_dict.json
✅ Loaded 7500 records from /content/drive/MyDrive/Colab Notebooks/Datasets/Phase 2 Data/training_dev_f2_p2.json


Processing training_dev_f2_p2.json: 100%|██████████| 7500/7500 [00:00<00:00, 265848.15it/s]


✅ Saved 7500 records to /content/drive/MyDrive/Colab Notebooks/Datasets/Phase 3 Data/dictionaries/training_dev_f2_p2_dict.json
✅ Loaded 7500 records from /content/drive/MyDrive/Colab Notebooks/Datasets/Phase 3 Data/dictionaries/dev_f2_p2_dict.json
✅ Saved feature array to /content/drive/MyDrive/Colab Notebooks/Datasets/Phase 3 Data/Data_Tensorflow/X_dev_f2_p2.npy
✅ Saved label array to /content/drive/MyDrive/Colab Notebooks/Datasets/Phase 3 Data/Data_Tensorflow/Y_dev_f2_p2.npy
✅ Loaded 7500 records from /content/drive/MyDrive/Colab Notebooks/Datasets/Phase 3 Data/dictionaries/test_f2_p2_dict.json
✅ Saved feature array to /content/drive/MyDrive/Colab Notebooks/Datasets/Phase 3 Data/Data_Tensorflow/X_test_f2_p2.npy
✅ Saved label array to /content/drive/MyDrive/Colab Notebooks/Datasets/Phase 3 Data/Data_Tensorflow/Y_test_f2_p2.npy
✅ Loaded 129670 records from /content/drive/MyDrive/Colab Notebooks/Datasets/Phase 3 Data/dictionaries/training_f2_p2_dict.json
✅ Saved feature array to /conten