## Build a custom dataset by stacking CIF files

In [18]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from pymatgen.core import Structure
from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
import csv

# === Input / output locations ===
# output_dir receives the generated CSV splits; cif_dir holds the raw .cif files.
output_dir = "./test/raw1_processed"
cif_dir = "./test/raw1"

# Make sure the destination folder exists (no error if it is already there).
os.makedirs(name=output_dir, exist_ok=True)


# One dict per successfully parsed CIF file; becomes one row of the dataset.
records = []

# os.listdir() yields entries in arbitrary, platform-dependent order. Sorting
# makes the row order deterministic, which the seeded train/val/test split
# later in this script needs in order to be reproducible across machines.
for filename in sorted(os.listdir(cif_dir)):
    if not filename.endswith(".cif"):
        continue

    path = os.path.join(cif_dir, filename)
    try:
        with open(path, "r", encoding="utf-8") as f:
            # Double quotes are swapped for single quotes before the text is
            # parsed and stored in the "cif" column.
            cif_text = f.read().replace('"', "'")

        structure = Structure.from_str(cif_text, fmt="cif")
        sga = SpacegroupAnalyzer(structure)

        record = {
            # Strip only the trailing extension; str.replace() would also
            # remove any ".cif" that happens to occur inside the file name.
            "material_id": os.path.splitext(filename)[0],
            "pretty_formula": structure.composition.reduced_formula,
            # Stored as the string form of a Python list of element symbols.
            "elements": str([el.symbol for el in structure.composition.elements]),
            "cif": cif_text,
            "spacegroup_number": sga.get_space_group_number(),
        }
        records.append(record)

    except Exception as e:
        # Best-effort ingestion: a file that cannot be read or parsed is
        # reported and skipped instead of aborting the whole run.
        print(f"Skipping {filename}: {e}")

# === Assemble the records into a table ===
# The index is given an empty-string name so that the index column written to
# CSV has a blank header, like MP.
df = pd.DataFrame(data=records).rename_axis("", axis="index")

# === Split ===
# 70 % of the rows go to train; the remaining 30 % is halved into validation
# and test (15 % each). Both calls share the same seed.
#
# With these fractions at least 4 rows are needed for every split to be
# non-empty. Fail early with an actionable message instead of letting
# train_test_split raise a less specific ValueError (typical cause: every CIF
# file was skipped during parsing, or the input folder is empty).
min_records = 4
if len(df) < min_records:
    raise ValueError(
        f"Need at least {min_records} parsed structures to build "
        f"train/val/test splits, but only {len(df)} were loaded from {cif_dir!r}."
    )

train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# === Write one CSV per split: <output_dir>/train.csv, val.csv, test.csv ===
# Every field is quoted; the index is written as the first column.
csv_options = {
    "index": True,
    "quoting": csv.QUOTE_ALL,
    "quotechar": '"',
    "escapechar": '\\',
}
for split_name, frame in (("train", train_df), ("val", val_df), ("test", test_df)):
    destination = os.path.join(output_dir, f"{split_name}.csv")
    frame.to_csv(destination, **csv_options)