In [1]:
# ❶ 依存ライブラリを明示的に固定＆インストール（互換性を保つため）
!pip install numpy==1.26.4
!pip install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1
!pip install matminer seaborn
!pip install git+https://github.com/usnistgov/alignn.git

Collecting torchvision==0.17.1
  Downloading torchvision-0.17.1-cp311-cp311-manylinux1_x86_64.whl.metadata (6.6 kB)
Collecting torchaudio==2.2.1
  Downloading torchaudio-2.2.1-cp311-cp311-manylinux1_x86_64.whl.metadata (6.4 kB)
Downloading torchvision-0.17.1-cp311-cp311-manylinux1_x86_64.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m66.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading torchaudio-2.2.1-cp311-cp311-manylinux1_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m69.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchvision, torchaudio
  Attempting uninstall: torchvision
    Found existing installation: torchvision 0.21.0+cu124
    Uninstalling torchvision-0.21.0+cu124:
      Successfully uninstalled torchvision-0.21.0+cu124
  Attempting uninstall: torchaudio
    Found existing installation: torchaudio 2.6.0+cu124
    Uninstalling torchaudio-2.6.0+cu124:


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Set universal theme for figures
plt.style.use('seaborn-v0_8')
# Configure plot settings
plt.rcParams['figure.figsize'] = (6, 4.5)
plt.rcParams['figure.dpi'] = 300
plt.rcParams['axes.grid'] = True
# Set numerical precision
np.set_printoptions(precision=5)

In [5]:
import gzip
import json
import os
from tqdm import tqdm
from pymatgen.core import Structure

# ✅ JSON Lines形式で読み込み
with gzip.open("castelli_perovskites.json.gz", "rt") as f:
    data = [json.loads(line) for line in f if line.strip()]

# gapありデータ抽出
valid_entries = [d for d in data if d.get("gap gllbsc") is not None]

# ALIGNN形式データ構築
os.makedirs("data", exist_ok=True)
alignn_data = []
for i, entry in enumerate(tqdm(valid_entries[:500])):  # ←件数は調整可能
    try:
        struct = Structure.from_dict(entry["structure"])
        gap = entry["gap gllbsc"]
        poscar_path = f"data/{i}.poscar"
        struct.to(fmt="poscar", filename=poscar_path)
        alignn_data.append({"id": str(i), "target": gap, "poscar": poscar_path})
    except Exception as e:
        print(f"Error at {i}: {e}")

# 保存
with open("atomistic.json", "w") as f:
    json.dump(alignn_data, f, indent=2)


0it [00:00, ?it/s]


In [8]:
from pymatgen.core import Structure
import os
from tqdm import tqdm
import json

# バンドギャップが存在するデータのみ対象
df = df[df["gap gllbsc"].notnull()].reset_index(drop=True)

# 出力ディレクトリ作成
os.makedirs("data", exist_ok=True)

# ALIGNN用の入力リスト
alignn_data = []

# 500件程度に限定（任意で変更可）
for i, row in tqdm(df[:500].iterrows(), total=500):
    try:
        struct = Structure.from_dict(row["structure"])
        gap = row["gap gllbsc"]
        poscar_path = f"data/{i}.poscar"
        struct.to(fmt="poscar", filename=poscar_path)
        alignn_data.append({
            "id": str(i),
            "target": gap,
            "poscar": poscar_path
        })
    except Exception as e:
        print(f"Error in row {i}: {e}")

# ラベルファイル保存
with open("atomistic.json", "w") as f:
    json.dump(alignn_data, f, indent=2)


100%|██████████| 500/500 [00:01<00:00, 278.37it/s]


In [9]:
from alignn.train_config import TrainingConfig
import json

cfg = TrainingConfig(
    dataset="atomistic.json",
    output_dir="alignn_out",
    batch_size=32,
    epochs=20,
    learning_rate=0.001,
    task="regression",
    prediction_type="atomistic"  # ← POSCAR単位の回帰
)

# 設定ファイル保存
with open("config.json", "w") as f:
    json.dump(cfg.as_dict(), f, indent=2)


ModuleNotFoundError: No module named 'alignn.train_config'

In [10]:
!pip install git+https://github.com/usnistgov/alignn.git


Collecting git+https://github.com/usnistgov/alignn.git
  Cloning https://github.com/usnistgov/alignn.git to /tmp/pip-req-build-2i9_9zk7
  Running command git clone --filter=blob:none --quiet https://github.com/usnistgov/alignn.git /tmp/pip-req-build-2i9_9zk7
  Resolved https://github.com/usnistgov/alignn.git to commit 408bb6e996006dc4441ea48d4189047742dad637
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [11]:
# Step 1: 学習設定ファイル
from alignn.train_config import TrainingConfig
import json

cfg = TrainingConfig(
    dataset="atomistic.json",
    output_dir="alignn_out",
    batch_size=32,
    epochs=20,
    learning_rate=0.001,
    task="regression",
    prediction_type="atomistic"
)

with open("config.json", "w") as f:
    json.dump(cfg.as_dict(), f, indent=2)


ModuleNotFoundError: No module named 'alignn.train_config'

In [1]:
# 最新のALIGNNをGitHubからインストール
!pip install git+https://github.com/usnistgov/alignn.git

# その他必要ライブラリ
!pip install matminer pymatgen seaborn tqdm


Collecting git+https://github.com/usnistgov/alignn.git
  Cloning https://github.com/usnistgov/alignn.git to /tmp/pip-req-build-ycampbfy
  Running command git clone --filter=blob:none --quiet https://github.com/usnistgov/alignn.git /tmp/pip-req-build-ycampbfy
  Resolved https://github.com/usnistgov/alignn.git to commit 408bb6e996006dc4441ea48d4189047742dad637
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [2]:
import pandas as pd
import gzip
import json

# Castelli Perovskites データを読み込み（matminerの形式）
from matminer.datasets import load_dataset
df = load_dataset("castelli_perovskites")
df = df[df["gap gllbsc"].notnull()].reset_index(drop=True)

# 内容確認（任意）
df.head()


Fetching castelli_perovskites.json.gz from https://ndownloader.figshare.com/files/13284197 to /usr/local/lib/python3.11/dist-packages/matminer/datasets/castelli_perovskites.json.gz


Fetching https://ndownloader.figshare.com/files/13284197 in MB: 4.9479679999999995MB [00:00, 372.53MB/s]               


Unnamed: 0,fermi level,fermi width,e_form,gap is direct,structure,mu_b,formula,vbm,cbm,gap gllbsc
0,0.312138,0.001837,2.16,True,"[[0. 0. 0.] Rh, [1.97726555 1.97726555 1.97726...",0.01974478,RhTeN3,6.187694,6.187694,0.0
1,0.297083,0.001837,1.52,True,"[[2.54041798 0. 0. ] Hf, [1.020...",-2.253054e-05,HfTeO3,6.033125,6.033125,0.0
2,0.191139,0.003675,1.48,True,"[[0.60790913 0. 0. ] Re, [2.186...",4.982109,ReAsO2F,6.602253,6.602253,0.0
3,0.316346,0.001837,1.24,True,"[[2.83091357 0. 0. ] W, [2.6573...",-0.8684496,WReO2S,5.738462,5.738462,0.0
4,0.312658,0.003675,0.62,True,"[[0.00518937 0. 0. ] Bi, [2.172...",2.164069e-15,BiHfO2F,6.074736,6.074736,0.0


In [3]:
from pymatgen.core import Structure
import os
from tqdm import tqdm

# 出力ディレクトリ作成
os.makedirs("data", exist_ok=True)
alignn_data = []

# 500件程度で軽く回す（必要に応じて増減可）
for i, row in tqdm(df[:500].iterrows(), total=500):
    try:
        struct = Structure.from_dict(row["structure"])
        gap = row["gap gllbsc"]
        fname = f"data/{i}.poscar"
        struct.to(fmt="poscar", filename=fname)
        alignn_data.append({"id": str(i), "target": gap, "poscar": fname})
    except Exception as e:
        print(f"Error at {i}: {e}")

# JSON形式で保存
with open("atomistic.json", "w") as f:
    json.dump(alignn_data, f, indent=2)


100%|██████████| 500/500 [00:00<00:00, 17013.91it/s]

Error at 0: list indices must be integers or slices, not str
Error at 1: list indices must be integers or slices, not str
Error at 2: list indices must be integers or slices, not str
Error at 3: list indices must be integers or slices, not str
Error at 4: list indices must be integers or slices, not str
Error at 5: list indices must be integers or slices, not str
Error at 6: list indices must be integers or slices, not str
Error at 7: list indices must be integers or slices, not str
Error at 8: list indices must be integers or slices, not str
Error at 9: list indices must be integers or slices, not str
Error at 10: list indices must be integers or slices, not str
Error at 11: list indices must be integers or slices, not str
Error at 12: list indices must be integers or slices, not str
Error at 13: list indices must be integers or slices, not str
Error at 14: list indices must be integers or slices, not str
Error at 15: list indices must be integers or slices, not str
Error at 16: list 




In [4]:
from alignn.train_config import TrainingConfig

cfg = TrainingConfig(
    dataset="atomistic.json",
    output_dir="alignn_out",
    batch_size=32,
    epochs=20,
    learning_rate=0.001,
    task="regression",
    prediction_type="atomistic"
)

with open("config.json", "w") as f:
    json.dump(cfg.as_dict(), f, indent=2)


ModuleNotFoundError: No module named 'alignn.train_config'

In [5]:
from pymatgen.core import Structure
import os
from tqdm import tqdm
import json

os.makedirs("data", exist_ok=True)
alignn_data = []

# エラーなく最大500件処理する
for i, row in tqdm(df.iloc[:500].iterrows(), total=500):
    try:
        struct = Structure.from_dict(row["structure"])
        gap = row["gap gllbsc"]
        fname = f"data/{i}.poscar"
        struct.to(fmt="poscar", filename=fname)
        alignn_data.append({
            "id": str(i),
            "target": gap,
            "poscar": fname
        })
    except Exception as e:
        print(f"Error at {i}: {e}")

# JSONに保存
with open("atomistic.json", "w") as f:
    json.dump(alignn_data, f, indent=2)


100%|██████████| 500/500 [00:00<00:00, 16613.61it/s]

Error at 0: list indices must be integers or slices, not str
Error at 1: list indices must be integers or slices, not str
Error at 2: list indices must be integers or slices, not str
Error at 3: list indices must be integers or slices, not str
Error at 4: list indices must be integers or slices, not str
Error at 5: list indices must be integers or slices, not str
Error at 6: list indices must be integers or slices, not str
Error at 7: list indices must be integers or slices, not str
Error at 8: list indices must be integers or slices, not str
Error at 9: list indices must be integers or slices, not str
Error at 10: list indices must be integers or slices, not str
Error at 11: list indices must be integers or slices, not str
Error at 12: list indices must be integers or slices, not str
Error at 13: list indices must be integers or slices, not str
Error at 14: list indices must be integers or slices, not str
Error at 15: list indices must be integers or slices, not str
Error at 16: list 




In [6]:
print(type(df.loc[0, "structure"]))
print(df.loc[0, "structure"])


<class 'pymatgen.core.structure.Structure'>
Full Formula (Te1 Rh1 N3)
Reduced Formula: TeRhN3
abc   :   3.954531   3.954531   3.954531
angles:  90.000000  90.000000  90.000000
pbc   :       True       True       True
Sites (5)
  #  SP      a    b    c
---  ----  ---  ---  ---
  0  Rh    0    0    0
  1  Te    0.5  0.5  0.5
  2  N     0.5  0    0.5
  3  N     0.5  0.5  0
  4  N     0    0.5  0.5


In [7]:
from pymatgen.core import Structure
import os
import json
from tqdm import tqdm

os.makedirs("data", exist_ok=True)
alignn_data = []

for i, row in tqdm(df.iloc[:500].iterrows(), total=500):
    try:
        struct = row["structure"]  # ← すでにStructureオブジェクト
        gap = row["gap gllbsc"]
        fname = f"data/{i}.poscar"
        struct.to(fmt="poscar", filename=fname)
        alignn_data.append({
            "id": str(i),
            "target": gap,
            "poscar": fname
        })
    except Exception as e:
        print(f"Error at {i}: {e}")

with open("atomistic.json", "w") as f:
    json.dump(alignn_data, f, indent=2)


100%|██████████| 500/500 [00:02<00:00, 235.21it/s]


In [8]:
from alignn.train_config import TrainingConfig
import json

cfg = TrainingConfig(
    dataset="atomistic.json",
    target="target",
    id_field="id",
    n_epochs=20,
    batch_size=32,
    learning_rate=1e-3,
    classification=False,
    store_results=True,
)

with open("config.json", "w") as f:
    json.dump(cfg.as_dict(), f, indent=2)


ModuleNotFoundError: No module named 'alignn.train_config'