# Li-I材料データの解析 (2023/2)

<div style="text-align: right;">
作成日:2023年02月19日<br>
作成者:中山将伸<br><br>
</div>

## 解析の流れ

１） Materials Project から Li-Iを含む材料のデータ抽出　（組成と結晶構造、物性：バンドギャップ）<BR>
２） chemhist, rdf/adf による記述子作成<BR>
３） 回帰分析と分類分析<BR>

In [None]:
# !pip install mp-api
# !pip install chemhist 
# !pip install rdfadf


import pandas as pd
import os
import glob
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


## Materials Projectからのデータ抽出

下のコードセル中の <strong>---Insert your API key---</strong> と書かれた場所に自身のAPIキーを入力してください。<br> 
keyは https://materialsproject.org/api#documentation から参照できます(2023/2/5現在)。<br>


In [None]:
from mp_api.client import MPRester

# Materials Project API key. Alternatives:
#   api_key = os.environ['MPIDN']   # read the key from the MPIDN environment variable
#   api_key = "ir30n**********"     # example of a literal key
api_key = "---Insert your API key---"

# Folder that receives the summary CSV and one CIF file per material.
# makedirs(exist_ok=True) is a no-op when the folder already exists.
output_dir = 'output_Li-I'
os.makedirs(output_dir, exist_ok=True)

# Band gaps strictly below this value are labelled "m"; every other known
# band gap is labelled "i".
metal_gap_threshold = 0.01

with MPRester(api_key) as mpr:

    # Request the structure together with the scalar fields so that the CIF
    # files can be written from the same query result.
    docs = mpr.summary.search(
        elements=["Li", "I"],
        fields=["material_id", "formula_pretty", "band_gap", "structure"]
    )

    # Write the CSV (csv_path is reused by the following cell).
    csv_path = 'output_Li-I/example_Li-I.csv'
    with open(csv_path, 'w', encoding='utf-8') as f:
        # Header, including the derived m_or_i label column
        f.write('material_id,formula_pretty,band_gap,m_or_i\n')

        for doc in docs:
            mpid = doc.material_id
            formula = doc.formula_pretty
            bg = doc.band_gap

            # ======== band_gap classification ========
            # A missing band gap is written as an empty field (read back as
            # NaN by pandas) instead of the literal text "None".
            if bg is None:
                m_or_i = "unknown"
                bg_field = ""
            elif bg < metal_gap_threshold:
                m_or_i = "m"
                bg_field = bg
            else:
                m_or_i = "i"
                bg_field = bg

            # CSV row
            f.write(f"{mpid},{formula},{bg_field},{m_or_i}\n")

            # Save the structure as <material_id>.cif
            cif_filename = f"output_Li-I/{mpid}.cif"
            doc.structure.to(filename=cif_filename)

print("CSV と CIFの保存が完了しました。")


In [None]:
# Load the CSV written by the extraction cell into a pandas DataFrame and
# keep an untouched copy (df_mp) for the final descriptor table.
print(csv_path)
df = pd.read_csv(csv_path)
print("number of samples", len(df))
df_mp = df.copy()
# Last expression of the cell: rendered as the cell output in a notebook.
df_mp

## chemhist, rdfadf を使った組成/構造記述子の取り出し



In [None]:

from pymatgen.core.composition import Composition
import chemhist
from rdfadf.rdf import compute_general_rdf
from rdfadf.adf import compute_general_adf


In [None]:

# Composition features via chemhist (takes roughly 50 s on the example laptop)

descriptor_list = []   # one descriptor vector per chemical formula
label_list = []        # labels of the chemvec entries (obtaining them once is enough)

def normalize_formula(formula):
    """Return *formula* as a space-free, alphabetically ordered formula string.

    The input is stripped of spaces, parsed with ``Composition`` (which is
    relied on to expand parentheses), and re-emitted through its
    ``alphabetical_formula`` attribute with the separating spaces removed.
    The notebook's own example of the intended result is
    ``Li2(CO3) --> C1Li2O3``.
    """
    compact = formula.replace(" ", "")
    alphabetical = Composition(compact).alphabetical_formula
    return alphabetical.replace(" ", "")

descriptor_list = []    # one chemhist vector per row of df
labels_global = None    # column labels, captured from the first formula

for formula in df['formula_pretty']:

    # Normalise the formula first (e.g. Li2(CO3) --> C1Li2O3)
    norm = normalize_formula(formula)

    # Descriptor vector and its labels for this formula
    chemvec, labels = chemhist.get_descriptor(norm)

    # The labels are presumably identical for every formula, so the first
    # set is kept as the column names -- verify against chemhist if in doubt.
    if labels_global is None:
        labels_global = labels

    descriptor_list.append(chemvec)

# Build the DataFrame from the captured labels rather than the loop variable,
# which is undefined when df has no rows.
df_chemhist = pd.DataFrame(descriptor_list, columns=labels_global)




In [None]:
# Preview the first rows of the chemhist descriptor table
df_chemhist.head()

In [None]:
# rdfadf による RDF 特徴量計算

import os
import pandas as pd
from ase.io import read
from rdfadf.rdf import compute_general_rdf
import numpy as np

# ===================================
# 1. Load the input CSV
# ===================================
# This must be the file written by the extraction cell
# ('output_Li-I/example_Li-I.csv'); any other name raises FileNotFoundError.
csv_path = "output_Li-I/example_Li-I.csv"
df = pd.read_csv(csv_path)

# ===================================
# 2. RDF settings
# ===================================
rdf_cutoff = 10.0
rdf_bins   = 50
rdf_sigma  = 0.2

# Pairs passed to compute_general_rdf. The first three are element pairs;
# "s"/"p"/"d" are presumably orbital-block labels understood by rdfadf
# -- TODO confirm against the rdfadf documentation.
combinations = [
    ("Li", "I"),
    ("Li", "Li"),
    ("I", "I"),
    ("s", "p"),
    ("s", "d"),
    ("p", "d")
]

# ===================================
# 3. Container for all RDF rows
# ===================================
all_rdf_rows = []

# ===================================
# 4. Build one RDF feature row per material
# ===================================
for mid in df["material_id"]:

    # CIF file written by the extraction cell for this material
    cif_path = f"output_Li-I/{mid}.cif"
    atoms = read(cif_path)

    one_row_vec = []   # all RDF values belonging to this material_id

    for e1, e2 in combinations:

        r, gr, grs, gr2, gr2s, cr, crs = compute_general_rdf(
            atoms,
            e1,
            e2,
            rcut=rdf_cutoff,
            bins=rdf_bins,
            sigma=rdf_sigma
        )

        # Only gr2s is used as the feature. It is expected to hold rdf_bins
        # values per pair (the column names below assume this), i.e.
        # 50 x 6 = 300 values per material with the settings above.
        one_row_vec.extend(gr2s)

    all_rdf_rows.append(one_row_vec)

# ===================================
# 5. Convert to a DataFrame
# ===================================
# Column names: rdf_<e1>-<e2>_<bin index>, in the same order as the values
colnames = [
    f"rdf_{e1}-{e2}_{i}"
    for e1, e2 in combinations
    for i in range(rdf_bins)
]

df_rdfall = pd.DataFrame(all_rdf_rows, columns=colnames)

print(df_rdfall.head())
print(df_rdfall.shape)


In [None]:
# Join all descriptor tables column-wise (one row per material)
descriptor_frames = [df_mp, df_chemhist, df_rdfall]

# pd.concat(axis=1) aligns on the index and silently pads with NaN when the
# tables have different lengths (e.g. after re-running only some cells), so
# fail loudly instead of producing misaligned rows.
row_counts = [len(frame) for frame in descriptor_frames]
if len(set(row_counts)) != 1:
    raise ValueError(
        f"Descriptor tables have different row counts {row_counts}; "
        "re-run the descriptor cells so that they match."
    )

df_all = pd.concat(descriptor_frames, axis=1)
df_all.to_csv('LiI_bandgap_descriptors.csv', index=False)
df_all.head()


## 回帰分析 (Target: band_gap) by Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import (
    train_test_split,
    KFold,
    cross_val_predict,
    RandomizedSearchCV
)
from sklearn.metrics import r2_score, mean_squared_error

# ===================================
# 0. Convert band_gap to float and drop rows without a value
# ===================================
df_all["band_gap"] = pd.to_numeric(df_all["band_gap"], errors="coerce")
df_all = df_all.dropna(subset=["band_gap"]).reset_index(drop=True)

# ===================================
# 1. Features and target
# ===================================
drop_cols = ["material_id", "formula_pretty", "band_gap", "m_or_i"]

X = df_all.drop(columns=drop_cols)
y = df_all["band_gap"]
sample_id = df_all["material_id"]

# Use numeric columns only
X = X.select_dtypes(include=[np.number])

# ===================================
# 2. Train / test split
# ===================================
X_train, X_test, y_train, y_test, id_train, id_test = train_test_split(
    X, y, sample_id, test_size=0.30, random_state=0
)

# Fill missing feature values with the column means of the TRAINING set only,
# so that no statistic of the test set leaks into the model inputs.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

# ===================================
# 3. Hyperparameter search space
# ===================================
param_dist = {
    "n_estimators": [200, 400, 600, 800, 1000],
    "max_depth": [None, 5, 10, 20, 30, 50],
    "min_samples_split": [2, 5, 10, 20],
    "min_samples_leaf": [1, 2, 4, 8],
    # 1.0 (= use all features) is what "auto" meant for regressors; the
    # "auto" string itself is rejected by scikit-learn >= 1.3.
    "max_features": [1.0, "sqrt", "log2"],
    "bootstrap": [True, False]
}

rf_base = RandomForestRegressor(random_state=0, n_jobs=-1)

# ===================================
# 4. RandomizedSearchCV (5-fold CV)
# ===================================
search = RandomizedSearchCV(
    estimator=rf_base,
    param_distributions=param_dist,
    n_iter=50,               # number of sampled parameter settings
    cv=5,
    random_state=0,
    scoring="neg_mean_squared_error",
    n_jobs=-1,
    verbose=1
)

search.fit(X_train, y_train)

print("===== Best Hyperparameters =====")
print(search.best_params_)
print("--------------------------------")

best_rf = search.best_estimator_

# ===================================
# 5. Cross-validated predictions on the training set
# ===================================
kf = KFold(n_splits=5, shuffle=True, random_state=0)
y_pred_cv = cross_val_predict(best_rf, X_train, y_train, cv=kf, n_jobs=-1)

print("===== 5-fold CV performance =====")
print("R2 :", r2_score(y_train, y_pred_cv))
print("RMSE:", np.sqrt(mean_squared_error(y_train, y_pred_cv)))

# ===================================
# 6. Fit on the whole training set, then predict the test set
# ===================================
best_rf.fit(X_train, y_train)
y_pred_test = best_rf.predict(X_test)

print("\n===== TEST performance =====")
print("R2 :", r2_score(y_test, y_pred_test))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_test)))

# ===================================
# 7. Result DataFrame (test-set predictions per material)
# ===================================
result_df = pd.DataFrame({
    "material_id": id_test.values,
    "y_true": y_test.values,
    "y_pred": y_pred_test
})

# ===================================
# 8. Diagnostic plot (train = black, test = red)
# ===================================
plt.figure(figsize=(7, 7))

plt.scatter(y_train, y_pred_cv, color="black", s=40, alpha=0.6, label="Train+CV")
plt.scatter(y_test, y_pred_test, color="red", s=60, alpha=0.8, label="Test")

# Diagonal y = x spanning the full range of true and predicted values
min_val = min(y.min(), y_pred_cv.min(), y_pred_test.min())
max_val = max(y.max(), y_pred_cv.max(), y_pred_test.max())
plt.plot([min_val, max_val], [min_val, max_val], "k--")

plt.xlabel("True band_gap", fontsize=14)
plt.ylabel("Predicted band_gap", fontsize=14)
plt.title("Random Forest Regression (Tuned) Diagnostic Plot", fontsize=16)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


## 分類分析 (Target: m_or_i) by Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_predict
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    roc_curve,
    auc,
    classification_report,
    confusion_matrix
)

# Numeric encoding of the two classes; "i" (= 1) is the positive class.
label_map = {"m": 0, "i": 1}

# ===================================
# 0. Keep only rows with a usable target label
# ===================================
# Missing values, empty strings and any other label (e.g. "unknown", which
# the extraction step writes when no band gap is available) cannot be mapped
# by label_map and are dropped here.
df_all = df_all[df_all["m_or_i"].isin(list(label_map))].reset_index(drop=True)

# ===================================
# 1. Features and target
# ===================================
drop_cols = ["material_id", "formula_pretty", "band_gap", "m_or_i"]

X = df_all.drop(columns=drop_cols)
y = df_all["m_or_i"]
sample_id = df_all["material_id"]

# Numeric columns only
X = X.select_dtypes(include=[np.number])

# ===================================
# 2. Train-test split (30% test, stratified by class)
# ===================================
X_train, X_test, y_train, y_test, id_train, id_test = train_test_split(
    X, y, sample_id,
    test_size=0.30,
    random_state=0,
    stratify=y
)

# Fill missing feature values with the column means of the TRAINING set only,
# so that no statistic of the test set leaks into the model inputs.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

y_train_num = y_train.map(label_map)
y_test_num = y_test.map(label_map)

# ===================================
# 3. Random forest classifier
# ===================================
clf = RandomForestClassifier(
    n_estimators=500,
    random_state=42,
    n_jobs=-1
)

# ===================================
# 4. 5-fold CV predictions (training set only)
# ===================================
kf = KFold(n_splits=5, shuffle=True, random_state=0)

y_train_proba_cv = cross_val_predict(
    clf, X_train, y_train, cv=kf,
    n_jobs=-1, method="predict_proba"
)

# Fit once on the whole training set. The fitted model provides the column
# index of class "i" in the probability output and is reused for the test
# predictions in step 6.
clf.fit(X_train, y_train)
pos_index = list(clf.classes_).index("i")

# CV probability of the positive class, thresholded at 0.5
y_train_proba_cv = y_train_proba_cv[:, pos_index]
y_pred_cv = (y_train_proba_cv >= 0.5).astype(int)

# ===================================
# 5. CV evaluation
# ===================================
acc_cv = accuracy_score(y_train_num, y_pred_cv)
f1_cv = f1_score(y_train_num, y_pred_cv)
fpr_cv, tpr_cv, _ = roc_curve(y_train_num, y_train_proba_cv)
auc_cv = auc(fpr_cv, tpr_cv)

print("===== 5-fold Cross Validation =====")
print("Accuracy:", acc_cv)
print("F1 score:", f1_cv)
print("AUC:", auc_cv)

# ---- CV confusion matrix ----
cm_cv = confusion_matrix(y_train_num, y_pred_cv)
print("\nConfusion Matrix (CV):\n", cm_cv)

# ===================================
# 6. Test-set prediction with the model fitted in step 4
# ===================================
y_test_proba = clf.predict_proba(X_test)[:, pos_index]
y_pred_test = clf.predict(X_test)
y_pred_test_num = pd.Series(y_pred_test).map(label_map)

acc_test = accuracy_score(y_test_num, y_pred_test_num)
f1_test = f1_score(y_test_num, y_pred_test_num)

fpr_test, tpr_test, _ = roc_curve(y_test_num, y_test_proba)
auc_test = auc(fpr_test, tpr_test)

print("\n===== Test Performance =====")
print("Accuracy:", acc_test)
print("F1 score:", f1_test)
print("AUC:", auc_test)
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_test))

# ---- Test confusion matrix ----
cm_test = confusion_matrix(y_test_num, y_pred_test_num)
print("\nConfusion Matrix (Test):\n", cm_test)

# ===================================
# 7. ROC curves (CV and test)
# ===================================
plt.figure(figsize=(7,7))

plt.plot(fpr_cv, tpr_cv, label=f"Train+CV ROC (AUC={auc_cv:.3f})", color="black")
plt.plot(fpr_test, tpr_test, label=f"Test ROC (AUC={auc_test:.3f})", color="red")

# Chance-level diagonal
plt.plot([0,1], [0,1], "k--", lw=1)

plt.xlabel("False Positive Rate", fontsize=14)
plt.ylabel("True Positive Rate", fontsize=14)
plt.title("ROC Curve for Random Forest Classification", fontsize=16)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# ===================================
# 8. Confusion-matrix heatmaps (optional)
# ===================================
plt.figure(figsize=(10,4))

plt.subplot(1,2,1)
sns.heatmap(cm_cv, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix (CV)")
plt.xlabel("Predicted")
plt.ylabel("True")

plt.subplot(1,2,2)
sns.heatmap(cm_test, annot=True, fmt="d", cmap="Reds")
plt.title("Confusion Matrix (Test)")
plt.xlabel("Predicted")
plt.ylabel("True")

plt.tight_layout()
plt.show()


In [None]:
# For Colab users: zip the output folder and download it through the browser.

import shutil
import os

# Compress the output folder into output_files.zip
zip_filename = 'output_files'
output_directory = 'output_Li-I'
shutil.make_archive(zip_filename, 'zip', output_directory)

# google.colab can only be imported inside a Colab runtime. Guard the import
# so that running the whole notebook elsewhere still produces the archive
# instead of stopping with ImportError.
try:
    from google.colab import files
except ImportError:
    print(f"google.colab is not available; the archive was saved as {zip_filename}.zip")
else:
    # Download the generated zip file
    files.download(zip_filename + '.zip')