In [None]:

# ============================================================
# 0. ライブラリのインポート
# ============================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error

%matplotlib inline


In [None]:

# ============================================================
# 1. Load the data
# ============================================================
# The CSV is read straight from its raw GitHub URL, so this cell needs
# network access.
data_path = "https://raw.githubusercontent.com/ChemicalBatteryLab-Nitech/dxgem_day1day2/main/data/olivineDataset_withEA.csv"
df = pd.read_csv(data_path)

# Quick look at what was loaded: first rows, dimensions, column names.
display(df.head())
print("Shape:", df.shape)
print("Columns:", list(df.columns))


In [None]:

# ============================================================
# 2. Column roles, and dropping rows whose target is NaN
# ============================================================
index_col = "Index#"
label_col = "Composition"
target_col = "EA (eV)"

# Descriptors are picked by position: everything from the third column up to
# (but excluding) the last two columns.
# NOTE(review): presumably the first two columns are index_col / label_col and
# the last two are non-descriptor columns including the target -- confirm
# against the CSV layout.
descriptor_cols = df.columns[2:-2]

# Only rows with a known target value can be used for fitting.
mask = df[target_col].notna()
df_train = df.loc[mask].copy()

print("Descriptor columns:", list(descriptor_cols))
print("Target column:", target_col)
print("Number of samples with EA:", len(df_train))


In [None]:

# ============================================================
# 3. Training matrices X (descriptors) and y (target)
# ============================================================
# Pull plain NumPy arrays out of the filtered frame; rows of X and entries of
# y stay aligned because both come from df_train.
y = df_train.loc[:, target_col].values
X = df_train.loc[:, descriptor_cols].values

print("X shape:", X.shape)
print("y shape:", y.shape)


In [None]:

# ============================================================
# 4. Choose the number of PLS components via K-fold CV + PRESS
# ============================================================
# PRESS (predicted residual error sum of squares) is accumulated over all
# validation folds for each candidate component count; the count with the
# smallest PRESS is kept as best_n_components.
N_SPLITS = 5
MAX_COMPONENTS = 10

n_samples = X.shape[0]
n_features = X.shape[1]

# KFold makes the first (n_samples % N_SPLITS) validation folds one sample
# larger than the others, so the largest validation fold holds
# ceil(n_samples / N_SPLITS) samples. The smallest training fold is whatever
# remains; using floor division here would over-estimate it by one whenever
# n_samples is not a multiple of N_SPLITS.
max_val_size = -(-n_samples // N_SPLITS)  # integer ceil division
min_train_size = n_samples - max_val_size

# Cap the candidate range by the user limit, the number of descriptors, and
# one less than the smallest training fold (never below 1).
max_components_allowed = min(MAX_COMPONENTS, n_features, max(1, min_train_size - 1))

print("n_samples:", n_samples)
print("n_features:", n_features)
print("max_components_allowed:", max_components_allowed)

# A fixed random_state makes every candidate component count see the same
# shuffled folds, so their PRESS values are directly comparable.
kfold = KFold(n_splits=N_SPLITS, shuffle=True, random_state=0)
press_values = {}

for n_comp in range(1, max_components_allowed + 1):
    sq_errors = []
    for tr_idx, val_idx in kfold.split(X):
        X_tr, X_val = X[tr_idx], X[val_idx]
        y_tr, y_val = y[tr_idx], y[val_idx]

        pls = PLSRegression(n_components=n_comp, scale=True)
        pls.fit(X_tr, y_tr)

        # ravel() flattens the prediction so it subtracts element-wise from
        # the 1-D y_val regardless of the shape predict() returns.
        y_pred_val = pls.predict(X_val).ravel()
        sq_errors.append(((y_val - y_pred_val) ** 2).sum())

    # PRESS for this component count = squared error summed over all folds.
    press_values[n_comp] = np.sum(sq_errors)

for n_comp, press in press_values.items():
    print(f"n_components = {n_comp:2d}, PRESS = {press:.6f}")

# min() over the dict keys returns the first (smallest) count on ties.
best_n_components = min(press_values, key=press_values.get)
print("\nBest n_components (min PRESS):", best_n_components)


In [None]:

# ============================================================
# 5. PRESS plot
# ============================================================
# One point per candidate component count, in the order they were evaluated.
component_counts = list(press_values)
press_scores = [press_values[k] for k in component_counts]

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(component_counts, press_scores, marker="o")
ax.set_xlabel("Number of PLS components")
ax.set_ylabel("PRESS")
ax.set_title("PRESS vs Number of Components (PLS)")
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:

# ============================================================
# 6. Refit with the selected number of components
# ============================================================
# Final model: fitted on every sample that has a known target value.
pls_opt = PLSRegression(n_components=best_n_components, scale=True)
pls_opt.fit(X, y)

# In-sample predictions, flattened to 1-D to line up with y.
y_pred_train = pls_opt.predict(X).ravel()

# These metrics are computed on the same data the model was fitted on.
mse = mean_squared_error(y, y_pred_train)
rmse = np.sqrt(mse)
r2 = r2_score(y, y_pred_train)

print(f"R^2 (all known EA samples)  : {r2:.4f}")
print(f"RMSE (all known EA samples): {rmse:.4f} eV")


In [None]:

# ============================================================
# 7. Diagnostic plot (parity plot)
# ============================================================
fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(y, y_pred_train, alpha=0.8)

# Dashed y = x reference line spanning the combined range of observed and
# predicted values.
min_val = min(y.min(), y_pred_train.min())
max_val = max(y.max(), y_pred_train.max())
diagonal = [min_val, max_val]
ax.plot(diagonal, diagonal, "k--")

ax.set_xlabel("Observed EA (eV)")
ax.set_ylabel("Predicted EA (eV)")
ax.set_title(f"PLS Regression (n_components={best_n_components})")
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:

# ============================================================
# 8. Residuals vs Predicted
# ============================================================
# Residual sign convention: observed minus predicted.
residuals = y - y_pred_train

fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(y_pred_train, residuals, alpha=0.8)
# Zero line: points above it are under-predicted, points below over-predicted.
ax.axhline(0, color="k", linestyle="--")
ax.set_xlabel("Predicted EA (eV)")
ax.set_ylabel("Residual (Observed - Predicted)")
ax.set_title("Residuals vs Predicted")
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:

# ============================================================
# 9. Residual histogram
# ============================================================
# Distribution of the in-sample residuals computed in the previous cell.
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(residuals, bins=10, edgecolor="k")
ax.set_xlabel("Residual (Observed - Predicted)")
ax.set_ylabel("Count")
ax.set_title("Histogram of Residuals")
fig.tight_layout()
plt.show()


In [None]:

# ============================================================
# 10. Predictions for every sample
# ============================================================
# Predict for all rows of the original frame, including those whose target
# value is missing and were therefore excluded from fitting.
X_all_samples = df[descriptor_cols].values
EA_pred_all = pls_opt.predict(X_all_samples).ravel()

# Work on a copy so the loaded frame stays untouched; the prediction is
# appended as a new column.
pred_col = "EA_pred (eV)"
df_pred = df.copy()
df_pred[pred_col] = EA_pred_all

preview_cols = [index_col, label_col, target_col, pred_col]
display(df_pred.loc[:, preview_cols].head(10))

# Written relative to the current working directory, without the row index.
out_path = "olivineDataset_withEA_PLS_pred.csv"
df_pred.to_csv(out_path, index=False)
print("Saved prediction results to:", out_path)
