In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt

In [None]:
# Load data
# Read the feature table, keep `band_gap` as the regression target, and use
# every remaining numeric column as a predictor.
INPUT = Path('features_gB4.csv')
df = pd.read_csv(INPUT)
Y = df['band_gap']
numeric_columns = df.select_dtypes(include=[np.number])
X = numeric_columns.drop(columns=['band_gap'])

In [None]:
# Split data
# Shuffle the row positions with a fixed seed, then cut them into
# 70% train / 20% validation / remainder test.
np.random.seed(42)
indices = np.arange(len(X))
np.random.shuffle(indices)
n_train = int(0.7 * len(X))
n_val = int(0.2 * len(X))
train_idx, val_idx, test_idx = np.split(indices, [n_train, n_train + n_val])
X_train, Y_train = X.iloc[train_idx], Y.iloc[train_idx]
X_val, Y_val = X.iloc[val_idx], Y.iloc[val_idx]
X_test, Y_test = X.iloc[test_idx], Y.iloc[test_idx]
# Write the untransformed targets, one single-column CSV per split.
for target_split, target_path in (
    (Y_train, 'train_target_gB4.csv'),
    (Y_val, 'validation_target_gB4.csv'),
    (Y_test, 'test_target_gB4.csv'),
):
    target_split.to_frame(name='band_gap').to_csv(target_path, index=False)

In [None]:
# Standardize features (zero mean, unit variance, fit on train only)
# The mean and population standard deviation (ddof=0) are taken from the
# training split alone and reused for validation and test.
mean = X_train.mean()
std = X_train.std(ddof=0)
# A feature that is constant over the training rows has std == 0. Dividing by
# it would give NaN (0/0) in train and inf/NaN in validation/test, and those
# values would then contaminate the covariance matrix used for PCA. Fall back
# to a unit scale for such columns so they are centered but left unscaled.
std = std.mask(std == 0, 1.0)
X_train_std = (X_train - mean) / std
X_val_std = (X_val - mean) / std
X_test_std = (X_test - mean) / std

In [None]:
# PCA (fit on train, apply to all)
def pca_numpy(X, n_components=None):
    """Compute principal axes of ``X`` from the eigen-decomposition of its covariance.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data matrix with one observation per row. It is mean-centered
        column-wise inside this function before the covariance is computed.
    n_components : int, optional
        If given, only the first ``n_components`` eigenvectors (those with the
        largest eigenvalues) are returned. ``None`` returns all of them.

    Returns
    -------
    eigvecs : numpy.ndarray of shape (n_features, k)
        Eigenvectors as columns, ordered by decreasing eigenvalue;
        ``k`` is ``n_components`` when given, otherwise ``n_features``.
    eigvals : numpy.ndarray of shape (n_features,)
        All eigenvalues in decreasing order. They are NOT truncated to
        ``n_components``, so callers can still compute explained-variance
        ratios over the full spectrum.

    Raises
    ------
    ValueError
        If ``X`` is not two-dimensional or has fewer than two rows (the
        sample covariance is undefined for a single observation).
    """
    X = np.asarray(X, dtype=float)
    if X.ndim != 2:
        raise ValueError(f'X must be 2-dimensional, got {X.ndim} dimension(s)')
    if X.shape[0] < 2:
        raise ValueError('X must contain at least 2 samples to estimate a covariance')
    X_centered = X - X.mean(axis=0)
    # np.cov collapses to a 0-d array when there is a single feature, which
    # np.linalg.eigh rejects; force a (n_features, n_features) matrix.
    cov = np.atleast_2d(np.cov(X_centered, rowvar=False))
    # eigh is used because the covariance matrix is symmetric.
    eigvals, eigvecs = np.linalg.eigh(cov)
    idx = np.argsort(eigvals)[::-1]
    eigvals, eigvecs = eigvals[idx], eigvecs[:, idx]
    # A covariance matrix is positive semi-definite, so negative eigenvalues
    # can only be round-off noise (e.g. for rank-deficient data). Clip them so
    # explained-variance ratios stay non-negative.
    eigvals = np.clip(eigvals, 0.0, None)
    if n_components is not None:
        eigvecs = eigvecs[:, :n_components]
    return eigvecs, eigvals

# Derive the principal axes from the standardized training features only.
eigvecs, eigvals = pca_numpy(X_train_std.values)
explained_variance = eigvals / np.sum(eigvals)
cum_var = explained_variance.cumsum()
# Choose number of components (e.g., 95% variance): the first position at
# which the cumulative explained variance reaches 0.95, counted from 1.
n_comp = (cum_var >= 0.95).argmax() + 1
print(f'Number of components for 95% variance: {n_comp}')
# Project every split onto the same leading training-set axes.
leading_axes = eigvecs[:, :n_comp]
X_train_pca, X_val_pca, X_test_pca = (
    np.dot(split_std, leading_axes)
    for split_std in (X_train_std, X_val_std, X_test_std)
)

In [None]:
# Save reduced datasets
# Each split is written as a CSV whose columns are named PC1..PC<n_comp>.
cols = [f'PC{i+1}' for i in range(n_comp)]
for reduced_split, reduced_path in (
    (X_train_pca, 'train_reduced_gB4.csv'),
    (X_val_pca, 'validation_reduced_gB4.csv'),
    (X_test_pca, 'test_reduced_gB4.csv'),
):
    pd.DataFrame(reduced_split, columns=cols).to_csv(reduced_path, index=False)

In [None]:
# Optionally, standardize the targets (fit on train only)
# The training split's mean and population standard deviation (ddof=0) are
# applied to all three splits.
y_mean, y_std = Y_train.mean(), Y_train.std(ddof=0)
Y_train_std, Y_val_std, Y_test_std = (
    (y_split - y_mean) / y_std for y_split in (Y_train, Y_val, Y_test)
)
# NOTE: these are the same file names the split cell uses for the raw targets,
# so running this cell replaces those files with the standardized values.
for scaled_target, scaled_path in (
    (Y_train_std, 'train_target_gB4.csv'),
    (Y_val_std, 'validation_target_gB4.csv'),
    (Y_test_std, 'test_target_gB4.csv'),
):
    pd.DataFrame(scaled_target, columns=['band_gap']).to_csv(scaled_path, index=False)