In [1]:
import os
import pandas as pd
import numpy as np
from numpy.polynomial.legendre import legval, legvander

In [2]:
inpath = 'D:\\OneDrive - University of Missouri\\transfer_desktop\MU\\2025S_Mod_chapter2\\data_clean'
savedir = 'D:\\OneDrive - University of Missouri\\transfer_desktop\MU\\project 3\\1_try\\output'
os.makedirs(savedir, exist_ok=True)

In [3]:
# Import data
# Read the cleaned main file, coerce the calendar columns to integers,
# parse the timestamp column, and add the log of `load` as `logy`.
data = pd.read_csv(os.path.join(inpath, 'NC_main.csv'), index_col=None)
data = data.astype({'Year': int, 'Month': int, 'Day': int, 'Hour': int})
data = data.assign(
    datetime_UTC=pd.to_datetime(data['datetime_UTC']),
    #yyyymm=data['Year'].astype(str) + '-' + data['Month'].astype(str).str.zfill(2),
    logy=np.log(data['load']),  # NOTE(review): assumes load > 0; otherwise this yields -inf/NaN
)

In [4]:
# Subsample
# Keep only rows with Hour == 0 that fall in April or May (nobs=1403 per the
# original note), ordered by timestamp with a fresh 0..n-1 index.
is_hour0 = data['Hour'].eq(0)
is_apr_may = data['Month'].isin([4, 5])
dt0 = (
    data.loc[is_hour0 & is_apr_may]
    .sort_values(by='datetime_UTC')
    .reset_index(drop=True)
)

In [6]:
###############################################
#   Supervised FPCA - determine the order M   #
###############################################
from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [None]:
# Normalize temperature to [-1, 1] for Legendre basis
def normal2(x):
    """Linearly rescale ``x`` so its minimum maps to -1 and its maximum to +1.

    Parameters
    ----------
    x : array-like exposing ``.min()`` / ``.max()`` and elementwise arithmetic
        (e.g. a pandas Series or a NumPy array).

    Returns
    -------
    Same container type as ``x`` with values ``2*(x - min)/(max - min) - 1``.
    If every value of ``x`` is identical (zero range), zeros are returned
    instead of the NaN/inf that a division by zero would produce.
    """
    x_min, x_max = x.min(), x.max()
    span = x_max - x_min
    if span == 0:
        # Constant input: 0/0 would give NaN and poison the Legendre design matrix.
        # Multiplying by 0.0 keeps the container type/shape and returns float zeros.
        return (x - x_min) * 0.0
    return 2*(x-x_min)/span - 1

# Response (log load) and regressor (temperature rescaled to [-1, 1])
y = dt0['logy']
x_grid = normal2(dt0['temperature'])
# Legendre pseudo-Vandermonde matrix: one column per degree 0..99 -> shape (T, 100)
X_poly = legvander(x_grid, 99)

# Center or scale the data
#X_cent = X_poly - X_poly.mean(axis=0) # just center the data
# Standardize every column (center + scale to unit variance)
scaler = StandardScaler(with_mean=True, with_std=True)
scaler.fit(X_poly)
X_scale = scaler.transform(X_poly)

########################################
# CV selection of number of components #
########################################
# For each candidate number of PLS components (1..max_c), estimate the
# out-of-sample MSE by 5-fold CV and keep the count with the lowest mean MSE.
max_c = X_poly.shape[1]  # max number of components
kf = KFold(n_splits=5, shuffle=True, random_state=114514)
mse_cv = np.zeros(max_c)     # mean CV MSE per number of components
mse_cv_se = np.zeros(max_c)  # standard error of the fold MSEs (not used for selection here)

# KFold yields *positional* indices, whereas `y[idx]` on a pandas Series is a
# label lookup that is only correct when the index is exactly 0..n-1.
# Work on a plain array so the fold selection is always positional.
y_cv = np.asarray(y)
# With a fixed integer random_state every call to kf.split returns the same
# folds, so compute them once and reuse them for every candidate.
cv_folds = list(kf.split(X_scale))

for n_cp in range(1, max_c+1):
    fold_mses = []
    for train_idx, test_idx in cv_folds:
        pls = PLSRegression(n_components=n_cp)
        pls.fit(X_scale[train_idx], y_cv[train_idx])
        yhat = pls.predict(X_scale[test_idx]).ravel()
        fold_mses.append(mean_squared_error(y_cv[test_idx], yhat))
    fold_mses = np.array(fold_mses)
    mse_cv[n_cp-1] = fold_mses.mean()
    mse_cv_se[n_cp-1] = fold_mses.std(ddof=1) / np.sqrt(len(fold_mses))

k_select = int(np.argmin(mse_cv) + 1) # minimize MSE (+1: array position -> component count)
print(f'Selected number of components: {k_select}')

# Refit PLS on the full sample using the CV-selected number of components,
# then pull out the fitted scores and weights for the steps below.
pls_final = PLSRegression(n_components=k_select).fit(X_scale, y)
x_scores = pls_final.x_scores_ # shape (T, k_select)
x_weights = pls_final.x_weights_ # shape (100, k_select)

In [None]:
# Save results
# Component column names shared by both output tables.
pc_cols = [f'PC{j+1}' for j in range(k_select)]

# Weights: one row per Legendre degree, one column per PLS component.
df_weights = pd.DataFrame(x_weights, columns=pc_cols)
# Derive the degree labels from the weight matrix itself instead of a
# hard-coded 100, so this stays correct if the basis degree is changed.
df_weights['degree'] = np.arange(x_weights.shape[0])
df_weights.to_csv(os.path.join(savedir, 'x_weights.csv'), index=False)

# Scores: one row per observation, plus the timestamp and response.
df_scores = pd.DataFrame(x_scores, columns=pc_cols)
# These assignments align on the index; dt0 (and hence y) carries a reset
# 0..n-1 index, which matches the default index of df_scores.
df_scores['datetime_UTC'] = dt0['datetime_UTC']
df_scores['logy'] = y
df_scores.to_csv(os.path.join(savedir, 'x_scores.csv'), index=False)

In [None]:
# Option (B) Select basis functions based on weights
# Rank the original degrees by the L2 norm of their weights across all components
import matplotlib.pyplot as plt

phi_coef = x_weights.copy()
row_norm = np.sqrt(np.sum(phi_coef**2, axis=1))  # one norm per row (= degree) of the weights
order = np.argsort(row_norm)[::-1] # descending order
labels = order.astype(str)         # degree label for each rank position
row_norm = row_norm[order]         # from here on row_norm is sorted, largest first

# Plot the sorted norms against their rank; each point is annotated with its degree.
# The number of points comes from the data rather than a hard-coded 100.
n_deg = len(row_norm)
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(np.arange(n_deg), row_norm)
for i in range(n_deg):
    ax.text(i, row_norm[i], labels[i], fontsize=9, ha='right', va='bottom')
ax.set_xlabel('Ranking')
ax.set_ylabel('L2 norm of weights')
fig.savefig(os.path.join(savedir, 'degreesbynorm.png'), dpi=300)

In [None]:
# Visually cut at the top six
# Degrees picked by eye from the ranking plot; the order here sets the column order below.
top_degrees = [1, 2, 4, 7, 6, 5]
X_b = X_poly[:, top_degrees] # unscaled Legendre columns for the selected degrees

# Save the selected basis functions
# Columns are phi<degree>, followed by the timestamp and response
# (both aligned on dt0's reset 0..n-1 index).
basis_cols = ['phi' + str(d) for d in top_degrees]
df_basis = pd.DataFrame(X_b, columns=basis_cols).assign(
    datetime_UTC=dt0['datetime_UTC'],
    logy=y,
)
df_basis.to_csv(os.path.join(savedir, 'x_basis_topdegree.csv'), index=False)