In [None]:
import numpy as np
from scipy.stats import pearsonr
import sys
# Extend the module search path before the project-local import below,
# presumably so that the `data` package can be resolved.
# NOTE(review): the appended entry is an empty string, which Python treats as
# the current working directory — confirm this is the intended location.
sys.path.append('')

from data.data_retriever import Dataretreiver

In [3]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to project source files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Instantiate the project data retriever, passing reduce='pca_pure' and debug=True.
# NOTE(review): the meaning of `reduce` and `debug` is defined in
# data.data_retriever and is not visible here — confirm against that module.
data = Dataretreiver(reduce='pca_pure', debug=True)


In [None]:
# Cut the electricity frame off at the last index label of the wind frame so
# that it does not extend past the wind data.
# NOTE(review): `wind_df` and `elec_df` are not assigned in any visible cell;
# they presumably come from the retriever created above — make that assignment
# explicit so the notebook survives Restart & Run All.
# Assumes both frames share a sorted, comparable index — TODO confirm.
last_date = wind_df.index[-1]
elec_df = elec_df[:last_date]
# Display the final rows of both frames side by side as the cell output.
wind_df.tail(), elec_df.tail()

In [None]:
# Select the wind points whose series are most strongly (positively or
# negatively) correlated with the electricity price.
#
# corrwith computes the Pearson correlation of every wind column against the
# price series after aligning both on their index, and it ignores rows where
# either value is missing. The ranking therefore does not depend on the two
# frames being positionally identical or free of NaN.
price_corr = wind_df.corrwith(elec_df['price'])
correlations = price_corr.to_dict()

# A column without variance (or without any overlapping data) has an undefined
# correlation (NaN). NaN cannot be ordered meaningfully, so such columns are
# left out of the ranking; they remain visible in `correlations`.
sorted_cols = sorted(
    price_corr.dropna().items(),
    key=lambda item: abs(item[1]),
    reverse=True,
)

n_points = 10
top_columns = [col for col, _ in sorted_cols[:n_points]]
print(sorted_cols)
print(f"Top {n_points} wind points most correlated with price:")
for col in top_columns:
    print(f"{col}: correlation = {correlations[col]:.4f}")

# Keep only the selected wind points and attach the electricity columns to
# obtain the training table.
reduced_data = wind_df[top_columns]

full_training_data = reduced_data.join(elec_df)

print("\nReduced feature set (head):")
print(full_training_data.head())


In [None]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Pick a small set of representative wind points: standardise the series,
# embed every wind point in PCA space, cluster the points with KMeans and keep
# the point closest to each cluster centre.

# Standardise every wind point (column) so that differences in scale between
# points do not dominate the decomposition.
scaler = StandardScaler()
data_scaled = pd.DataFrame(
    scaler.fit_transform(wind_df),
    index=wind_df.index,
    columns=wind_df.columns
)

# Transposed view: one row (sample) per wind point, one column (feature) per
# row of wind_df.
point_profiles = data_scaled.T
n_available_points, n_profile_features = point_profiles.shape

# PCA cannot extract more components than min(n_samples, n_features), so the
# requested 10 is capped for small inputs instead of raising.
n_pca_components = min(10, n_available_points, n_profile_features)
# random_state pins the result when scikit-learn selects a randomized solver.
pca = PCA(n_components=n_pca_components, random_state=42)
pca_features = pca.fit_transform(point_profiles)

print(f"PCA explained variance ratio: {pca.explained_variance_ratio_.sum():.2%} total variance retained")

# KMeans needs at least as many samples as clusters.
n_clusters = min(10, n_available_points)
# n_init is set explicitly because its default differs between scikit-learn
# releases; fixing it keeps the clustering identical across versions.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
labels = kmeans.fit_predict(pca_features)

representatives = []
for cluster_id in range(n_clusters):
    cluster_indices = np.flatnonzero(labels == cluster_id)
    if cluster_indices.size == 0:
        # Duplicate points can leave a cluster without members; there is
        # nothing to represent, and argmin would fail on an empty array.
        continue
    cluster_points = pca_features[cluster_indices]
    center = kmeans.cluster_centers_[cluster_id]
    # Member with the smallest Euclidean distance to the cluster centre.
    distances = np.linalg.norm(cluster_points - center, axis=1)
    closest_idx = cluster_indices[np.argmin(distances)]
    representatives.append(wind_df.columns[closest_idx])

print("\nSelected representative wind points:")
print(representatives)

# Keep only the representative wind points and attach the electricity columns
# to obtain the training table.
reduced_data = wind_df[representatives]

full_training_data = reduced_data.join(elec_df)

print("\nReduced feature set (head):")
print(full_training_data.head())

# Scatter the wind points on the first two principal components, one series
# per cluster, and mark the cluster centres with black crosses.
fig, ax = plt.subplots(figsize=(8, 6))
for label in np.unique(labels):
    members = labels == label
    ax.scatter(
        pca_features[members, 0],
        pca_features[members, 1],
        label=f'Cluster {label}',
    )
centers = kmeans.cluster_centers_
ax.scatter(centers[:, 0], centers[:, 1], color='black', marker='x', label='Centers')
ax.set_title('Wind points clustered in PCA space')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
ax.legend()
plt.show()
