#### Data Preparation: 

In the preparation step, we load the two Arcene data files (training and validation), concatenate them into a single dataset, and standardize the features for the subsequent analysis.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Load the Arcene feature matrices (space-separated, no header row).
# The files are read from the current working directory; prefix the file
# names with a directory if your copy of the data lives elsewhere.
X_train = pd.read_csv('arcene_train.data', header=None, sep=' ')
X_valid = pd.read_csv('arcene_valid.data', header=None, sep=' ')

# Parsing with sep=' ' yields a trailing column that is entirely NaN
# (presumably because each row ends with a separator). Drop only columns
# that really are all NaN, so a file without that trailing separator does
# not silently lose its last real feature.
X_train = X_train.dropna(axis=1, how='all')
X_valid = X_valid.dropna(axis=1, how='all')

# Stack the training rows on top of the validation rows. ignore_index gives
# the combined frame a unique 0..n-1 row index instead of repeating each
# file's own 0-based index.
X = pd.concat([X_train, X_valid], axis=0, ignore_index=True)

# Standardize every feature (zero mean, unit variance) so that PCA is not
# dominated by features with a large numeric range.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("Data loaded and preprocessed.")

#### PCA

In [None]:
# Plotting libraries used by this cell; imported here so the cell does not
# fail with NameError on `plt` / `sns`.
import matplotlib.pyplot as plt
import seaborn as sns

# Class labels used to colour and style the scatter plot.
# NOTE(review): assumes the standard Arcene label files (one label per row)
# sit next to the .data files -- confirm the file names for your copy.
y_train = pd.read_csv('arcene_train.labels', header=None).iloc[:, 0]
y_valid = pd.read_csv('arcene_valid.labels', header=None).iloc[:, 0]
# Same order as the feature matrix: training rows first, then validation.
y = pd.concat([y_train, y_valid], ignore_index=True).to_numpy()
if len(y) != X_scaled.shape[0]:
    raise ValueError(
        f"Label count ({len(y)}) does not match sample count "
        f"({X_scaled.shape[0]})."
    )

# Apply PCA, keeping every component so the scree plot covers all of them
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Per-component share of the variance and its running total
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance_ratio)

# Plot 1: 2D scatter plot of first two principal components
plt.figure(figsize=(8, 6))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=y, palette='viridis', style=y)
plt.title('PCA: First Two Principal Components of Arcene Dataset')
plt.xlabel(f'PC1 ({explained_variance_ratio[0]*100:.2f}% variance)')
plt.ylabel(f'PC2 ({explained_variance_ratio[1]*100:.2f}% variance)')
plt.legend(title='Class', loc='best')
plt.savefig('pca_plot.png')
# Close the figure so it is written to disk only and its memory is released
plt.close()

# Plot 2: Scree plot for explained variance
plt.figure(figsize=(8, 6))
plt.plot(range(1, len(explained_variance_ratio) + 1), explained_variance_ratio, 'bo-', label='Explained Variance Ratio')
plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, 'ro-', label='Cumulative Variance')
plt.title('Scree Plot for PCA on Arcene Dataset')
plt.xlabel('Principal Component')
plt.ylabel('Variance Ratio')
plt.legend()
plt.grid(True)
plt.savefig('scree_plot.png')
plt.close()

# Print explained variance for first two components
print(f"Explained variance by PC1: {explained_variance_ratio[0]*100:.2f}%")
print(f"Explained variance by PC2: {explained_variance_ratio[1]*100:.2f}%")
# cumulative_variance[1] is the running total after the second component
print(f"Total variance by first two PCs: {cumulative_variance[1]*100:.2f}%")