In [3]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import numpy as np
from lib.utils import get_cleaned_normalized_data, load_all_resale_data, get_train_split_data


# Read the combined resale dataset and pass it straight through the project's
# cleaning/normalization helper (both come from lib.utils).
X, y = get_cleaned_normalized_data(*load_all_resale_data())

# Split into train/test partitions; 0.2 is forwarded unchanged to the helper
# (presumably the held-out fraction -- confirm in lib.utils).
X_train, X_test, y_train, y_test = get_train_split_data(X, y, 0.2)

# Section banner for the PCA dimensionality-reduction analysis that follows.
print(
    "\n\n" + "=" * 50,
    "PCA Dimensionality Reduction Analysis",
    "=" * 50,
    sep="\n",
)

Loading data from /Users/amonsisowath/VSC/Uni/Sing2425/CS3244/Assignment1/scripts/lib/../data/Resale Flat Prices (Based on Approval Date), 1990 - 1999.csv...
Loading data from /Users/amonsisowath/VSC/Uni/Sing2425/CS3244/Assignment1/scripts/lib/../data/Resale Flat Prices (Based on Approval Date), 2000 - Feb 2012.csv...
Loading data from /Users/amonsisowath/VSC/Uni/Sing2425/CS3244/Assignment1/scripts/lib/../data/Resale Flat Prices (Based on Registration Date), From Mar 2012 to Dec 2014.csv...
Loading data from /Users/amonsisowath/VSC/Uni/Sing2425/CS3244/Assignment1/scripts/lib/../data/Resale Flat Prices (Based on Registration Date), From Jan 2015 to Dec 2016.csv...
Loading data from /Users/amonsisowath/VSC/Uni/Sing2425/CS3244/Assignment1/scripts/lib/../data/Resale flat prices based on registration date from Jan-2017 onwards.csv...
Combined dataset shape: (948962, 11)
Features shape: (948962, 10)
Target shape: (948962,)
Selected features: month, town, flat_type, block, street_name, storey

In [None]:
# Sweep over several PCA sizes and, for each one, fit a PCA -> LinearRegression
# pipeline, then record training, test and 5-fold cross-validated R².
# NOTE(review): scikit-learn's PCA rejects n_components larger than
# min(n_samples, n_features); confirm X_train has at least 3000 features.
n_components_list = [50, 100, 500, 1000, 1500, 2000, 3000]

# Fixed seed so the randomized SVD solver PCA may pick yields the same
# scores on every Restart & Run All.
PCA_RANDOM_STATE = 42

pca_train_r2 = []
pca_test_r2 = []
pca_cv_scores = []

for n in n_components_list:
    # Pipeline keeps the PCA fit inside each CV fold, so the folds' validation
    # rows never influence the projection they are scored with.
    pca_pipe = Pipeline([
        ('pca', PCA(n_components=n, random_state=PCA_RANDOM_STATE)),
        ('regression', LinearRegression())
    ])

    # Fit on the training partition only
    pca_pipe.fit(X_train, y_train)

    # R² on the data the model saw and on the held-out partition
    train_r2 = r2_score(y_train, pca_pipe.predict(X_train))
    test_r2 = r2_score(y_test, pca_pipe.predict(X_test))
    pca_train_r2.append(train_r2)
    pca_test_r2.append(test_r2)

    # Mean 5-fold CV R² on the training partition; cross_val_score fits
    # clones, so the pipeline fitted above is left untouched.
    cv_score = cross_val_score(pca_pipe, X_train, y_train, cv=5, scoring='r2').mean()
    pca_cv_scores.append(cv_score)

    # Share of total variance retained by the fitted PCA step, in percent
    pca = pca_pipe.named_steps['pca']
    explained_variance = pca.explained_variance_ratio_.sum() * 100

    print(f"PCA with {n} components:")
    print(f"  Explained variance: {explained_variance:.2f}%")
    print(f"  Training R²: {train_r2:.4f}")
    print(f"  Test R²: {test_r2:.4f}")
    print(f"  CV R²: {cv_score:.4f}")
    print("-" * 50)

# Plot the three R² curves against the number of components
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(n_components_list, pca_train_r2, 'o-', label='Training R²')
ax.plot(n_components_list, pca_test_r2, 'o-', label='Test R²')
ax.plot(n_components_list, pca_cv_scores, 'o-', label='CV R²')
ax.set_xlabel('Number of PCA Components')
ax.set_ylabel('R² Score')
ax.set_title('Model Performance with PCA Dimensionality Reduction')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

# Pick the component count by cross-validation score rather than test score:
# choosing a hyperparameter on the test set would make the reported test R²
# an optimistic estimate.
optimal_idx = int(np.argmax(pca_cv_scores))
optimal_n = n_components_list[optimal_idx]

print(f"Optimal number of PCA components: {optimal_n}")


PCA results for different numbers of components:

[50, 100, 500, 1000, 1500, 2000, 3000]
[0.849174028731681, 0.8508046638967771, 0.8543933925998407, 0.8616923092087314, 0.8638630522567969, 0.8665049922760374]

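PCA with 50 components is sufficient: the scores listed above only rise from about 0.849 to about 0.867 as more components are added, while 50 components reduce the feature size by a lot.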

In [4]:
# Project the full feature matrix X onto 50 principal components and save the
# result to X_pca.npy in the current working directory.
# random_state pins any randomness in the SVD solver so the saved file is
# reproducible across reruns.
# NOTE(review): the PCA here is fitted on all of X (train and test rows
# together). If X_pca.npy is later evaluated on a held-out split, consider
# fitting on X_train only and transforming X -- confirm the intended use.
pca = PCA(n_components=50, random_state=42)
X_pca = pca.fit_transform(X)
np.save('X_pca.npy', X_pca)