In [None]:
# Phase 0: Environment Setup
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from sklearn.datasets import load_iris, load_digits
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler
# Silence every warning category for the rest of the kernel session. This keeps
# cell output tidy, but it also hides library deprecation notices, so remove
# this line temporarily when debugging or upgrading dependencies.
warnings.filterwarnings('ignore')

# Marker so a full run shows that the setup cell finished.
print("Environment setup complete")

In [None]:
# Phase 1: Load the iris dataset and standardize its features
iris = load_iris()
X, y = iris.data, iris.target
feature_names = iris.feature_names

# Fit the scaler on the raw features, then apply it to the same data
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

print(f"Original dataset shape: {X_scaled.shape}")
print(f"Features: {feature_names}")
print(f"Classes: {np.unique(y).size}")

In [None]:
# Phase 2: PCA - project the standardized data onto 2 components
pca = PCA(n_components=2)
X_pca2 = pca.fit_transform(X_scaled)

# Share of total variance captured by each component, plus the running total
explained_var = pca.explained_variance_ratio_
cumsum_var = explained_var.cumsum()

print(f"Explained variance by each component: {explained_var}")
print(f"Cumulative explained variance: {cumsum_var}")
print(f"Total variance explained by 2 components: {cumsum_var[-1]:.2%}")

# Scatter of the projected samples, colored by class label
fig_pca2, ax_pca2 = plt.subplots(figsize=(8, 6))
scatter = ax_pca2.scatter(
    X_pca2[:, 0],
    X_pca2[:, 1],
    c=y,
    cmap='viridis',
    s=100,
    alpha=0.7,
    edgecolors='black',
)
ax_pca2.set_xlabel(f'PC1 ({explained_var[0]:.1%})')
ax_pca2.set_ylabel(f'PC2 ({explained_var[1]:.1%})')
ax_pca2.set_title('PCA: Iris Dataset (2 Components)')
fig_pca2.colorbar(scatter, ax=ax_pca2, label='Class')
ax_pca2.grid(alpha=0.3)
plt.show()

print(f"\nPCA components shape: {pca.components_.shape}")

In [None]:
# Phase 3: Analyze explained variance across all principal components
pca_full = PCA().fit(X_scaled)

var_ratio = pca_full.explained_variance_ratio_
cumsum_variance = np.cumsum(var_ratio)
component_ids = range(1, len(var_ratio) + 1)  # 1-based component numbers for the x axes

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
ax_individual, ax_cumulative = axes

# Left panel: variance ratio of each component on its own
ax_individual.bar(component_ids, var_ratio, alpha=0.7, color='blue')
ax_individual.set_xlabel('Principal Component')
ax_individual.set_ylabel('Explained Variance Ratio')
ax_individual.set_title('Individual Explained Variance')
ax_individual.grid(alpha=0.3)

# Right panel: running total, with the 95% reference line
ax_cumulative.plot(component_ids, cumsum_variance, 'bo-', linewidth=2, markersize=8)
ax_cumulative.axhline(y=0.95, color='r', linestyle='--', label='95% threshold')
ax_cumulative.set_xlabel('Number of Components')
ax_cumulative.set_ylabel('Cumulative Explained Variance')
ax_cumulative.set_title('Cumulative Explained Variance')
ax_cumulative.grid(alpha=0.3)
ax_cumulative.legend()

plt.tight_layout()
plt.show()

# argmax returns the first index where the mask is True; +1 converts it to a component count
n_components_95 = (cumsum_variance >= 0.95).argmax() + 1
print(f"Components needed for 95% variance: {n_components_95}")

In [None]:
# Phase 4: PCA with 3 components for 3D visualization
pca3 = PCA(n_components=3)
X_pca3 = pca3.fit_transform(X_scaled)
var_ratio_3d = pca3.explained_variance_ratio_

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# One point per sample, positioned by its three component scores and colored by class
scatter = ax.scatter(
    X_pca3[:, 0],
    X_pca3[:, 1],
    X_pca3[:, 2],
    c=y,
    cmap='viridis',
    s=100,
    alpha=0.7,
    edgecolors='black',
)

# Axis labels carry each component's share of the variance
ax.set_xlabel(f'PC1 ({var_ratio_3d[0]:.1%})')
ax.set_ylabel(f'PC2 ({var_ratio_3d[1]:.1%})')
ax.set_zlabel(f'PC3 ({var_ratio_3d[2]:.1%})')
ax.set_title('PCA: 3D Visualization')

fig.colorbar(scatter, ax=ax, label='Class')
plt.show()

print(f"Total variance explained: {var_ratio_3d.sum():.2%}")

In [None]:
# Phase 5: Feature Selection - SelectKBest
selector = SelectKBest(f_classif, k=2)
X_selected = selector.fit_transform(X_scaled, y)

# Per-feature scores and the positions of the features that were kept
scores = selector.scores_
selected_indices = selector.get_support(indices=True)

# Ranking table: one row per feature, flagged if it is among the selected indices
is_selected = np.isin(np.arange(len(feature_names)), selected_indices)
feature_ranking = pd.DataFrame(
    {'Feature': feature_names, 'Score': scores, 'Selected': is_selected}
).sort_values('Score', ascending=False)

print("Feature Selection (SelectKBest - Top 2):")
print(feature_ranking)

# Horizontal bars: selected features in green, the rest in gray
colors = feature_ranking['Selected'].map({True: 'green', False: 'gray'}).tolist()
fig_scores, ax_scores = plt.subplots(figsize=(10, 5))
ax_scores.barh(feature_ranking['Feature'], feature_ranking['Score'], color=colors)
ax_scores.set_xlabel('F-Score')
ax_scores.set_title('Feature Selection Scores')
ax_scores.grid(alpha=0.3, axis='x')
plt.show()

In [None]:
# Phase 6: Tree-based Feature Importance
# Fixed random_state so repeated runs build the same forest
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_scaled, y)

importances = rf.feature_importances_
importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
).sort_values('Importance', ascending=False)

print("\nRandom Forest Feature Importance:")
print(importance_df)

# Horizontal bar per feature, in the table's sorted order
fig_importance, ax_importance = plt.subplots(figsize=(8, 5))
ax_importance.barh(importance_df['Feature'], importance_df['Importance'], color='steelblue')
ax_importance.set_xlabel('Importance')
ax_importance.set_title('Random Forest Feature Importance')
ax_importance.grid(alpha=0.3, axis='x')
plt.show()

In [None]:
# Phase 7: Comparison - PCA vs t-SNE
# Use the 64-feature digits dataset, where the two 2-D embeddings differ visibly
digits = load_digits()
X_digits = digits.data
y_digits = digits.target

X_digits_scaled = StandardScaler().fit_transform(X_digits)

# PCA: linear projection onto the top 2 components
pca_digits = PCA(n_components=2)
X_pca_digits = pca_digits.fit_transform(X_digits_scaled)

# t-SNE: non-linear embedding; random_state fixes the otherwise stochastic result.
# The iteration count is left at the library default (1000) instead of passing
# `n_iter=1000`: scikit-learn 1.5 renamed that keyword to `max_iter`, so relying
# on the default keeps this cell working on both older and newer releases.
# perplexity=30 is also the default and is spelled out only for visibility.
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_tsne = tsne.fit_transform(X_digits_scaled)

# Side-by-side comparison, both panels colored by digit label with the same colormap
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

scatter1 = axes[0].scatter(X_pca_digits[:, 0], X_pca_digits[:, 1],
                           c=y_digits, cmap='tab10', s=50, alpha=0.6)
axes[0].set_title('PCA (2 Components)')
axes[0].set_xlabel('PC1')
axes[0].set_ylabel('PC2')
axes[0].grid(alpha=0.3)

scatter2 = axes[1].scatter(X_tsne[:, 0], X_tsne[:, 1],
                           c=y_digits, cmap='tab10', s=50, alpha=0.6)
axes[1].set_title('t-SNE (2 Components)')
axes[1].set_xlabel('t-SNE 1')
axes[1].set_ylabel('t-SNE 2')
axes[1].grid(alpha=0.3)

# One colorbar serves as the legend for both panels (same colormap and labels)
plt.colorbar(scatter2, ax=axes[1], label='Digit')
plt.tight_layout()
plt.show()

print(f"Original dimensions: {X_digits.shape[1]}")
# Report the embedding width from the data rather than a hard-coded literal
print(f"Reduced to {X_pca_digits.shape[1]} dimensions")

In [None]:
# Phase 8: Testing and Validation
# Each check appends a (name, passed, details) tuple to test_results; the loop at
# the bottom prints a PASS/FAIL report and the number of checks that passed.
test_results = []
n_samples = X_scaled.shape[0]  # expected row count, taken from the data instead of a literal 150

# Test 1: PCA variance explained
pca_test = PCA(n_components=2)
pca_test.fit(X_scaled)
variance_explained = np.sum(pca_test.explained_variance_ratio_)
test1 = variance_explained > 0.95
test_results.append(("Test 1: PCA Variance > 95%", test1, f"{variance_explained:.2%}"))

# Test 2: PCA components shape
pca_trans = PCA(n_components=2)
X_trans = pca_trans.fit_transform(X_scaled)
test2 = X_trans.shape == (n_samples, 2)
test_results.append(("Test 2: Transformed shape correct", test2, f"Shape: {X_trans.shape}"))

# Test 3: Feature selection works
selector_test = SelectKBest(f_classif, k=2)
X_sel = selector_test.fit_transform(X_scaled, y)
test3 = X_sel.shape == (n_samples, 2)
test_results.append(("Test 3: Feature selection shape", test3, f"Shape: {X_sel.shape}"))

# Test 4: No NaN values in PCA
# Uses its own model name so the full-PCA model fitted in Phase 3 is not rebound.
pca_nan_check = PCA()
X_full_pca = pca_nan_check.fit_transform(X_scaled)
nan_count = int(np.isnan(X_full_pca).sum())
test4 = nan_count == 0
# The details line reflects the outcome instead of always claiming success
nan_details = "All valid values" if test4 else f"{nan_count} NaN value(s) found"
test_results.append(("Test 4: No NaN in PCA output", test4, nan_details))

# Test 5: PCA preserves distances approximately
# Compares pairwise distances among the first 10 samples before and after the
# 2-component projection computed above for Test 2, so this cell does not
# depend on the Phase 2 output.
# NOTE(review): the flattened 10x10 matrices include the zero diagonal and every
# pair twice, which may inflate the correlation; consider using only the upper triangle.
dist_orig = euclidean_distances(X_scaled[:10])
dist_pca = euclidean_distances(X_trans[:10])
correlation = np.corrcoef(dist_orig.flatten(), dist_pca.flatten())[0, 1]
test5 = correlation > 0.7
test_results.append(("Test 5: Distance correlation > 0.7", test5, f"Correlation: {correlation:.3f}"))

# Print results
print("\n" + "="*60)
print("PRACTICAL 7: DIMENSIONALITY REDUCTION - TEST RESULTS")
print("="*60)
passed = 0
for test_name, result, details in test_results:
    status = "✅ PASS" if result else "❌ FAIL"
    print(f"{status} | {test_name}")
    print(f"       Details: {details}")
    if result:
        passed += 1

print(f"\nTotal: {passed}/{len(test_results)} tests passed")
print("="*60)

In [None]:
# Summary and Reflection
# Plain-text recap printed at the end of the notebook. The PCA bullet says
# "uncorrelated" rather than "independent": orthogonal components give
# uncorrelated scores, which is weaker than statistical independence.
summary = """
KEY LEARNINGS - PRACTICAL 7
===========================

1. DIMENSIONALITY REDUCTION TECHNIQUES:
   - PCA: Linear, interpretable, fast
   - t-SNE: Non-linear, better visualization, slower
   - Feature Selection: Interpretable, uses original features

2. PCA INSIGHTS:
   - Explained variance helps choose component count
   - PC1 + PC2 explain ~95% of iris variance
   - Components are orthogonal (uncorrelated)
   - Loadings show original feature contributions

3. FEATURE SELECTION:
   - F-score ranks features by class separability
   - Tree-based importance identifies non-linear relationships
   - Keeps original features for interpretability

4. WHEN TO USE EACH:
   - High-dimensional data (>50 features): PCA
   - Visualization needed: t-SNE or PCA
   - Interpretability critical: Feature selection
   - Performance comparison: Try all three

5. PRACTICAL APPLICATIONS:
   - Image compression with PCA
   - Gene expression analysis with feature selection
   - Data visualization with t-SNE
   - Curse of dimensionality mitigation
"""

print(summary)