In [2]:
# Symmetric matrix (like covariance)
import numpy as np 
A = np.array([[2, 0],
              [0, 3]])

eigen_vals, eigen_vecs = np.linalg.eig(A)
print("üîç Eigenvalues:", eigen_vals)
print("üß≠ Eigenvectors:\n", eigen_vecs)


üîç Eigenvalues: [2. 3.]
üß≠ Eigenvectors:
 [[1. 0.]
 [0. 1.]]


In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score
from numpy.linalg import matrix_rank

# 1Ô∏è‚É£ Simulate Data with Redundant Features
np.random.seed(42)
X_orig = np.random.randn(200, 20)
X_redundant = X_orig[:, :10] + 0.01 * np.random.randn(200, 10)  # Add linear dependencies
X = np.hstack([X_orig, X_redundant])
y = np.random.randint(0, 2, 200)

print("Feature count:", X.shape[1])
print("Matrix rank:", matrix_rank(X))  # Check rank before anything

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

# 2Ô∏è‚É£ Baseline: Logistic Regression (no fix)
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_std, y_train)
y_pred = lr.predict(X_test_std)
baseline_acc = accuracy_score(y_test, y_pred)
print("Baseline Accuracy:", baseline_acc)

# 3Ô∏è‚É£ PCA: Dimensionality Reduction
pca = PCA(n_components=matrix_rank(X))  # Use rank as optimal number of components
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

lr_pca = LogisticRegression(max_iter=1000)
lr_pca.fit(X_train_pca, y_train)
y_pred_pca = lr_pca.predict(X_test_pca)
pca_acc = accuracy_score(y_test, y_pred_pca)
print("PCA Accuracy:", pca_acc)

# 4Ô∏è‚É£ Variance Threshold (Feature Selection)
sel = VarianceThreshold(threshold=0.01)
X_train_sel = sel.fit_transform(X_train_std)
X_test_sel = sel.transform(X_test_std)

lr_sel = LogisticRegression(max_iter=1000)
lr_sel.fit(X_train_sel, y_train)
y_pred_sel = lr_sel.predict(X_test_sel)
sel_acc = accuracy_score(y_test, y_pred_sel)
print("Feature Selection Accuracy:", sel_acc)

# 5Ô∏è‚É£ Ridge Classifier (Regularized Linear Model)
ridge = RidgeClassifier(alpha=1.0)
ridge.fit(X_train_std, y_train)
y_pred_ridge = ridge.predict(X_test_std)
ridge_acc = accuracy_score(y_test, y_pred_ridge)
print("Ridge Classifier Accuracy:", ridge_acc)

# 6Ô∏è‚É£ Final Report
print("\n‚úÖ Final Model Comparison:")
df = pd.DataFrame({
    'Model': ['Baseline LR', 'PCA + LR', 'Variance Select + LR', 'Ridge Classifier'],
    'Accuracy': [baseline_acc, pca_acc, sel_acc, ridge_acc]
})
print(df.sort_values(by='Accuracy', ascending=False).reset_index(drop=True))


Feature count: 30
Matrix rank: 30
Baseline Accuracy: 0.5
PCA Accuracy: 0.5
Feature Selection Accuracy: 0.5
Ridge Classifier Accuracy: 0.5

‚úÖ Final Model Comparison:
                  Model  Accuracy
0           Baseline LR       0.5
1              PCA + LR       0.5
2  Variance Select + LR       0.5
3      Ridge Classifier       0.5


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from numpy.linalg import matrix_rank

# 1Ô∏è‚É£ Simulate Redundant Data with Learnable Signal
np.random.seed(42)
n_samples, n_features = 200, 30

# 20 original features
X_orig = np.random.randn(n_samples, 20)

# 10 redundant features = linear combos
X_redundant = X_orig[:, :10] + 0.001 * np.random.randn(n_samples, 10)

# Combine
X = np.hstack([X_orig, X_redundant])
true_rank = matrix_rank(X)
print(f"Feature count: {X.shape[1]}")
print(f"üìâ Matrix rank: {true_rank} (out of {X.shape[1]})")

# Create target using linear signal
true_weights = np.random.randn(X.shape[1])
y_score = X @ true_weights + 0.5 * np.random.randn(n_samples)
y = (y_score > np.median(y_score)).astype(int)

# 2Ô∏è‚É£ Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3Ô∏è‚É£ Standardize
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

# 4Ô∏è‚É£ Baseline Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_std, y_train)
baseline_acc = accuracy_score(y_test, lr.predict(X_test_std))

# 5Ô∏è‚É£ PCA Based on Rank
pca = PCA(n_components=true_rank)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

lr_pca = LogisticRegression(max_iter=1000)
lr_pca.fit(X_train_pca, y_train)
pca_acc = accuracy_score(y_test, lr_pca.predict(X_test_pca))

# 6Ô∏è‚É£ Ridge Regularization
ridge = RidgeClassifier(alpha=1.0)
ridge.fit(X_train_std, y_train)
ridge_acc = accuracy_score(y_test, ridge.predict(X_test_std))

# 7Ô∏è‚É£ Final Report
df = pd.DataFrame({
    'Model': ['Baseline LR', 'PCA + LR', 'Ridge Classifier'],
    'Accuracy': [baseline_acc, pca_acc, ridge_acc]
})
print("\n‚úÖ Final Model Comparison:")
print(df.sort_values(by='Accuracy', ascending=False).reset_index(drop=True))


Feature count: 30
üìâ Matrix rank: 30 (out of 30)

‚úÖ Final Model Comparison:
              Model  Accuracy
0  Ridge Classifier     0.950
1       Baseline LR     0.875
2          PCA + LR     0.875
