In [30]:
import numpy as np

np.random.seed(4)
m = 60
w1, w2 = 0.1, 0.3
noise = 0.1

angles = np.random.rand(m) * 3 * np.pi / 2 - 0.5
X = np.empty((m, 3))
X[:, 0] = np.cos(angles) + np.sin(angles)/2 + noise * np.random.randn(m) / 2
X[:, 1] = np.sin(angles) * 0.7 + noise * np.random.randn(m) / 2
X[:, 2] = X[:, 0] * w1 + X[:, 1] * w2 + noise * np.random.randn(m)

X_centered = X - X.mean(axis=0)
U, s, Vt = np.linalg.svd(X_centered)
c1 = Vt.T[:, 0]    #first principal compoment and c2 is second
c2 = Vt.T[:, 1]

W2 = Vt.T[:,:2]
X2D = X_centered.dot(W2)



In [49]:
#Using ScikitLearns method
from sklearn.decomposition import PCA

pca = PCA(n_components = 2)
X2D = pca.fit_transform(X)
pca.explained_variance_ratio_

#to use explained variance ratio
new_pca = PCA()
new_pca.fit(X)
cumsum = np.cumsum(new_pca.explained_variance_ratio_)
print(cumsum)
d = np.argmax(cumsum >= .95) + 1 #this the number of dimensions we should reduce to keep maximum variance

array([0.84248607, 0.14631839])

In [61]:
pca = PCA(n_components = 0.95)
X_reduced = pca.fit_transform(X)


In [63]:
#using mnist dataset

try:
    from sklearn.datasets import fetch_openml
    mnist = fetch_openml('mnist_784', version=1, cache=True)
except ImportError:
    from sklearn.datasets import fetch_mldata
    mnist = fetch_mldata('MNIST original')

X = mnist.data
y = mnist.target


pca = PCA(n_components = 0.95)
X_reduced = pca.fit_transform(X)
print(X_reduced.shape)
X_recovered = pca.inverse_transform(X_reduced)
X_recovered.shape

(70000, 154)


(70000, 784)

In [65]:
from sklearn.decomposition import IncrementalPCA

X_train, y_train = mnist.data[:40000,:], mnist.target[:40000]
X_val, y_val = mnist.data[40000:50000, :],  mnist.target[40000:50000]
X_test, y_test = mnist.data[50000:, :], mnist.target[50000:]



n_batches = 100
inc_pca = IncrementalPCA(n_components=154)
for X_batch in np.array_split(X_train, n_batches):
    print(".", end="") # not shown in the book
    inc_pca.partial_fit(X_batch)

X_reduced = inc_pca.transform(X_train)

....................................................................................................

In [None]:
from sklearn.datasets import make_swiss_roll
X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)

rbf_pca = KernelPCA(n_components = 2, kernel="rbf", gamma=0.04)
X_reduced = rbf_pca.fit_transform(X)

In [37]:
#to compute preimage error, we first calculate best params by gridsearch, then transform dataset and 
#using fit_inverse_transform=True, we can reverse the transformation and caluclate the mean error
from sklearn.decomposition import KernelPCA
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

clf = Pipeline([
        ("kpca", KernelPCA(n_components=2)),
        ("log_reg", LogisticRegression(solver="liblinear"))
    ])

param_grid = [{
        "kpca__gamma": np.linspace(0.03, 0.05, 10),
        "kpca__kernel": ["rbf"]#, "sigmoid"]
    }]

grid_search = GridSearchCV(clf, param_grid, cv=2)


In [38]:

try:
    from sklearn.datasets import fetch_openml
    mnist = fetch_openml('mnist_784', version=1, cache=True)
except ImportError:
    from sklearn.datasets import fetch_mldata
    mnist = fetch_mldata('MNIST original')
#X_train = [np.arange(5), np.arange(5,10), np.arange(10,15), np.arange(15,20), np.arange(20,25),np.arange(5), np.arange(5,10), np.arange(10,15), np.arange(15,20), np.arange(20,25)]
#y_train = [1,0,1,1,1,0,0,0,1,0]

X_train, y_train = mnist.data[:10000,:], mnist.target[:10000]
grid_search.fit(X_train, y_train)

# Kpca = kernelPCA(n_components = 2, kernel = 'rbf', gamma = 0.0433, fit_inverse_transform = True)
# X_reduced = kpca.fit_transform(X_train)
# preimage = kpca.inverse_transform(X_train)

# from sklearn.metrics import mean_squared_error
# mean_squared_error(X_train, preimage)




GridSearchCV(cv=2, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('kpca', KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',
     fit_inverse_transform=False, gamma=None, kernel='linear',
     kernel_params=None, max_iter=None, n_components=2, n_jobs=None,
     random_state=None, remove_zero_eig=False, tol=0)), ('log_reg', LogisticRe...ty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'kpca__gamma': array([0.03   , 0.03222, 0.03444, 0.03667, 0.03889, 0.04111, 0.04333,
       0.04556, 0.04778, 0.05   ]), 'kpca__kernel': ['rbf']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [39]:
grid_search.best_params_

{'kpca__gamma': 0.03, 'kpca__kernel': 'rbf'}

In [6]:
#Question 9

from sklearn.ensemble import RandomForestClassifier
import time


try:
    from sklearn.datasets import fetch_openml
    mnist = fetch_openml('mnist_784', version=1, cache=True)
except ImportError:
    from sklearn.datasets import fetch_mldata
    mnist = fetch_mldata('MNIST original')

X_train_f, y_train_f = mnist.data[:60000,:], mnist.target[:60000]
X_val_f, y_val_f = mnist.data[40000:50000, :],  mnist.target[40000:50000]
X_test_f, y_test_f = mnist.data[60000:, :], mnist.target[60000:]



Time for Randomforest on all features: 4.0 seconds
Testing score:  0.9492


In [12]:
#Trying with Randomforest classifier

rnd_clf = RandomForestClassifier(n_estimators = 10,  random_state = 42)
t1 = time.time()
rnd_clf.fit(X_train_f, y_train_f)
t2 = time.time()
print("Time for Randomforest on all features: {:.1f} seconds".format(t2 - t1))

print('Testing score: ', rnd_clf.score(X_test_f, y_test_f))


from sklearn.decomposition import PCA
rnd_clf2 = RandomForestClassifier(n_estimators = 10, random_state = 42)

pca = PCA(n_components = 0.95)
X_reduced = pca.fit_transform(X_train_f)

print('Number of components returned: ', pca.n_components_)
#pca_test = PCA(n_components = pca.n_components_)
#X_test_reduced = pca_test.fit_transform(X_test_f)
X_test_reduced = pca.transform(X_test_f)

t3 = time.time()
rnd_clf2.fit(X_reduced, y_train_f)
t4 = time.time()
print("Time for Randomforest on reduced features: {:.1f} seconds".format(t4 - t3))
print('testing score: ', rnd_clf2.score(X_test_reduced, y_test_f))

#Results: PCA is actually reducing accuracy and increasing time. so lets try another algo

Time for Randomforest on all features: 4.0 seconds
Testing score:  0.9492
Number of components returned:  154
Time for Randomforest on reduced features: 14.2 seconds
testing score:  0.9009


In [11]:
#Trying with MLP
from sklearn.neural_network import MLPClassifier
mlp_clf = MLPClassifier(random_state = 42)
mlp_clf2 = MLPClassifier(random_state = 42)

t1 = time.time()
mlp_clf.fit(X_train_f, y_train_f)
t2 = time.time()
print("Time for MLP on all features: {:.1f} seconds".format(t2 - t1))

print('Testing score: ', mlp_clf.score(X_test_f, y_test_f))


from sklearn.decomposition import PCA

pca = PCA(n_components = 0.95)
X_reduced = pca.fit_transform(X_train_f)

print('Number of components returned: ', pca.n_components_)
X_test_reduced = pca.transform(X_test_f)

t3 = time.time()
mlp_clf2.fit(X_reduced, y_train_f)
t4 = time.time()
print("Time for Randomforest on reduced features: {:.1f} seconds".format(t4 - t3))
print('testing score: ', mlp_clf2.score(X_test_reduced, y_test_f))

#Results: PCA is actually reducing accuracy and increasing time. so lets try another algo

Time for MLP on all features: 216.6 seconds
Testing score:  0.9655
Number of components returned:  154
Time for Randomforest on reduced features: 64.1 seconds
testing score:  0.9716


In [22]:
X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)

NameError: name 'make_swiss_roll' is not defined

array([5, 6, 7, 8, 9])