## **Principal Component Analysis (PCA)**

Import libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
np.random.seed(1)  # seed the global RNG so the generated cloud is reproducible

# A random 2x2 mixing matrix applied to 200 standard-normal 2-D draws;
# the transpose puts one sample per row.
mixing = np.random.random(size=(2, 2))
latent = np.random.normal(size=(2, 200))
X = np.dot(mixing, latent).T
print(X.shape)

# X.T unpacks into the two feature columns (x-coordinates, y-coordinates).
plt.plot(*X.T, 'o')
plt.axis('equal')
plt.show()

Let's find the principal components:

In [None]:
from sklearn.decomposition import PCA

# Fit a two-component PCA on the synthetic cloud (fit returns the estimator).
pca = PCA(n_components=2).fit(X)

# Variance captured along each principal axis (the eigenvalues). These are
# absolute variances, not percentages; the fractions are in
# pca.explained_variance_ratio_.
print(pca.explained_variance_)
# Principal axes in feature space (the eigenvectors), one per row.
print(pca.components_)

In [None]:
plt.plot(X[:, 0], X[:, 1], 'o', alpha=0.5)

# Draw each principal axis as a segment that starts at the data mean (the
# point PCA centers on) and extends 3 standard deviations along that axis.
# explained_variance_ is the variance along each axis, so its square root is
# a standard deviation in data units. explained_variance_ratio_ is a unitless
# fraction and would give lengths unrelated to the scale of the data.
center = pca.mean_
for variance, direction in zip(pca.explained_variance_, pca.components_):
    tip = center + direction * 3 * np.sqrt(variance)
    plt.plot([center[0], tip[0]], [center[1], tip[1]], '-k', lw=3)
plt.axis('equal');

Let's see what our data look like if we only keep 95% of the variance:

In [None]:
# A float n_components in (0, 1) asks PCA to keep just enough components
# to explain at least that fraction of the variance (here 95%).
pca95 = PCA(n_components=0.95)
X_trans = pca95.fit_transform(X)  # fit on X and project X in one step

# Compare the shape before and after the projection.
print(X.shape)
print(X_trans.shape)

By keeping 95% of the variance (i.e., discarding only 5% of it), the data is compressed to 50% of its original size!

Let's see what the data look like after this compression:

In [None]:
# Map the reduced representation back into the original feature space.
X_new = pca95.inverse_transform(X_trans)

# Faint markers: original points. Solid blue markers: their reconstruction.
for points, fmt, opacity in ((X, 'o', 0.2), (X_new, 'ob', 0.8)):
    plt.plot(points[:, 0], points[:, 1], fmt, alpha=opacity)
plt.axis('equal');

# **Breast Cancer Wisconsin (Diagnostic) Dataset**
*   569 instances (212 Malignant, 357 Benign)
*   30 numerical features (computed from a digitized image of a breast mass)
*   2 classes (Malignant, Benign)


Import libraries

In [None]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics
import matplotlib.pyplot as plt

Prepare data

In [None]:
wisconsin = datasets.load_breast_cancer()

# test_size=0.7 holds out 70% of the samples for testing, so only 30% are
# used for training; random_state=0 makes the split reproducible.
TrainX, TestX, TrainY, TestY = train_test_split(
    wisconsin.data,
    wisconsin.target,
    test_size=0.7,
    random_state=0,
)

Add random noise: append 10 standard-normal noise features to every sample

In [None]:
# Number of standard-normal noise columns appended to every sample.
n_noise = 10

# Noise is drawn for the training split first, then for the test split,
# from the global NumPy RNG.
rTrainX = np.hstack((TrainX, np.random.randn(TrainX.shape[0], n_noise)))
rTestX = np.hstack((TestX, np.random.randn(TestX.shape[0], n_noise)))
print(rTrainX.shape)

Classification

In [None]:
# Linear-kernel SVM trained on the noise-augmented features.
svmmodel = svm.SVC(kernel="linear", probability=True).fit(rTrainX, TrainY)

# Accuracy on the samples the model was fitted on.
tr_pred = svmmodel.predict(rTrainX)
tr_acc = metrics.accuracy_score(y_true=TrainY, y_pred=tr_pred)
print('Training Accuracy : ', tr_acc)

# Accuracy on the held-out samples.
ts_pred = svmmodel.predict(rTestX)
ts_acc = metrics.accuracy_score(y_true=TestY, y_pred=ts_pred)
print('Test Accuracy : ', ts_acc)

PCA

In [None]:
# Fit the projection on the training split only, then apply that same
# projection to both splits so the test data does not influence the fit.
# NOTE(review): the features are not rescaled before PCA, so the components
# follow the features with the largest raw variance. Confirm this is intended.
pca = PCA(n_components=5).fit(rTrainX)
TrainX_trans = pca.transform(rTrainX)
TestX_trans = pca.transform(rTestX)

# Show the first five projected samples of each split.
print(TrainX_trans[:5])
print(TestX_trans[:5])

In [None]:
# Same linear-kernel SVM as before, now trained on the PCA-reduced features.
svmmodel2 = svm.SVC(kernel="linear", probability=True).fit(TrainX_trans, TrainY)

# Accuracy on the samples the model was fitted on.
tr_pred2 = svmmodel2.predict(TrainX_trans)
tr_acc2 = metrics.accuracy_score(y_true=TrainY, y_pred=tr_pred2)
print('Training Accuracy : ', tr_acc2)

# Accuracy on the held-out samples.
ts_pred2 = svmmodel2.predict(TestX_trans)
ts_acc2 = metrics.accuracy_score(y_true=TestY, y_pred=ts_pred2)
print('Test Accuracy : ', ts_acc2)