# Iris Dataset: Data Visualization using Principal Component Analysis (PCA)

In [None]:
import pandas as pd 
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import datasets
from sklearn import model_selection
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the Iris dataset that ships with scikit-learn.
iris = datasets.load_iris()

In [None]:
# List the attributes exposed by the loaded dataset object.
dir(iris)

In [None]:
# Names of the target classes.
iris.target_names

In [None]:
# Names of the feature columns.
iris.feature_names

In [None]:
# Shape of the feature matrix (presumably samples x features).
iris.data.shape

In [None]:
# Distinct label values that occur in the target array.
np.unique(iris.target)

In [None]:
# Cast both the feature matrix and the label vector to 32-bit floats.
data, target = (
    values.astype(np.float32) for values in (iris.data, iris.target)
)

In [None]:
# Preview the first rows of the float32 feature matrix with named columns.
pd.DataFrame(data = data, columns = iris.feature_names).head()

In [None]:
# Standardize every feature before PCA so that no feature dominates the
# components merely because of its scale.
scaled_data = StandardScaler().fit_transform(data)

In [None]:
# Preview the first rows of the standardized features.
pd.DataFrame(data = scaled_data, columns = iris.feature_names).head()

In [None]:
# Configure PCA to keep only the first two principal components.
pca = PCA(n_components=2)

In [None]:
# Fit PCA on the standardized data and project the samples onto the two components.
principalComponents = pca.fit_transform(scaled_data)

In [None]:
# Wrap the PCA projection in a DataFrame with one named column per component.
principaldf = pd.DataFrame(
    principalComponents,
    columns=['Principal component 1', 'Principal component 2'],
)

In [None]:
# Preview the first rows of the two-component projection.
principaldf.head()

In [None]:
# Single-column DataFrame holding the class label of every sample.
targetdf = pd.DataFrame({"Iris Class": iris.target})

In [None]:
# Place the class label next to the two principal components, row by row,
# then preview the combined table.
finaldf = pd.concat((principaldf, targetdf), axis="columns")
finaldf.head()

In [None]:
# Scatter-plot the samples in the plane spanned by the first two principal
# components, using one colour per Iris class.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_xlabel('Principal Component 1', fontsize=13)
ax.set_ylabel('Principal Component 2', fontsize=13)
ax.set_title('2D Data Visualization after PCA', fontsize=15)

targets = np.unique(iris.target)
colors = ['b', 'g', 'r']
# The loop variable is deliberately not called `target`: that name already
# holds the float32 label array created earlier in the notebook, and a loop
# variable of the same name would silently overwrite it.
for class_id, color in zip(targets, colors):
    indicesToKeep = finaldf["Iris Class"] == class_id
    ax.scatter(
        finaldf.loc[indicesToKeep, 'Principal component 1'],
        finaldf.loc[indicesToKeep, 'Principal component 2'],
        color=color,
        s=50,
        # Bind the class name to its own series so the legend cannot drift
        # out of sync with the plotting order.
        label=iris.target_names[class_id],
    )
ax.legend()
ax.grid()

In [None]:
# Fraction of the total variance explained by each retained component;
# their sum is the share of variance kept by the 2-D projection.
pca.explained_variance_ratio_

After PCA, the dimensionality of the dataset was reduced from four to two.

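The first principal component alone explains about 73% of the variance; together, the two components retain roughly 96% of the information (the sum of the values in `pca.explained_variance_ratio_`).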

Let us try another dataset: this time we will use the famous Breast Cancer dataset.
We can load it directly from scikit-learn.

# Breast Cancer Data Visualization using PCA

In [None]:
# Load the breast cancer dataset that ships with scikit-learn.
bcancer=datasets.load_breast_cancer()

In [None]:
# List the attributes exposed by the loaded dataset object.
dir(bcancer)

Let's check how many classes we have in this dataset.

In [None]:
# Names of the target classes.
bcancer.target_names

There are **two** target classes in the breast cancer dataset, Malignant and Benign.

**Malignant** means **"Harmful"** whereas **Benign** means **"Not Harmful"**.

In [None]:
# Names of the feature columns.
bcancer.feature_names

In [None]:
# Shape of the feature matrix: (rows of samples, feature columns).
bcancer.data.shape

In the breast cancer dataset, there are 30 features or columns of data.

There are 569 rows of sample data or entries.

In [None]:
# Cast both the feature matrix and the label vector to 32-bit floats.
data, target = (
    values.astype(np.float32) for values in (bcancer.data, bcancer.target)
)

In [None]:
# Distinct label values that occur in the target array.
np.unique(bcancer.target)

In [None]:
# Preview the first rows of the float32 feature matrix with named columns.
pd.DataFrame(data = data, columns = bcancer.feature_names).head()

In [None]:
# Standardize every feature before PCA so that no feature dominates the
# components merely because of its scale.
scaled_data = StandardScaler().fit_transform(data)

In [None]:
# Preview the first rows of the standardized features.
pd.DataFrame(data = scaled_data, columns = bcancer.feature_names).head()

In [None]:
# Reduce the standardized features to two principal components, tabulate the
# projection with one named column per component, and preview it.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(scaled_data)
principaldf = pd.DataFrame(
    principalComponents,
    columns=['Principal component 1', 'Principal component 2'],
)
principaldf.head()

In [None]:
# Single-column DataFrame holding the class label of every sample.
targetdf = pd.DataFrame({"Breast Cancer Class": bcancer.target})

In [None]:
# Place the class label next to the two principal components, row by row,
# then preview the combined table.
finaldf = pd.concat((principaldf, targetdf), axis="columns")
finaldf.head()

In [None]:
# Scatter-plot the samples in the plane spanned by the first two principal
# components, using one colour per breast cancer class.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_xlabel('Principal Component 1', fontsize=13)
ax.set_ylabel('Principal Component 2', fontsize=13)
ax.set_title('2D Data Visualization after PCA', fontsize=15)

targets = np.unique(bcancer.target)
colors = ['r', 'b']
# The loop variable is deliberately not called `target`: that name already
# holds the float32 label array created earlier in the notebook, and a loop
# variable of the same name would silently overwrite it.
for class_id, color in zip(targets, colors):
    indicesToKeep = finaldf["Breast Cancer Class"] == class_id
    ax.scatter(
        finaldf.loc[indicesToKeep, 'Principal component 1'],
        finaldf.loc[indicesToKeep, 'Principal component 2'],
        color=color,
        s=50,
        # Bind the class name to its own series so the legend cannot drift
        # out of sync with the plotting order.
        label=bcancer.target_names[class_id],
    )
ax.legend()
ax.grid()

In [None]:
# Fraction of the total variance explained by each retained component;
# their sum is the share of variance kept by the 2-D projection.
pca.explained_variance_ratio_

After PCA, the dimensionality of the dataset was reduced from **thirty to two**.

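About **63%** of the information was retained: the first principal component explains roughly 44% of the variance and the second roughly 19% (the sum of the values in `pca.explained_variance_ratio_`).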

Despite the information lost, we can still see that the two breast cancer classes are **clearly separated** in the two-dimensional projection of the provided dataset.