**DS 301: Applied Data Modeling and Predictive Analysis**

**Lecture 18 – PCA**

# PCA with the Iris dataset
Nok Wongpiromsarn, 28 September 2020

**Construct pandas dataframe from iris dataset**

In [None]:
import pandas as pd
import numpy as np
from sklearn import datasets

iris = datasets.load_iris()

# Build the dataframe from the numeric feature matrix alone so the feature
# columns keep their float dtype. Stacking the features together with the
# string species names through np.c_ would coerce every value to a string.
df = pd.DataFrame(data = iris['data'], columns = iris['feature_names'])

# Append the target as a column holding the species name of each sample
df['target'] = iris.target_names[iris['target']]
print(df.columns)

In [None]:
# Split the dataframe into the feature columns (x) and the target column (y)

features = [
    'sepal length (cm)',
    'sepal width (cm)',
    'petal length (cm)',
    'petal width (cm)',
]
x = df.loc[:, features]
y = df.loc[:, 'target']
x.head(n = 5)

**Standardizing the features**

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the features and transform them in one step, then wrap
# the resulting array in a dataframe that reuses the feature column names
scaler = StandardScaler()
x_scaled = pd.DataFrame(data = scaler.fit_transform(x), columns = features)
x_scaled.head(n = 5)

**Projecting to 2D**

In [None]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
component_names = ['principal component 1', 'principal component 2']

# Fit PCA on the scaled features and project them onto the two components
projected = pca.fit_transform(x_scaled)
x2d = pd.DataFrame(data = projected, columns = component_names)
x2d.head(n = 5)

In [None]:
# Show the fraction of variance explained by each retained component,
# followed by the total fraction explained by both together
variance_ratios = pca.explained_variance_ratio_
print(variance_ratios)
print(variance_ratios.sum())

**Constructing the final dataframe by concatenating x2d with the target**

In [None]:
# Place the target column next to the two principal-component columns
df = pd.concat(objs = [x2d, y], axis = 'columns')
df.head(n = 5)

**Visualizing 2D projection**

In [None]:
import matplotlib.pyplot as plt

colors = ['r', 'g', 'b']

# Draw one scatter series per species so each gets its own color and legend
# entry. The colors list is paired with the species names in order, so
# editing it above is enough to restyle the plot.
for target, color in zip(iris.target_names, colors):
    # Boolean mask selecting the rows that belong to the current species
    target_indices = df['target'] == target
    plt.scatter(df.loc[target_indices, 'principal component 1'],
                df.loc[target_indices, 'principal component 2'],
                c = color, s = 50, label = target)
plt.xlim(-4, 4)
plt.ylim(-4, 4)
plt.xlabel('Principal Component 1', fontsize = 15)
plt.ylabel('Principal Component 2', fontsize = 15)
plt.legend()
plt.grid()
plt.show()