# Principal Component Analysis (PCA) for Data Visualization

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
import random
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
%matplotlib inline

## Load Bird Dataset into Pandas DataFrame

In [None]:
# Location of the reduced bird dataset (a CSV file hosted in an S3 bucket);
# this is the address the DataFrame is loaded from in the next cell.
url = "https://alexandra-sandbox-bucket.s3.amazonaws.com/birdData_reduced.csv"

In [None]:
# create Pandas DataFrame using read_csv
df = 


In [None]:
# drop rows with null values from our DataFrame using dropna



In [None]:
# let's peek at our DataFrame head



Some of our attributes may be too highly correlated to add value to our Principal Component Analysis. Let's look at the correlation between our variables.

In [None]:
# Check DataFrame correlation



## Standardize the Data
 

PCA yields a feature subspace that maximizes variance along its axes. Therefore, when working with data that was measured on different scales, we need to standardize it first. This dataset has lengths in centimeters, masses, and time periods in days and years. In this next step, we will transform the data onto a unit scale with `mean=0` and `variance=1`.

In [None]:
# Column labels of the numeric attributes that will feed the PCA.
# WingU_MEAN, EggL_MEAN, and EggW_MEAN are deliberately left out because they
# are highly correlated with other attributes.
features = [
    'LengthU_MEAN',
    'TailU_MEAN',
    'BillU_MEAN',
    'TarsusU_MEAN',
    'WeightU_MEAN',
    'Clutch_MEAN',
    'Broods per year',
    'Egg_MASS',
    'Incubation period',
    'Fledging period',
    'Age of first breeding',
    'Life span',
]

# use df.loc to grab our features and their values 
x = 


In [None]:
# Do the same for the 'Order' column
y = 


In [None]:
# now we will use StandardScaler to transform x (our features)
x = 


In [None]:
# Preview the standardized values: wrap x in a temporary DataFrame labelled
# with the feature names and show its first rows (nothing is stored).
pd.DataFrame(data = x, columns = features).head()


# PCA Projection to 2-Dimensions

In [None]:
# initialize an instance of PCA with 2 components
pca = 

In [None]:
# Fit the PCA model on x (the standardized features) and, in the same call,
# project x onto the fitted principal components.
principalComponents = pca.fit_transform(x)

Note how our features have been condensed into two columns, one per principal component:

In [None]:
# Print the projected data. Expected to be a 2-D array with one row per
# specimen and one column per principal component (assuming pca was created
# with 2 components as instructed above).
print(principalComponents)

Let's make a new DataFrame with these principal components

In [None]:
# create a new DataFrame with these 2 principal components
pcaDF = 


In [None]:
# let's see the head of that DataFrame


In [None]:
# print the unique values from Order to see what they are


All our specimens will belong to one of these orders.
Now we'll combine the principal components DataFrame with the Order column to create our final DataFrame.

In [None]:
# concatenate the Order column with our pca DataFrame
finalDF = 

# drop NA values again, just to be safe



In [None]:
#let's display the head again 


## Visualize 2D projection
Now we will use our PCA projection to create a 2-dimensional visualization of our dataset. We will plot the different orders in different colors to see if we find any differences.

In [None]:
# Scatter plot of the 2-component PCA projection, one color per bird order.
fig, ax = plt.subplots(figsize=(10, 10))

ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_title('2 Component PCA', fontsize=20)

targets = ['Accipitriformes', 'Anseriformes', 'Caprimulgiformes', 'Ciconiiformes',
           'Columbiformes', 'Coraciiformes', 'Falconiformes', 'Galliformes',
           'Gaviiformes', 'Gruiformes', 'Charadriiformes', 'Otidiformes',
           'Passeriformes', 'Pelecaniformes', 'Phoenicopteriformes', 'Piciformes',
           'Podicipediformes', 'Procellariiformes', 'Psittaciformes', 'Strigiformes',
           'Suliformes']

# matplotlib's default color cycle only has 10 colors, so with 21 orders the
# colors would repeat and the legend would be ambiguous. Sample one evenly
# spaced color per order from a continuous colormap instead.
palette = plt.get_cmap('gist_rainbow')(np.linspace(0, 1, len(targets)))

for target, color in zip(targets, palette):
    # boolean mask selecting the specimens that belong to this order
    indicesToKeep = finalDF['Order'] == target
    # label each series directly so the legend entry is tied to its own
    # points rather than to the order in which the series were drawn
    ax.scatter(finalDF.loc[indicesToKeep, 'Principal Component 1'],
               finalDF.loc[indicesToKeep, 'Principal Component 2'],
               color=color, label=target)
ax.legend()
ax.grid()

You can see that even with our Principal Component Analysis, there are a lot of similarities and overlaps between these orders of birds. 

Now it's your turn to customize this visualization. Do you think it's possible to get more separation out of our Orders? Perhaps you may remove a few Orders to simplify the visualization. You might even consider refactoring to use Family instead of Order, if you have time. 

In [None]:
# Starting place for your customization: the same 2-component PCA scatter
# plot, restricted to the orders listed in `targets`.
fig, ax = plt.subplots(figsize=(10, 10))

ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_title('2 Component PCA', fontsize=20)

# edit this list to choose which orders are drawn
targets = ['Falconiformes', 'Gaviiformes', 'Gruiformes']

# One evenly spaced color per target, so the colors stay distinct even if the
# list grows past the 10 colors of matplotlib's default cycle.
palette = plt.get_cmap('gist_rainbow')(np.linspace(0, 1, len(targets)))

for target, color in zip(targets, palette):
    # boolean mask selecting the specimens that belong to this order
    indicesToKeep = finalDF['Order'] == target
    # label each series directly so the legend entry is tied to its own
    # points rather than to the order in which the series were drawn
    ax.scatter(finalDF.loc[indicesToKeep, 'Principal Component 1'],
               finalDF.loc[indicesToKeep, 'Principal Component 2'],
               color=color, label=target)
ax.legend()
ax.grid()

In this example we see that Falconiformes (falcons) are pretty distinct from Gaviiformes (loons) and Gruiformes (crane-like birds).