## Dimensionality Reduction

To run the code you need to import the following libraries:

In [None]:
# Data operations
import numpy as np
import pandas as pd

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# The 'seaborn' style sheet was renamed to 'seaborn-v0_8' in newer matplotlib releases and
# the old name was later removed, so use whichever of the two this installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

### How to prepare a file with .csv extension from a dataset

You can skip this part and continue to the next section if you already have the `breast_cancer.csv` file.

I just wanted to show you how I prepared/processed the available dataset.

In [None]:
# Import the dataset from sklearn library
from sklearn.datasets import load_breast_cancer
# Or you can download it from here: https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)

In [None]:
# Load the dataset; the returned object exposes .data, .target and .feature_names,
# which the following cells read one by one
breast = load_breast_cancer()

In [None]:
# Get the features (the .data attribute of the loaded dataset)
breast_data = breast.data

In [None]:
# Check the dimensions of the feature array (displayed as the cell output)
breast_data.shape

In [None]:
# Get the labels (the .target attribute of the loaded dataset)
breast_labels = breast.target

In [None]:
# Check the dimensions of the label array (displayed as the cell output)
breast_labels.shape

In [None]:
# Reshape the labels into a single column so they can be appended to the feature matrix.
# -1 lets NumPy infer the number of rows instead of hard-coding the sample count.
labels = np.reshape(breast_labels, (-1, 1))

In [None]:
# Append the label column to the right-hand side of the feature matrix
final_breast_data = np.hstack((breast_data, labels))

In [None]:
# Check the dimensions of the combined features + label array
final_breast_data.shape

In [None]:
# Display the feature names; they become the dataframe column names further down
colnames = breast.feature_names
print(colnames)

In [None]:
# Check the shape of the feature-name array
colnames.shape

In [None]:
# Extend the feature names with a name for the extra label column
cols = np.concatenate((colnames, ['label']))
print(cols)

In [None]:
# Check the type of the concatenated object before turning it into a dataframe
type(final_breast_data)

In [None]:
# Build a dataframe from the concatenated array, naming the columns right away
breast_dataset = pd.DataFrame(final_breast_data, columns = cols)

# Tidy up the column names (optional): trim whitespace and use underscores instead of spaces
breast_dataset.columns = [name.strip().replace(" ", "_") for name in breast_dataset.columns]

breast_dataset.head()

In [None]:
# Replace the numeric class codes with readable strings.
# scikit-learn encodes this dataset as 0 = malignant and 1 = benign
# (target_names is ['malignant', 'benign']), so 0 must map to 'Malignant'.
# The result is assigned back instead of using a chained inplace replace, which is
# deprecated in recent pandas and has no effect under Copy-on-Write.
breast_dataset['label'] = breast_dataset['label'].replace({0: 'Malignant', 1: 'Benign'})
breast_dataset.tail()

In [None]:
# Export the dataframe as a ";"-separated .csv file, with a header row and without the index.
# to_csv returns None when it is given a path, so its result is not stored.
breast_dataset.to_csv('breast_cancer.csv', index = False, sep = ";", header = True)

<br>

### Principal Component Analysis with Breast Cancer Dataset

The Breast Cancer dataset:
<ul>
<li> multivariate data</li>
<li> 2 classes: 0 ---> malignant, 1 ---> benign (scikit-learn's encoding)</li>
<li> 212 malignant samples versus 357 benign samples</li>
<li> 30 features (data) and their labels (targets)</li>
<li> "clean" dataset</li>
</ul>
   

In [None]:
# Import the content of the breast_cancer.csv file into a DataFrame (fields are separated by ";")
df_breast = pd.read_csv('breast_cancer.csv', index_col = None, sep = ";")
df_breast.head()

In [None]:
# Let's make sure that we see all the columns.
# The fully qualified option name is required: the short 'max_columns' pattern matches
# more than one option in recent pandas versions and raises an OptionError.
# None removes the column limit (the dataframe has 30 features plus the label column).
pd.set_option('display.max_columns', None)

In [None]:
# Check the descriptive stats of the columns of the loaded dataframe
df_breast.describe()

In [None]:
# Let's visualize a handful of the features and their pairwise relations using seaborn,
# colouring the points by class label
#sns.set(font_scale = 1)
mean_features = [
    "mean_radius",
    "mean_texture",
    "mean_perimeter",
    "mean_area",
    "mean_smoothness",
]
sns.pairplot(df_breast, vars = mean_features, hue = "label");

In [None]:
# You can always plot all the variables using the following line but it takes time, plus it is not easy to interpret the figure...

#sns_plot = sns.pairplot(df_breast, kind='scatter', hue='label');

Just in case you are curious about it, I have already done that:

<img src="sns30x30.png">

In [None]:
# Anyway let's go back to the stats: compare the value ranges of the different columns
df_breast.describe()

<br>

As you can see, the features have different ranges, so what should we do? <b>Normalization</b>

In [None]:
# Keep every column except the last one (the label): PCA is unsupervised, so it only needs the features
n_feature_cols = df_breast.shape[1] - 1
feats = df_breast.iloc[:, :n_feature_cols]

In [None]:
# Check the shape of the feature dataframe (the label column should be gone)
feats.shape

In [None]:
# Import the standard scaler tool for z-score normalization
from sklearn.preprocessing import StandardScaler

In [None]:
# Compute the normalized (z-scored) version of the features
scaler = StandardScaler()
norm_feats = scaler.fit_transform(feats)

In [None]:
# Check if the mean == 0 and std == 1
# The first line looks at the whole array at once; the next two verify the property
# feature by feature, which is what z-score normalization is supposed to guarantee.
print("mean = ", np.mean(norm_feats), ", std = ", np.std(norm_feats))
print("every feature has mean ~ 0: ", np.allclose(norm_feats.mean(axis = 0), 0))
print("every feature has std  ~ 1: ", np.allclose(norm_feats.std(axis = 0), 1))

In [None]:
# Let's create a dataframe to display the normalized features.
# The column names are taken from the feature dataframe that was normalized, so the
# number of feature columns is not hard-coded.
norm_df = pd.DataFrame(norm_feats, columns = feats.columns)

In [None]:
# Display the last rows of the normalized dataframe
norm_df.tail()

In [None]:
# Check the descriptive stats of the normalized features
norm_df.describe()

### Principal Component Analysis (PCA)

In [None]:
# Import PCA from sklearn
from sklearn.decomposition import PCA

In [None]:
# Let's reduce the dimensions to 2 to better visualize it.
# random_state is fixed (same seed as the dimension sweep later in the notebook) so the
# projection is reproducible if scikit-learn selects a randomized SVD solver.
pca_2d = PCA(n_components = 2, random_state = 1)
pca_breast = pca_2d.fit_transform(norm_feats)

In [None]:
# Wrap the two principal components in a dataframe with one named column per component
component_names = ['principo1', 'principo2']
pca_df = pd.DataFrame(pca_breast, columns = component_names)

In [None]:
# Display the last rows of the principal-component dataframe
pca_df.tail()

In [None]:
# Show the share of the original variance captured by each of the two components
variance_share = pca_2d.explained_variance_ratio_
print(f'Explained variation per principal component: {variance_share}')

In [None]:
# Plot the result

# Create a single 8x8 figure (opening a default figure first would leave an empty extra plot)
plt.figure(figsize = (8,8));
plt.xticks(fontsize = 12);
plt.yticks(fontsize = 12);
plt.xlabel('Principal Component - 1', fontsize = 14);
plt.ylabel('Principal Component - 2', fontsize = 14);
plt.title("Principal Component Analysis of Breast Cancer Dataset", fontsize = 14);

plt.scatter(pca_df.principo1, pca_df.principo2);

In [None]:
# Specify the font size for matplotlib and seaborn libraries
# NOTE(review): sns.set() applies seaborn's own rc settings as well, so it may override the
# matplotlib font size set on the line before it — confirm the intended order.
plt.rc("font", size = 14)
sns.set(font_scale = 1.25)

In [None]:
# Attach the class labels to the component dataframe and plot both components together.
# `principo` is just a second name for the same dataframe object as `pca_df`.
pca_df['labels'] = df_breast['label']
principo = pca_df
sns.pairplot(principo, vars = ["principo1", "principo2"], hue = "labels", height = 4);

In [None]:
# Plot the result with class identifiers: Do not forget that our main motivation is to visualize what we have and
# get a general understanding about the shape of it, we don't classify or cluster it (just a reminder)

# Create a single 8x8 figure (opening a default figure first would leave an empty extra plot)
plt.figure(figsize = (8,8))
plt.xticks(fontsize = 12)
plt.yticks(fontsize = 12)
plt.xlabel('Principal Component - 1')
plt.ylabel('Principal Component - 2')
plt.title("Principal Component Analysis of Breast Cancer Dataset", fontsize = 16)

targets = ['Benign', 'Malignant']
colors = ['g', 'r']

for target, color in zip(targets, colors):
    # Boolean mask selecting the samples that belong to the current class
    is_target = df_breast['label'] == target
    # label= ties each legend entry to its own scatter instead of relying on drawing order
    plt.scatter(pca_df.loc[is_target, 'principo1'], pca_df.loc[is_target, 'principo2'],
                c = color, label = target)

plt.legend(prop = {'size': 14});

In [None]:
# How to decide the number of dimensions for reduction?

# The candidate dimensions are 1..n_features, taken from the feature matrix itself rather
# than from the dataframe that also contains the label column.
n_feats = norm_feats.shape[1]
dim = range(1, n_feats + 1)

# The first d explained-variance ratios of a full PCA are the ratios of a d-component PCA,
# so one fit plus a cumulative sum replaces refitting PCA for every d.
pca_all = PCA(n_components = n_feats, random_state = 1).fit(norm_feats)
exp_var_ratio = np.cumsum(pca_all.explained_variance_ratio_)

for i, ratio in zip(dim, exp_var_ratio):
    print('Explained Variance Ratio for d = ' + str(i) + '  : ' + str(ratio))

In [None]:
# Plot the cumulative explained variance against the number of dimensions

fig, ax = plt.subplots(figsize = (12,7))

# Measured curve plus a horizontal reference line at 0.9
ax.plot(dim, exp_var_ratio, marker = 'o', label = 'exp var ratio')
ax.plot(dim, np.full(len(dim), 0.9), color = 'r', label = 'limit')

ax.grid(True)

# Leave one unit of margin on both sides of the dimension axis
x_low, x_high = np.min(dim) - 1, np.max(dim) + 1
ax.set_xlim([x_low, x_high])
ax.set_xticks(np.arange(x_low, x_high, 1.0));
ax.set_xlabel('PCA - Dimension number', labelpad = 12)
ax.set_ylabel('Cumulative Sum of Explained Variance Ratio', labelpad = 12);
ax.legend(loc = 'lower right', fontsize = 16);

<br>

### Linear Discriminant Analysis with Iris Dataset

Iris dataset contains data about 3 different species as class labels:
<ul>
    <li> Setosa</li>
    <li> Versicolour</li>
    <li> Virginica</li>
</ul>
And their petal and sepal lengths/widths as features.
<img src="iris_petal_sepal.png">

In [None]:
# Import the dataset from sklearn library
from sklearn.datasets import load_iris

In [None]:
# Load the iris data and wrap the measurements in a dataframe,
# then add the species name of every sample as a categorical column
iris = load_iris()
df_iris = pd.DataFrame(iris.data, columns = iris.feature_names)
species_column = pd.Categorical.from_codes(iris['target'], iris['target_names'])
df_iris['species'] = species_column
df_iris.head()

In [None]:
# Give the measurement columns short snake_case names (the dataframe is modified in place)
short_names = {
    'sepal length (cm)': 'sepal_length',
    'sepal width (cm)': 'sepal_width',
    'petal length (cm)': 'petal_length',
    'petal width (cm)': 'petal_width',
}
df_iris.rename(columns = short_names, inplace = True)

In [None]:
# Check the new column names after the rename
df_iris.columns

In [None]:
# Print the descriptive statistics separately for every species
labelGroups = df_iris.groupby('species')

for species_name, species_rows in labelGroups:
    # group name first, then the statistics of the rows in that group
    print("\n\n", species_name)
    print(species_rows.describe())

In [None]:
# Plot all the features to see their pairwise relationships, coloured by species
sns.pairplot(df_iris, kind = 'scatter', hue = 'species');

### Linear Discriminant Analysis (LDA)

In [None]:
# Import the lda module from sklearn library
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [None]:
# Prepare the inputs of LDA
# Features: the first four columns of the dataframe
x = df_iris.iloc[:, :4]
# Labels: needed as well, because LDA is a supervised method
y = df_iris.species

# Start with a projection onto 2 components
lda_model = LDA(n_components = 2)
lda = lda_model.fit_transform(x, y)

In [None]:
# Collect the two LDA components and the species labels in one dataframe
lda_columns = {
    'Feature1': lda[:, 0],
    'Feature2': lda[:, 1],
    'species': df_iris['species'],
}
lda_df = pd.DataFrame(lda_columns)
lda_df.head()

In [None]:
# Let's see the reduced features (the two LDA components), coloured by species
sns.pairplot(lda_df, kind = 'scatter', hue = 'species');

In [None]:
# How to decide the number of dimensions for reduction?

# LDA can produce at most min(n_features, n_classes - 1) components; asking for more makes
# scikit-learn raise a ValueError, so the sweep stops at that upper bound.
max_components = min(x.shape[1], y.nunique() - 1)
dim = range(1, max_components + 1)
exp_var_ratio = np.zeros(len(dim))

for i in dim:
    lda_i = LDA(n_components = i)
    lda_x = lda_i.fit_transform(x,y)
    # Share of the between-class variance kept by the first i components
    exp_var_ratio[i-1] = lda_i.explained_variance_ratio_.sum()
    print('Explained Variance Ratio for d = ' + str(i) + '  : ' + str(lda_i.explained_variance_ratio_.sum()))

<br>
Extra exercises: 

- We didn't perform any normalization on the iris dataset, what would we have if we had normalized it before LDA?
- Feed the iris dataset to PCA and compare it with the LDA results.
- Let's go back to the breast cancer dataset and PCA: what would we get if we didn't normalize the data? Does normalization have any effect on the PCA performed in this case?

In [None]:
# Now it's your turn
# ...