In [11]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

In [3]:
breast_cancer = pd.read_csv('breast-cancer.csv', header=None)
breast_cancer.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,119513,N,31,18.02,27.6,117.5,1013.0,0.09489,0.1036,0.1086,...,139.7,1436.0,0.1195,0.1926,0.314,0.117,0.2677,0.08113,5.0,5
1,8423,N,61,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,...,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,3.0,2
2,842517,N,116,21.37,17.44,137.5,1373.0,0.08836,0.1189,0.1255,...,159.1,1949.0,0.1188,0.3449,0.3414,0.2032,0.4334,0.09067,2.5,0
3,843483,N,123,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,...,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,2.0,0
4,843584,R,27,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,...,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,3.5,0


In [4]:
total_cols = ['id', 'outcome', 'time', 'cell_1_radius', 'cell_1_texture', 'cell_1_perimiter',
              'cell_1_area', 'cell_1_smoothness', 'cell_1_compactness', 'cell_1_concavity',
              'cell_1_concave_points', 'cell_1_symmetry', 'cell_1_fractal_dimension',
              'cell_2_radius', 'cell_2_texture', 'cell_2_perimiter', 'cell_2_area',
              'cell_2_smoothness', 'cell_2_compactness', 'cell_2_concavity',
              'cell_2_concave_points', 'cell_2_symmetry', 'cell_2_fractal_dimension', 'cell_3_radius', 
              'cell_3_texture', 'cell_3_perimiter', 'cell_3_area', 'cell_3_smoothness',
              'cell_3_compactness', 'cell_3_concavity', 'cell_3_concave_points', 'cell_3_symmetry',
              'cell_3_fractal_dimension', 'tumor_size', 'lymph_status']
breast_cancer.columns = total_cols

In [5]:
breast_cancer.head(5)

Unnamed: 0,id,outcome,time,cell_1_radius,cell_1_texture,cell_1_perimiter,cell_1_area,cell_1_smoothness,cell_1_compactness,cell_1_concavity,...,cell_3_perimiter,cell_3_area,cell_3_smoothness,cell_3_compactness,cell_3_concavity,cell_3_concave_points,cell_3_symmetry,cell_3_fractal_dimension,tumor_size,lymph_status
0,119513,N,31,18.02,27.6,117.5,1013.0,0.09489,0.1036,0.1086,...,139.7,1436.0,0.1195,0.1926,0.314,0.117,0.2677,0.08113,5.0,5
1,8423,N,61,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,...,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,3.0,2
2,842517,N,116,21.37,17.44,137.5,1373.0,0.08836,0.1189,0.1255,...,159.1,1949.0,0.1188,0.3449,0.3414,0.2032,0.4334,0.09067,2.5,0
3,843483,N,123,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,...,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,2.0,0
4,843584,R,27,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,...,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,3.5,0


In [6]:
numerics = breast_cancer._get_numeric_data()

In [7]:
numerics.head(5)

Unnamed: 0,id,time,cell_1_radius,cell_1_texture,cell_1_perimiter,cell_1_area,cell_1_smoothness,cell_1_compactness,cell_1_concavity,cell_1_concave_points,...,cell_3_texture,cell_3_perimiter,cell_3_area,cell_3_smoothness,cell_3_compactness,cell_3_concavity,cell_3_concave_points,cell_3_symmetry,cell_3_fractal_dimension,tumor_size
0,119513,31,18.02,27.6,117.5,1013.0,0.09489,0.1036,0.1086,0.07055,...,37.08,139.7,1436.0,0.1195,0.1926,0.314,0.117,0.2677,0.08113,5.0
1,8423,61,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,3.0
2,842517,116,21.37,17.44,137.5,1373.0,0.08836,0.1189,0.1255,0.0818,...,20.98,159.1,1949.0,0.1188,0.3449,0.3414,0.2032,0.4334,0.09067,2.5
3,843483,123,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,2.0
4,843584,27,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,3.5


In [8]:
numerics = numerics.drop(['id','time'], axis=1)

Before we apply PCA we need to standardize/scale the data. For this we can use sklearn's preprocessing library 


In [9]:
scaler = StandardScaler()
scaled_numerics = scaler.fit_transform(numerics)
scaled_numerics_df = pd.DataFrame(scaled_numerics, columns=numerics.columns)
scaled_numerics_df

Unnamed: 0,cell_1_radius,cell_1_texture,cell_1_perimiter,cell_1_area,cell_1_smoothness,cell_1_compactness,cell_1_concavity,cell_1_concave_points,cell_1_symmetry,cell_1_fractal_dimension,...,cell_3_texture,cell_3_perimiter,cell_3_area,cell_3_smoothness,cell_3_compactness,cell_3_concavity,cell_3_concave_points,cell_3_symmetry,cell_3_fractal_dimension,tumor_size
0,0.192688,1.241770,0.123934,0.122300,-0.623774,-0.784542,-0.676803,-0.480176,-0.228521,0.086480,...,1.156324,-0.022477,0.053105,-1.112648,-1.054732,-0.708404,-1.370815,-0.743009,-0.459226,1.113530
1,0.183175,-2.774630,0.372418,0.088138,1.258416,2.711439,2.043606,1.785222,1.795777,2.216313,...,-2.133937,1.535510,1.050496,0.832831,1.837343,1.589132,1.922113,1.823321,1.329259,0.078903
2,1.254939,-1.127953,1.061610,1.147186,-1.146560,-0.477137,-0.436726,-0.147247,1.481536,-0.360812,...,-1.525864,0.650684,0.930741,-1.144541,-0.123520,-0.550192,0.541923,1.467183,-0.007488,-0.179753
3,-1.900104,-0.442226,-1.747667,-1.662423,3.187841,2.838018,1.209726,0.545246,2.446183,4.810056,...,-0.606256,-1.439240,-1.432374,3.001561,3.064487,1.444778,1.746815,4.540377,3.891002,-0.438410
4,0.912482,-1.850998,0.949089,0.930821,-0.190654,-0.197860,0.593194,0.518612,-0.433143,-0.536682,...,-2.243890,0.411261,0.290905,-0.297096,-0.978914,-0.211827,-0.361191,-1.160505,-0.665207,0.337560
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,1.619592,-0.083036,1.502318,1.784892,-2.142495,-1.025242,-1.244751,-0.683780,-0.729114,-1.396647,...,-0.887803,0.754782,0.850333,-2.825307,-1.425871,-1.911738,-1.886056,-1.564662,-1.560160,1.630844
194,-0.625403,2.076770,-0.649648,-0.653766,-0.695827,-0.732303,-0.266256,-0.644421,-0.703536,-0.229255,...,1.889345,-0.962822,-0.814265,-0.761824,-0.788147,-0.803677,-1.113416,-0.426886,-0.501369,-0.697067
195,-0.076838,1.612622,-0.227694,-0.155842,-1.053691,-1.528945,-1.284385,-1.068203,-0.491606,-1.216623,...,1.086354,-0.272310,-0.188116,-0.811941,-1.271179,-1.285242,-1.854547,0.170679,-1.444147,0.441023
196,1.270793,0.131545,1.413238,1.337928,0.345742,1.029752,1.161426,1.332438,-0.159095,0.278969,...,-0.359695,2.010887,1.659537,0.267867,0.445112,0.602907,0.947992,-0.536262,-0.048211,0.078903


In [15]:
#Performing PCA
pca = PCA(svd_solver="full")
data_pca_transformed = pca.fit_transform(scaled_numerics_df)
data_pca_transformed_df = pd.DataFrame(data_pca_transformed, columns=scaled_numerics_df.columns)
data_pca_transformed_df

Unnamed: 0,cell_1_radius,cell_1_texture,cell_1_perimiter,cell_1_area,cell_1_smoothness,cell_1_compactness,cell_1_concavity,cell_1_concave_points,cell_1_symmetry,cell_1_fractal_dimension,...,cell_3_texture,cell_3_perimiter,cell_3_area,cell_3_smoothness,cell_3_compactness,cell_3_concavity,cell_3_concave_points,cell_3_symmetry,cell_3_fractal_dimension,tumor_size
0,-1.703744,-2.165238,0.488978,1.724241,0.301263,-0.490786,-1.162531,0.974638,-0.602151,0.723897,...,0.069410,0.024684,0.010346,-0.153329,-0.123011,0.137100,-0.003857,0.010609,-0.006602,-0.019515
1,6.332841,2.907553,-1.819677,-3.547893,-0.030010,-0.532458,-1.671146,-0.620393,0.048908,0.063960,...,0.074171,0.089382,-0.105479,-0.025129,-0.150297,0.194857,-0.155429,-0.014485,0.036978,0.046194
2,1.264563,-1.303665,-0.867959,-2.579007,2.482939,0.791914,0.379115,-0.063648,0.477577,0.872972,...,0.102315,-0.299693,-0.131688,0.029005,0.160525,0.029223,0.007978,-0.094576,0.023888,0.040502
3,3.341661,11.325238,-0.231527,-1.314330,1.841139,-1.605426,-1.298713,0.619676,1.306645,0.899639,...,0.254424,-0.222721,0.241459,0.136879,0.161654,0.143722,0.033818,0.021981,-0.065306,0.017103
4,1.335361,-1.854887,1.115493,-2.795760,-1.607482,1.645166,0.858404,-0.017929,0.368186,0.281596,...,-0.203760,0.045141,-0.107607,0.009466,-0.033638,0.061518,0.090791,-0.032814,-0.011111,-0.021689
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,0.884161,-6.823967,3.155276,-0.902071,1.242468,0.150292,-3.020360,-0.025838,-0.677745,0.607729,...,0.075949,0.044091,-0.273211,0.074897,0.015721,-0.010054,-0.020847,0.003870,-0.086164,0.004886
194,-3.270189,-0.358036,1.023581,2.738340,0.718396,-0.905027,0.671456,0.230449,-0.712380,0.422877,...,0.106400,-0.068638,0.071929,0.009064,0.077483,0.142804,-0.055563,0.015516,0.007572,-0.007297
195,-3.151565,-2.711845,1.078218,1.250803,1.511524,-0.871923,-0.001445,0.907984,1.024890,-0.083441,...,-0.014361,-0.005711,0.089588,-0.124570,-0.143253,0.024445,-0.011533,-0.003825,-0.013539,-0.001283
196,4.691049,-2.113738,-1.212371,-0.249444,-1.058607,0.045708,-0.489672,-0.936954,0.232450,-0.402503,...,-0.059368,-0.114409,-0.007259,0.072679,-0.074755,0.023499,-0.009839,0.028859,-0.010562,-0.025712


In [16]:
# pca.components_
print(pca.explained_variance_ratio_)

[3.15699591e-01 2.64600012e-01 1.08277593e-01 7.17227583e-02
 4.62570514e-02 4.02390372e-02 3.22695836e-02 2.88296553e-02
 1.88891108e-02 1.47266856e-02 1.20279612e-02 1.06387612e-02
 7.94769849e-03 6.74779779e-03 3.91295909e-03 3.42910920e-03
 2.57385074e-03 2.39134578e-03 1.68904301e-03 1.52903859e-03
 1.23421351e-03 1.00298079e-03 8.66466172e-04 7.29439309e-04
 5.97212972e-04 4.59046198e-04 3.99753830e-04 1.86465461e-04
 8.11868065e-05 3.30303553e-05 1.15612825e-05]
