<h2> Demo: Implementing Common Techniques of Dimensionality Reduction </h2>

In this demo, you will be shown how to implement different techniques of dimensionality reduction.

### Step 1: Import required libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Step 2: Load cancer.csv file

In [2]:
# Read the dataset from the working directory and preview its first five rows.
cancer = pd.read_csv(filepath_or_buffer='cancer.csv')
cancer.head(5)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [3]:
# (rows, columns) of the frame exactly as loaded, before any column is removed.
cancer.shape

(569, 33)

In [4]:
# Encode the diagnosis label as an integer: 'M' -> 1, 'B' -> 0.
# The identity entries (1 -> 1, 0 -> 0) keep this cell idempotent: Series.map
# turns every value that is missing from the mapping into NaN, so re-running
# the cell with only the 'M'/'B' keys would erase the already-encoded labels.
DIAGNOSIS_CODES = {'M': 1, 'B': 0, 1: 1, 0: 0}
cancer['diagnosis'] = cancer['diagnosis'].map(DIAGNOSIS_CODES)

In [5]:
# Feature matrix: every column of `cancer` except 'diagnosis' and 'id'.
x = cancer.drop(columns=['diagnosis', 'id'])
x.head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [6]:
# Keep the target column (diagnosis) separately from the features.
y = cancer['diagnosis']
y.head()

0    1
1    1
2    1
3    1
4    1
Name: diagnosis, dtype: int64

In [7]:
# (rows, columns) of the feature matrix after removing 'diagnosis' and 'id'.
x.shape

(569, 31)

# Missing Value Ratio

In [8]:
# Percentage of missing entries per feature column:
# null count, scaled by 100, divided by the number of rows.
x.isna().sum().mul(100).div(len(x))

radius_mean                  0.0
texture_mean                 0.0
perimeter_mean               0.0
area_mean                    0.0
smoothness_mean              0.0
compactness_mean             0.0
concavity_mean               0.0
concave points_mean          0.0
symmetry_mean                0.0
fractal_dimension_mean       0.0
radius_se                    0.0
texture_se                   0.0
perimeter_se                 0.0
area_se                      0.0
smoothness_se                0.0
compactness_se               0.0
concavity_se                 0.0
concave points_se            0.0
symmetry_se                  0.0
fractal_dimension_se         0.0
radius_worst                 0.0
texture_worst                0.0
perimeter_worst              0.0
area_worst                   0.0
smoothness_worst             0.0
compactness_worst            0.0
concavity_worst              0.0
concave points_worst         0.0
symmetry_worst               0.0
fractal_dimension_worst      0.0
Unnamed: 32                100.0
dtype: float64

In [10]:
# Remove the 'Unnamed: 32' column from the feature matrix and preview the result.
x = x.drop(columns=['Unnamed: 32'])
x.head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [None]:
# (rows, columns) of the feature matrix after dropping 'Unnamed: 32'.
x.shape

In [None]:
# Independent snapshot of the features; the cells that follow reassign `x`.
x_orig = x.copy(deep=True)

# Low Variance Filter

In [None]:
# Variance of each feature column on its original scale.
x.var()

In [None]:
# Divide every column by its own mean, then show the variance of the rescaled columns.
x_v = x.div(x.mean(), axis='columns')
x_v.var()

In [None]:
# Boolean mask per column: True where the mean-scaled variance exceeds 0.03.
x_v.var()>0.03

In [None]:
# Keep only the columns whose mean-scaled variance is above 0.03.
high_variance = x_v.var().gt(0.03)
x = x.loc[:, high_variance]
x.shape

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [17]:
# Hold out 30% of the rows, then fit a random forest on the remaining rows
# and display the importance it assigns to each feature.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=29)
# Seed the forest as well as the split: an unseeded forest yields different
# importances on every run, so the importance threshold applied later in the
# notebook would keep a different set of columns each time.
rf = RandomForestClassifier(random_state=29)
rf.fit(x_train, y_train)
rf.feature_importances_



array([0.03272807, 0.00336965, 0.08454138, 0.00311697, 0.0005928 ,
       0.0335063 , 0.07918863, 0.0032675 , 0.00750964, 0.00342417,
       0.08826107, 0.00457534, 0.0023757 , 0.0144662 , 0.00654657,
       0.0023483 , 0.00321462, 0.13215731, 0.0179329 , 0.09091563,
       0.02940968, 0.00409298, 0.01056285, 0.32468271, 0.01257885,
       0.00463417])

In [18]:
# Label each importance score with the training column it belongs to.
rf_feat = pd.Series(data=rf.feature_importances_, index=x_train.columns)
rf_feat

radius_mean                0.032728
texture_mean               0.003370
perimeter_mean             0.084541
area_mean                  0.003117
compactness_mean           0.000593
concavity_mean             0.033506
concave points_mean        0.079189
radius_se                  0.003267
texture_se                 0.007510
perimeter_se               0.003424
area_se                    0.088261
smoothness_se              0.004575
compactness_se             0.002376
concavity_se               0.014466
concave points_se          0.006547
symmetry_se                0.002348
fractal_dimension_se       0.003215
radius_worst               0.132157
texture_worst              0.017933
perimeter_worst            0.090916
area_worst                 0.029410
compactness_worst          0.004093
concavity_worst            0.010563
concave points_worst       0.324683
symmetry_worst             0.012579
fractal_dimension_worst    0.004634
dtype: float64

In [19]:
# Keep only the columns whose random-forest importance is above 0.01.
important = rf_feat.gt(0.01)
x = x.loc[:, important]
x.shape

(569, 13)

# High Correlation Filter

In [20]:
# Flag feature pairs whose correlation magnitude exceeds 0.95.
# The absolute value is taken first so that strongly negative correlations
# (e.g. -0.98), which are just as redundant as positive ones, are flagged too.
x_cor = x.corr().abs() > 0.95

In [21]:
# Show the boolean matrix; True marks a pair above the 0.95 correlation threshold.
x_cor

Unnamed: 0,radius_mean,perimeter_mean,concavity_mean,concave points_mean,area_se,concavity_se,radius_worst,texture_worst,perimeter_worst,area_worst,concavity_worst,concave points_worst,symmetry_worst
radius_mean,True,True,False,False,False,False,True,False,True,False,False,False,False
perimeter_mean,True,True,False,False,False,False,True,False,True,False,False,False,False
concavity_mean,False,False,True,False,False,False,False,False,False,False,False,False,False
concave points_mean,False,False,False,True,False,False,False,False,False,False,False,False,False
area_se,False,False,False,False,True,False,False,False,False,False,False,False,False
concavity_se,False,False,False,False,False,True,False,False,False,False,False,False,False
radius_worst,True,True,False,False,False,False,True,False,True,True,False,False,False
texture_worst,False,False,False,False,False,False,False,True,False,False,False,False,False
perimeter_worst,True,True,False,False,False,False,True,False,True,True,False,False,False
area_worst,False,False,False,False,False,False,True,False,True,True,False,False,False


In [22]:
# Which features are flagged as highly correlated with radius_mean.
x_cor['radius_mean']

radius_mean              True
perimeter_mean           True
concavity_mean          False
concave points_mean     False
area_se                 False
concavity_se            False
radius_worst             True
texture_worst           False
perimeter_worst          True
area_worst              False
concavity_worst         False
concave points_worst    False
symmetry_worst          False
Name: radius_mean, dtype: bool

In [24]:
# Drop the three columns flagged as highly correlated with radius_mean,
# keeping radius_mean itself.
x = x.drop(columns=['perimeter_mean', 'radius_worst', 'perimeter_worst'])
x.shape

(569, 10)