### Principle Component Analysis 

In [1]:
import pandas as pd 
import numpy as np

In [2]:
df = pd.read_csv("D:\\Data Science\\datasets\\ML\\Exercises\\10.PrincipleComponentAnalysis\\heart.csv")
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [3]:
df.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [4]:
df.shape

(918, 12)

In [5]:
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


<h3>Searching for Outliers</h3>

In [6]:
df.shape

(918, 12)

<h5> Find Outliers Using `RestingBP`</h5>

Find the z-scores for `RestingBP`

In [7]:
df['z_score'] = (df.RestingBP - df.RestingBP.mean())/df.RestingBP.std()
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,z_score
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0,0.410685
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1,1.49094
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0,-0.129442
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1,0.30266
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0,0.950812


get the outliers using z-scores of `RestingBP`

In [8]:
df[(df['z_score']>3) | (df['z_score']<-3)]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,z_score
109,39,M,ATA,190,241,0,Normal,106,N,0.0,Up,0,3.111322
241,54,M,ASY,200,198,0,Normal,142,Y,2.0,Flat,1,3.651449
365,64,F,ASY,200,0,0,Normal,140,Y,1.0,Flat,1,3.651449
399,61,M,NAP,200,0,1,ST,70,N,0.0,Flat,1,3.651449
449,55,M,NAP,0,0,0,Normal,155,N,1.5,Flat,1,-7.151097
592,61,M,ASY,190,287,1,LVH,150,Y,2.0,Down,1,3.111322
732,56,F,ASY,200,288,1,LVH,133,Y,4.0,Down,1,3.651449
759,54,M,ATA,192,283,0,LVH,195,N,0.0,Up,1,3.219347


Finally Remove the outliers and assign other data to the new dataset(`new_df`) 

In [9]:
new_df = df[(df['z_score']<3) & (df['z_score']>-3)]
new_df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,z_score
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0,0.410685
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1,1.49094
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0,-0.129442
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1,0.30266
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0,0.950812


Here the `new_df` contains only 910 rows, so that 8 rows has been removed 

In [10]:
new_df.shape

(910, 13)

In [11]:
new_df = new_df.drop(columns=['z_score'])
new_df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


<h5> Find Outliers Using `Cholesterol`</h5>

Use the same process to the updated dataset from previous outlier finding process

In [12]:
new_df['z_score'] = (new_df.Cholesterol - new_df.Cholesterol.mean()) / new_df.Cholesterol.std()
new_df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,z_score
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0,0.823475
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1,-0.175198
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0,0.768502
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1,0.136315
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0,-0.037766


In [13]:
new_df[(new_df['z_score']>3) | (new_df['z_score']<-3)]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,z_score
76,32,M,ASY,118,529,0,Normal,130,N,0.0,Flat,1,3.022389
149,54,M,ASY,130,603,1,Normal,125,Y,1.0,Flat,1,3.700388
616,67,F,NAP,115,564,0,LVH,160,N,1.6,Flat,0,3.343064


In [14]:
new_df = new_df[(new_df['z_score']<3) & (new_df['z_score']>-3)]
new_df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,z_score
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0,0.823475
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1,-0.175198
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0,0.768502
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1,0.136315
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0,-0.037766


In [15]:
new_df.shape

(907, 13)

<h5> Find Outliers Using `FastingBS`</h5>

Use the same process to the updated dataset

In [16]:
new_df['z_score'] = (new_df.FastingBS - new_df.FastingBS.mean())/new_df.FastingBS.std()
new_df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,z_score
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0,-0.548597
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1,-0.548597
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0,-0.548597
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1,-0.548597
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0,-0.548597


In [17]:
new_df[(new_df['z_score']>3) | (new_df['z_score']<-3)]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,z_score


There is no outliers are detected with the `FastingBS`

<h5> Find Outliers Using `MaxHR`</h5>

In [18]:
new_df['z_score'] = (new_df.MaxHR - new_df.MaxHR.mean())/new_df.MaxHR.std()
new_df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,z_score
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0,1.385705
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1,0.755693
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0,-1.5281
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1,-1.134343
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0,-0.583082


In [19]:
new_df[(new_df['z_score']>3) | (new_df['z_score']<-3)]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,z_score
390,51,M,ASY,140,0,0,Normal,60,N,0.0,Flat,1,-3.024379


In [20]:
new_df = new_df[(new_df['z_score']<3) & (new_df['z_score']>-3)]
new_df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,z_score
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0,1.385705
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1,0.755693
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0,-1.5281
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1,-1.134343
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0,-0.583082


In [21]:
new_df.shape

(906, 13)

<h5> Find Outliers Using `Oldpeak`</h5>

In [22]:
new_df['z_score'] = (new_df.Oldpeak - new_df.Oldpeak.mean())/new_df.Oldpeak.std()
new_df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,z_score
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0,-0.830801
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1,0.10832
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0,-0.830801
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1,0.577881
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0,-0.830801


In [23]:
new_df[(new_df['z_score']>3) | (new_df['z_score']<-3)]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,z_score
166,50,M,ASY,140,231,0,ST,140,Y,5.0,Flat,1,3.864805
324,46,M,ASY,100,0,1,ST,133,N,-2.6,Flat,1,-3.272516
702,59,M,TA,178,270,0,LVH,145,N,4.2,Down,0,3.113508
771,55,M,ASY,140,217,0,Normal,111,Y,5.6,Down,1,4.428278
791,51,M,ASY,140,298,0,Normal,122,Y,4.2,Flat,1,3.113508
850,62,F,ASY,160,164,0,LVH,145,N,6.2,Down,1,4.991751
900,58,M,ASY,114,318,0,ST,140,N,4.4,Down,1,3.301333


In [24]:
new_df = new_df[(new_df['z_score']<3) & (new_df['z_score']>-3)]
new_df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,z_score
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0,-0.830801
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1,0.10832
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0,-0.830801
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1,0.577881
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0,-0.830801


In [25]:
new_df.shape

(899, 13)

Finalized Dataset after dropping all the outliers

In [26]:
new_df = new_df.drop(columns=['z_score'])
new_df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


Now we can move to find the categories(unique values) that is related to the each column

In [27]:
new_df.ChestPainType.unique()

array(['ATA', 'NAP', 'ASY', 'TA'], dtype=object)

In [28]:
new_df.RestingECG.unique()

array(['Normal', 'ST', 'LVH'], dtype=object)

In [29]:
new_df.ExerciseAngina.unique()

array(['N', 'Y'], dtype=object)

In [30]:
new_df.ST_Slope.unique()

array(['Up', 'Flat', 'Down'], dtype=object)

In [31]:
data_frame = new_df.copy()

data_frame.ExerciseAngina.replace(
    {
        'N' : 0,
        'Y' : 1,
    },
    inplace = True
)

data_frame.ST_Slope.replace(
    {
        'Down' : 1,
        'Flat' : 2,
        'Up' : 3
    },
    inplace = True
)

data_frame.RestingECG.replace(
    {
        'Normal' : 1,
        'ST' : 2,
        'LVH' : 3
    },
    inplace = True
)

data_frame.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,1,172,0,0.0,3,0
1,49,F,NAP,160,180,0,1,156,0,1.0,2,1
2,37,M,ATA,130,283,0,2,98,0,0.0,3,0
3,48,F,ASY,138,214,0,1,108,1,1.5,2,1
4,54,M,NAP,150,195,0,1,122,0,0.0,3,0


In [32]:
data_frame = pd.get_dummies(data_frame, drop_first=True)
data_frame.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA
0,40,140,289,0,1,172,0,0.0,3,0,True,True,False,False
1,49,160,180,0,1,156,0,1.0,2,1,False,False,True,False
2,37,130,283,0,2,98,0,0.0,3,0,True,True,False,False
3,48,138,214,0,1,108,1,1.5,2,1,False,False,False,False
4,54,150,195,0,1,122,0,0.0,3,0,True,False,True,False


In [33]:
X = data_frame.drop(columns=['HeartDisease'])
X

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA
0,40,140,289,0,1,172,0,0.0,3,True,True,False,False
1,49,160,180,0,1,156,0,1.0,2,False,False,True,False
2,37,130,283,0,2,98,0,0.0,3,True,True,False,False
3,48,138,214,0,1,108,1,1.5,2,False,False,False,False
4,54,150,195,0,1,122,0,0.0,3,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,110,264,0,1,132,0,1.2,2,True,False,False,True
914,68,144,193,1,1,141,0,3.4,2,True,False,False,False
915,57,130,131,0,1,115,1,1.2,2,True,False,False,False
916,57,130,236,0,3,174,0,0.0,2,False,True,False,False


In [34]:
y = data_frame.HeartDisease
y

0      0
1      1
2      0
3      1
4      0
      ..
913    1
914    1
915    1
916    1
917    0
Name: HeartDisease, Length: 899, dtype: int64

In [35]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled

array([[-1.42815446,  0.46590022,  0.84963584, ...,  2.06332497,
        -0.5349047 , -0.22955001],
       [-0.47585532,  1.63471366, -0.16812204, ..., -0.48465463,
         1.86949191, -0.22955001],
       [-1.7455875 , -0.1185065 ,  0.79361247, ...,  2.06332497,
        -0.5349047 , -0.22955001],
       ...,
       [ 0.3706328 , -0.1185065 , -0.62564622, ..., -0.48465463,
        -0.5349047 , -0.22955001],
       [ 0.3706328 , -0.1185065 ,  0.35476274, ...,  2.06332497,
        -0.5349047 , -0.22955001],
       [-1.63977649,  0.34901888, -0.21480818, ..., -0.48465463,
         1.86949191, -0.22955001]])

In [36]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=30)

In [37]:
print("Train dataset size : ",len(X_train))
print("Test dataset size : ",len(X_test))

Train dataset size :  719
Test dataset size :  180


<h5>Using Logistic Regression</h5>

In [38]:
from sklearn.linear_model import LogisticRegression
model_lr = LogisticRegression(max_iter=1000)
model_lr.fit(X_train,y_train)
model_lr.score(X_test,y_test)

0.8611111111111112

Use `PCA` to reduce the dimensions

In [39]:
X.shape

(899, 13)

In [40]:
from sklearn.decomposition import PCA

pca = PCA(0.95)
X_pca = pca.fit_transform(X)
X_pca.shape

(899, 2)

In [41]:
pd.DataFrame(X_pca).head()

Unnamed: 0,0,1
0,93.129128,-29.676707
1,-16.338952,-14.803748
2,82.670263,38.913132
3,14.423616,28.964616
4,-3.570687,17.738479


In [42]:
pca.explained_variance_ratio_

array([0.92111815, 0.05064593])

In [43]:
X_train_pca , X_test_pca, y_train, y_test = train_test_split(X_pca,y,test_size=0.2,random_state=30)

In [44]:
print("Train dataset size : ",len(X_train_pca))
print("Test dataset size : ",len(X_test_pca))

Train dataset size :  719
Test dataset size :  180


In [45]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train_pca,y_train)
model.score(X_test_pca,y_test)

0.6666666666666666

<h5> Using Random Forest Classifier</h5>

In [46]:
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier()
model_rf.fit(X_train, y_train)
model_rf.score(X_test, y_test)

0.8611111111111112

Use `PCA` to get the model score

In [47]:
model = RandomForestClassifier()
model.fit(X_train_pca,y_train)
model.score(X_test_pca,y_test)

0.6388888888888888

<h5>Using Support Vector Machine</h5>

In [48]:
from sklearn.svm import SVC

model_svm = SVC()
model_svm.fit(X_train, y_train)
model_svm.score(X_test, y_test)

0.8666666666666667

In [49]:
model = SVC()
model.fit(X_train_pca,y_train)
model.score(X_test_pca,y_test)

0.7277777777777777

<b><i>Using Support Vector machine, we can achieve the best model score for normal method as well as PCA method</i></b>