# Data Preprocessing UCI 303

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
dataset = pd.read_csv('CleavelandHeartUCI.csv')

dataset.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,1
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0,0


## Dataset Details

In [3]:
dataset.shape

(303, 14)

In [4]:
dataset.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')

In [5]:
dataset.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,299.0,301.0,303.0
mean,54.438944,0.679868,3.158416,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,0.672241,4.734219,0.458746
std,9.038662,0.467299,0.960126,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,0.937438,1.939706,0.49912
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,3.0,0.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,3.0,0.0
50%,56.0,1.0,3.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0,3.0,0.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,7.0,1.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0,1.0


In [6]:
dataset.dtypes

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca          float64
thal        float64
target        int64
dtype: object

In [7]:
dataset.info

<bound method DataFrame.info of      age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
0     63    1   1       145   233    1        2      150      0      2.3   
1     67    1   4       160   286    0        2      108      1      1.5   
2     67    1   4       120   229    0        2      129      1      2.6   
3     37    1   3       130   250    0        0      187      0      3.5   
4     41    0   2       130   204    0        2      172      0      1.4   
..   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
298   45    1   1       110   264    0        0      132      0      1.2   
299   68    1   4       144   193    1        0      141      0      3.4   
300   57    1   4       130   131    0        0      115      1      1.2   
301   57    0   2       130   236    0        2      174      0      0.0   
302   38    1   3       138   175    0        0      173      0      0.0   

     slope   ca  thal  target  
0        3  0.0   6.0  

In [8]:
dataset.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
target      0
dtype: int64

In [9]:
dataset.values.any()

True

## Remove Missing Data

In [10]:
dataset.dropna(inplace=True)

In [11]:
dataset.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [12]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [13]:
data = dataset.copy()

In [14]:
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,1
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0,0


In [15]:
data.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [16]:
## For Chi Test
X1 = data.iloc[:,0:13] #independent columns
y1 = data.iloc[:,-1] #target column

##  Independent and Dependent Variables

In [17]:
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]

### Independent variables (also referred to as Features) are the input for a process that is being analyzes

In [18]:
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,57,0,4,140,241,0,0,123,1,0.2,2,0.0,7.0
298,45,1,1,110,264,0,0,132,0,1.2,2,0.0,7.0
299,68,1,4,144,193,1,0,141,0,3.4,2,2.0,7.0
300,57,1,4,130,131,0,0,115,1,1.2,2,1.0,7.0


### Dependent variables are the output of the process.

In [19]:
y

0      0
1      1
2      1
3      0
4      0
      ..
297    1
298    1
299    1
300    1
301    1
Name: target, Length: 297, dtype: int64

### Objective 1 Achieved the objective to identify the significant features based on scores from both methods.Refer feature importance table

## Achieved the objective  2 with best hyper parameter tuning for classification model and 3 Evaluation

## Feature Importance ranking based on SVM Top 11;Remove sex and age  

In [20]:
import random
random.seed(101)
y = data['target'].copy()
x = data.drop(['target','sex','age'], axis=1).copy()
y

xpy=x
ypy=y

### 11 Atttributes chosen for the model evaluation

In [21]:
x

Unnamed: 0,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,1,145,233,1,2,150,0,2.3,3,0.0,6.0
1,4,160,286,0,2,108,1,1.5,2,3.0,3.0
2,4,120,229,0,2,129,1,2.6,2,2.0,7.0
3,3,130,250,0,0,187,0,3.5,3,0.0,3.0
4,2,130,204,0,2,172,0,1.4,1,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...
297,4,140,241,0,0,123,1,0.2,2,0.0,7.0
298,1,110,264,0,0,132,0,1.2,2,0.0,7.0
299,4,144,193,1,0,141,0,3.4,2,2.0,7.0
300,4,130,131,0,0,115,1,1.2,2,1.0,7.0


In [22]:
x = np.array(x, dtype='float32')

In [23]:
y = np.array(y, dtype='float32')

In [24]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
std_scaler = StandardScaler()

In [25]:
import random
random.seed(101)
from sklearn.model_selection import train_test_split
(trainX, testX, trainY, testY) = train_test_split(x,y, test_size=0.2, random_state=101)

trainX=std_scaler.fit_transform(trainX)
testX=std_scaler.transform(testX)

In [26]:
trainX

array([[ 0.8494212 ,  0.1900167 ,  0.11595585, ...,  0.66557413,
         0.329969  ,  1.1325955 ],
       [-2.2797935 ,  0.90136725, -0.07595433, ..., -0.9440286 ,
         1.4012382 , -0.9243021 ],
       [-0.19365042,  1.1202443 ,  0.55734926, ..., -0.9440286 ,
         0.329969  , -0.9243021 ],
       ...,
       [ 0.8494212 ,  0.3541745 , -0.2678645 , ..., -0.9440286 ,
        -0.74130017, -0.9243021 ],
       [-0.19365042,  0.1900167 ,  1.0755068 , ..., -0.9440286 ,
        -0.74130017, -0.9243021 ],
       [-1.236722  ,  0.46361306,  0.8835966 , ...,  0.66557413,
        -0.74130017, -0.9243021 ]], dtype=float32)

In [27]:
trainY

array([1., 0., 0., 1., 0., 1., 1., 0., 0., 1., 1., 1., 0., 0., 1., 0., 1.,
       0., 1., 1., 1., 1., 1., 1., 0., 0., 0., 1., 1., 0., 1., 0., 1., 0.,
       1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 0., 0., 1.,
       1., 0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 1., 0., 1., 0., 1., 1.,
       0., 1., 0., 1., 0., 0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1.,
       0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 1., 1.,
       0., 0., 1., 0., 0., 0., 1., 1., 0., 0., 1., 0., 1., 0., 1., 0., 1.,
       0., 1., 1., 0., 0., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 0.,
       0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 1., 1.,
       1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 1., 1., 0., 1., 1., 1., 0., 1., 0., 1., 0., 1., 1., 0., 1.,
       0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 1., 1., 1., 1., 0., 0., 0.,
       0., 1., 0., 0., 0., 1., 1., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0.,
       1., 0., 0., 1., 0.

In [28]:
QC = pd.DataFrame(trainX)

In [29]:
QC

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.849421,0.190017,0.115956,-0.402200,1.040962,-0.931131,-0.720577,1.516609,0.665574,0.329969,1.132596
1,-2.279794,0.901367,-0.075954,-0.402200,1.040962,1.227814,-0.720577,-0.216450,-0.944029,1.401238,-0.924302
2,-0.193650,1.120244,0.557349,-0.402200,-0.972980,0.973821,-0.720577,-0.909673,-0.944029,0.329969,-0.924302
3,0.849421,1.010806,-0.075954,-0.402200,-0.972980,0.211840,1.387777,0.303468,0.665574,-0.741300,-0.924302
4,-0.193650,-0.630772,-0.632494,-0.402200,-0.972980,0.889156,-0.720577,-0.909673,-0.944029,-0.741300,-0.924302
...,...,...,...,...,...,...,...,...,...,...,...
232,0.849421,1.010806,-0.440584,-0.402200,1.040962,-1.481450,-0.720577,-0.043144,0.665574,2.472507,1.132596
233,-0.193650,1.557998,2.150204,-0.402200,1.040962,0.084843,-0.720577,-0.216450,-0.944029,-0.741300,-0.924302
234,0.849421,0.354174,-0.267864,-0.402200,1.040962,0.465833,-0.720577,-0.909673,-0.944029,-0.741300,-0.924302
235,-0.193650,0.190017,1.075507,2.486326,-0.972980,0.889156,-0.720577,-0.909673,-0.944029,-0.741300,-0.924302


In [30]:
testX

array([[-1.9365042e-01,  4.6361306e-01, -9.7793216e-01, -4.0219983e-01,
         3.3990581e-02, -1.3967859e+00, -7.2057664e-01,  4.3509305e-02,
         6.6557413e-01, -7.4130017e-01, -9.2430210e-01],
       [-2.2797935e+00,  1.3529743e-01, -8.4359503e-01, -4.0219983e-01,
        -9.7298044e-01,  5.5049795e-01, -7.2057664e-01, -2.1644954e-01,
        -9.4402862e-01,  1.4012382e+00, -9.2430210e-01],
       [-1.9365042e-01,  4.6361306e-01,  1.1595585e-01, -4.0219983e-01,
         1.0409616e+00, -1.2681815e-01, -7.2057664e-01,  8.2338578e-01,
         6.6557413e-01,  2.4725072e+00,  1.1325955e+00],
       [ 8.4942120e-01, -1.0685265e+00, -1.8991011e+00, -4.0219983e-01,
        -9.7298044e-01, -1.0157956e+00, -7.2057664e-01,  4.7677404e-01,
         6.6557413e-01, -7.4130017e-01, -9.2430210e-01],
       [-1.2367220e+00,  1.9001670e-01,  3.9191786e-02, -4.0219983e-01,
         1.0409616e+00,  5.0816566e-01, -7.2057664e-01,  3.0346808e-01,
         6.6557413e-01, -7.4130017e-01, -9.2430210e-

In [31]:
parameters = {'kernel':('rbf', 'linear'), 'C':[1.0, 10.0, 100.0, 1000.0],
              'gamma':[1,0.1,0.01]}

## p2 setting Kernel with RBF with C 100

In [32]:
p2 = {'kernel':('rbf', 'rbf'), 'C':[100.0, 100.0],
              'gamma':[0.1,0.1]}

## p22  Linear Kernel Comparsion on Grid Search CV 11 Features

In [33]:
p22 = {'C':[1.0, 10.0, 100.0, 1000.0],
              'gamma':[1,0.1,0.01]}

In [34]:
import random
random.seed(101)
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV

model = svm.SVC()
clf = GridSearchCV(model, parameters, verbose=2 ,cv=10)

## Original Parameter tuning 

In [35]:
clf.fit(trainX, trainY)

Fitting 10 folds for each of 24 candidates, totalling 240 fits
[CV] END .........................C=1.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=1.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=1.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=1.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=1.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=1.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=1.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=1.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=1.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=1.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END ......................C=1.0, gamma=1, kernel=linear; total time=   0.0s
[CV] END ......................C=1.0, gamma=1,

[CV] END .......................C=100.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .......................C=100.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .......................C=100.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .......................C=100.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .......................C=100.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .......................C=100.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .......................C=100.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .......................C=100.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .......................C=100.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END ....................C=100.0, gamma=1, kernel=linear; total time=   0.0s
[CV] END ....................C=100.0, gamma=1, kernel=linear; total time=   0.0s
[CV] END ....................C=100.0, gamma=1, kernel=linear; total time=   0.0s
[CV] END ...................

[CV] END ................C=1000.0, gamma=0.01, kernel=linear; total time=   0.5s
[CV] END ................C=1000.0, gamma=0.01, kernel=linear; total time=   0.5s
[CV] END ................C=1000.0, gamma=0.01, kernel=linear; total time=   0.9s
[CV] END ................C=1000.0, gamma=0.01, kernel=linear; total time=   0.2s
[CV] END ................C=1000.0, gamma=0.01, kernel=linear; total time=   0.5s
[CV] END ................C=1000.0, gamma=0.01, kernel=linear; total time=   0.8s
[CV] END ................C=1000.0, gamma=0.01, kernel=linear; total time=   0.1s
[CV] END ................C=1000.0, gamma=0.01, kernel=linear; total time=   0.2s
[CV] END ................C=1000.0, gamma=0.01, kernel=linear; total time=   0.3s
[CV] END ................C=1000.0, gamma=0.01, kernel=linear; total time=   0.9s


GridSearchCV(cv=10, estimator=SVC(),
             param_grid={'C': [1.0, 10.0, 100.0, 1000.0],
                         'gamma': [1, 0.1, 0.01], 'kernel': ('rbf', 'linear')},
             verbose=2)

In [36]:
svc_best_param = clf.best_params_
print(" Objective 2 to achieve Best parameter for SVM:", svc_best_param)

 Objective 2 to achieve Best parameter for SVM: {'C': 1.0, 'gamma': 0.01, 'kernel': 'rbf'}


In [37]:
import random
random.seed(101)
predict = clf.predict(testX)
print(classification_report(testY,predict))
print(confusion_matrix(testY, predict))

              precision    recall  f1-score   support

         0.0       0.88      0.88      0.88        40
         1.0       0.75      0.75      0.75        20

    accuracy                           0.83        60
   macro avg       0.81      0.81      0.81        60
weighted avg       0.83      0.83      0.83        60

[[35  5]
 [ 5 15]]


In [38]:
svc_accuracy_score = accuracy_score(testY, predict)
print("Best accuracy for SVM:", svc_accuracy_score)

Best accuracy for SVM: 0.8333333333333334


## Linear Kernel Difference of SVM 11 Features

In [44]:
import random
random.seed(101)
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV

model = svm.SVC(kernel='linear')
clf22 = GridSearchCV(model, p22, verbose=2 ,cv=10)

In [45]:
clf22.fit(trainX, trainY)

Fitting 10 folds for each of 12 candidates, totalling 120 fits
[CV] END .....................................C=1.0, gamma=1; total time=   0.0s
[CV] END .....................................C=1.0, gamma=1; total time=   0.0s
[CV] END .....................................C=1.0, gamma=1; total time=   0.0s
[CV] END .....................................C=1.0, gamma=1; total time=   0.0s
[CV] END .....................................C=1.0, gamma=1; total time=   0.0s
[CV] END .....................................C=1.0, gamma=1; total time=   0.0s
[CV] END .....................................C=1.0, gamma=1; total time=   0.0s
[CV] END .....................................C=1.0, gamma=1; total time=   0.0s
[CV] END .....................................C=1.0, gamma=1; total time=   0.0s
[CV] END .....................................C=1.0, gamma=1; total time=   0.0s
[CV] END ...................................C=1.0, gamma=0.1; total time=   0.0s
[CV] END ...................................C=

[CV] END ................................C=1000.0, gamma=0.1; total time=   0.5s
[CV] END ................................C=1000.0, gamma=0.1; total time=   0.9s
[CV] END ................................C=1000.0, gamma=0.1; total time=   0.2s
[CV] END ................................C=1000.0, gamma=0.1; total time=   0.5s
[CV] END ................................C=1000.0, gamma=0.1; total time=   0.8s
[CV] END ................................C=1000.0, gamma=0.1; total time=   0.1s
[CV] END ................................C=1000.0, gamma=0.1; total time=   0.2s
[CV] END ................................C=1000.0, gamma=0.1; total time=   0.3s
[CV] END ................................C=1000.0, gamma=0.1; total time=   0.9s
[CV] END ...............................C=1000.0, gamma=0.01; total time=   0.5s
[CV] END ...............................C=1000.0, gamma=0.01; total time=   0.5s
[CV] END ...............................C=1000.0, gamma=0.01; total time=   0.9s
[CV] END ...................

GridSearchCV(cv=10, estimator=SVC(kernel='linear'),
             param_grid={'C': [1.0, 10.0, 100.0, 1000.0],
                         'gamma': [1, 0.1, 0.01]},
             verbose=2)

In [46]:
svc_best_param22 = clf22.best_params_
print(" Objective 2 to achieve Best parameter for SVM:", svc_best_param22)

 Objective 2 to achieve Best parameter for SVM: {'C': 1.0, 'gamma': 1}


In [47]:
import random
random.seed(101)
predict22 = clf22.predict(testX)
print(classification_report(testY,predict22))
print(confusion_matrix(testY, predict22))

              precision    recall  f1-score   support

         0.0       0.87      0.82      0.85        40
         1.0       0.68      0.75      0.71        20

    accuracy                           0.80        60
   macro avg       0.78      0.79      0.78        60
weighted avg       0.81      0.80      0.80        60

[[33  7]
 [ 5 15]]


In [48]:
svc_accuracy_score22 = accuracy_score(testY, predict22)
print("Best accuracy for SVM Linear Kernel with 11 Features:", svc_accuracy_score22)

Best accuracy for SVM Linear Kernel with 11 Features: 0.8


## Feature Importance ranking based on SVM Top 9 ;Remove sex ,age,cp,trestbps

In [49]:
import random
random.seed(101)
y = data['target'].copy()
x = data.drop(['target','sex','age','cp','trestbps'], axis=1).copy()
y

xpy=x
ypy=y

In [50]:
x = np.array(x, dtype='float32')
y = np.array(y, dtype='float32')

In [51]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
std_scaler = StandardScaler()

In [52]:
import random
random.seed(101)
from sklearn.model_selection import train_test_split
(trainX, testX, trainY, testY) = train_test_split(x,y, test_size=0.2, random_state=101)

trainX=std_scaler.fit_transform(trainX)
testX=std_scaler.transform(testX)

In [53]:
QC = pd.DataFrame(trainX)

In [54]:
parameters = {'kernel':('rbf', 'linear'), 'C':[1.0, 10.0, 100.0, 1000.0],
              'gamma':[1,0.1,0.01]}

In [55]:
import random
random.seed(101)
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV

model = svm.SVC()
clf = GridSearchCV(model, parameters, verbose=2 ,cv=10)

In [56]:
clf.fit(trainX, trainY)

Fitting 10 folds for each of 24 candidates, totalling 240 fits
[CV] END .........................C=1.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=1.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=1.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=1.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=1.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=1.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=1.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=1.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=1.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=1.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END ......................C=1.0, gamma=1, kernel=linear; total time=   0.0s
[CV] END ......................C=1.0, gamma=1,

[CV] END .....................C=10.0, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END .....................C=10.0, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END .....................C=10.0, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END ..................C=10.0, gamma=0.01, kernel=linear; total time=   0.0s
[CV] END ..................C=10.0, gamma=0.01, kernel=linear; total time=   0.0s
[CV] END ..................C=10.0, gamma=0.01, kernel=linear; total time=   0.0s
[CV] END ..................C=10.0, gamma=0.01, kernel=linear; total time=   0.0s
[CV] END ..................C=10.0, gamma=0.01, kernel=linear; total time=   0.0s
[CV] END ..................C=10.0, gamma=0.01, kernel=linear; total time=   0.0s
[CV] END ..................C=10.0, gamma=0.01, kernel=linear; total time=   0.0s
[CV] END ..................C=10.0, gamma=0.01, kernel=linear; total time=   0.0s
[CV] END ..................C=10.0, gamma=0.01, kernel=linear; total time=   0.0s
[CV] END ..................C

[CV] END .................C=1000.0, gamma=0.1, kernel=linear; total time=   0.2s
[CV] END .................C=1000.0, gamma=0.1, kernel=linear; total time=   0.2s
[CV] END .................C=1000.0, gamma=0.1, kernel=linear; total time=   0.3s
[CV] END .................C=1000.0, gamma=0.1, kernel=linear; total time=   0.2s
[CV] END .................C=1000.0, gamma=0.1, kernel=linear; total time=   0.4s
[CV] END .................C=1000.0, gamma=0.1, kernel=linear; total time=   0.1s
[CV] END .................C=1000.0, gamma=0.1, kernel=linear; total time=   0.4s
[CV] END .................C=1000.0, gamma=0.1, kernel=linear; total time=   0.2s
[CV] END .................C=1000.0, gamma=0.1, kernel=linear; total time=   0.2s
[CV] END .................C=1000.0, gamma=0.1, kernel=linear; total time=   0.1s
[CV] END ...................C=1000.0, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END ...................C=1000.0, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END ...................

GridSearchCV(cv=10, estimator=SVC(),
             param_grid={'C': [1.0, 10.0, 100.0, 1000.0],
                         'gamma': [1, 0.1, 0.01], 'kernel': ('rbf', 'linear')},
             verbose=2)

In [57]:
svc_best_param = clf.best_params_
print(" Objective 2 to achieve Best parameter for SVM:", svc_best_param)

 Objective 2 to achieve Best parameter for SVM: {'C': 1.0, 'gamma': 0.01, 'kernel': 'rbf'}


In [58]:
import random
random.seed(101)
predict = clf.predict(testX)
print(classification_report(testY,predict))
print(confusion_matrix(testY, predict))

              precision    recall  f1-score   support

         0.0       0.89      0.85      0.87        40
         1.0       0.73      0.80      0.76        20

    accuracy                           0.83        60
   macro avg       0.81      0.82      0.82        60
weighted avg       0.84      0.83      0.84        60

[[34  6]
 [ 4 16]]


In [59]:
svc_accuracy_score = accuracy_score(testY, predict)
print("Best accuracy for SVM:", svc_accuracy_score)

Best accuracy for SVM: 0.8333333333333334


## p22  Linear Kernel Comparsion on Grid Search CV 9 Features

In [60]:
p22 = {'C':[1.0, 10.0, 100.0, 1000.0],
              'gamma':[1,0.1,0.01]}

In [61]:
import random
random.seed(101)
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV

model = svm.SVC(kernel='linear')
clf22 = GridSearchCV(model, p22, verbose=2 ,cv=10)

In [62]:
clf22.fit(trainX, trainY)

Fitting 10 folds for each of 12 candidates, totalling 120 fits
[CV] END .....................................C=1.0, gamma=1; total time=   0.0s
[CV] END .....................................C=1.0, gamma=1; total time=   0.0s
[CV] END .....................................C=1.0, gamma=1; total time=   0.0s
[CV] END .....................................C=1.0, gamma=1; total time=   0.0s
[CV] END .....................................C=1.0, gamma=1; total time=   0.0s
[CV] END .....................................C=1.0, gamma=1; total time=   0.0s
[CV] END .....................................C=1.0, gamma=1; total time=   0.0s
[CV] END .....................................C=1.0, gamma=1; total time=   0.0s
[CV] END .....................................C=1.0, gamma=1; total time=   0.0s
[CV] END .....................................C=1.0, gamma=1; total time=   0.0s
[CV] END ...................................C=1.0, gamma=0.1; total time=   0.0s
[CV] END ...................................C=

[CV] END ................................C=1000.0, gamma=0.1; total time=   0.2s
[CV] END ................................C=1000.0, gamma=0.1; total time=   0.3s
[CV] END ................................C=1000.0, gamma=0.1; total time=   0.1s
[CV] END ................................C=1000.0, gamma=0.1; total time=   0.4s
[CV] END ................................C=1000.0, gamma=0.1; total time=   0.1s
[CV] END ................................C=1000.0, gamma=0.1; total time=   0.4s
[CV] END ................................C=1000.0, gamma=0.1; total time=   0.2s
[CV] END ................................C=1000.0, gamma=0.1; total time=   0.2s
[CV] END ................................C=1000.0, gamma=0.1; total time=   0.1s
[CV] END ...............................C=1000.0, gamma=0.01; total time=   0.2s
[CV] END ...............................C=1000.0, gamma=0.01; total time=   0.2s
[CV] END ...............................C=1000.0, gamma=0.01; total time=   0.3s
[CV] END ...................

GridSearchCV(cv=10, estimator=SVC(kernel='linear'),
             param_grid={'C': [1.0, 10.0, 100.0, 1000.0],
                         'gamma': [1, 0.1, 0.01]},
             verbose=2)

In [63]:
svc_best_param22 = clf22.best_params_
print(" Objective 2 to achieve Best parameter for SVM:", svc_best_param22)

 Objective 2 to achieve Best parameter for SVM: {'C': 1.0, 'gamma': 1}


In [64]:
import random
random.seed(101)
predict22 = clf22.predict(testX)
print(classification_report(testY,predict22))
print(confusion_matrix(testY, predict22))

              precision    recall  f1-score   support

         0.0       0.89      0.85      0.87        40
         1.0       0.73      0.80      0.76        20

    accuracy                           0.83        60
   macro avg       0.81      0.82      0.82        60
weighted avg       0.84      0.83      0.84        60

[[34  6]
 [ 4 16]]


In [65]:
svc_accuracy_score22 = accuracy_score(testY, predict22)
print("Best accuracy for SVM Linear Kernel with 9 Features:", svc_accuracy_score22)

Best accuracy for SVM Linear Kernel with 9 Features: 0.8333333333333334


## Feature Importance ranking based on SVM Top 7 ;Remove sex ,age,cp,trestbps,chol,fbs


In [66]:
import random
random.seed(101)
y = data['target'].copy()
x = data.drop(['target','sex','age','cp','trestbps','chol','fbs'], axis=1).copy()
y

xpy=x
ypy=y

In [67]:
x = np.array(x, dtype='float32')
y = np.array(y, dtype='float32')

In [68]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
std_scaler = StandardScaler()

In [69]:
import random
random.seed(101)
from sklearn.model_selection import train_test_split
(trainX, testX, trainY, testY) = train_test_split(x,y, test_size=0.2, random_state=101)

trainX=std_scaler.fit_transform(trainX)
testX=std_scaler.transform(testX)

In [70]:
QC = pd.DataFrame(trainX)

In [71]:
parameters = {'kernel':('rbf', 'linear'), 'C':[1.0, 10.0, 100.0, 1000.0],
              'gamma':[1,0.1,0.01]}

In [72]:
import random
random.seed(101)
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV

model = svm.SVC()
clf = GridSearchCV(model, parameters, verbose=2 ,cv=10)

In [73]:
clf.fit(trainX, trainY)

Fitting 10 folds for each of 24 candidates, totalling 240 fits
[CV] END .........................C=1.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=1.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=1.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=1.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=1.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=1.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=1.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=1.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=1.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=1.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END ......................C=1.0, gamma=1, kernel=linear; total time=   0.0s
[CV] END ......................C=1.0, gamma=1,

[CV] END ..................C=10.0, gamma=0.01, kernel=linear; total time=   0.0s
[CV] END ..................C=10.0, gamma=0.01, kernel=linear; total time=   0.0s
[CV] END ..................C=10.0, gamma=0.01, kernel=linear; total time=   0.0s
[CV] END ..................C=10.0, gamma=0.01, kernel=linear; total time=   0.0s
[CV] END ..................C=10.0, gamma=0.01, kernel=linear; total time=   0.0s
[CV] END ..................C=10.0, gamma=0.01, kernel=linear; total time=   0.0s
[CV] END ..................C=10.0, gamma=0.01, kernel=linear; total time=   0.0s
[CV] END ..................C=10.0, gamma=0.01, kernel=linear; total time=   0.0s
[CV] END ..................C=10.0, gamma=0.01, kernel=linear; total time=   0.0s
[CV] END .......................C=100.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .......................C=100.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .......................C=100.0, gamma=1, kernel=rbf; total time=   0.0s
[CV] END ...................

[CV] END .................C=1000.0, gamma=0.1, kernel=linear; total time=   0.3s
[CV] END .................C=1000.0, gamma=0.1, kernel=linear; total time=   0.1s
[CV] END .................C=1000.0, gamma=0.1, kernel=linear; total time=   0.0s
[CV] END .................C=1000.0, gamma=0.1, kernel=linear; total time=   0.1s
[CV] END .................C=1000.0, gamma=0.1, kernel=linear; total time=   0.1s
[CV] END .................C=1000.0, gamma=0.1, kernel=linear; total time=   0.1s
[CV] END ...................C=1000.0, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END ...................C=1000.0, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END ...................C=1000.0, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END ...................C=1000.0, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END ...................C=1000.0, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END ...................C=1000.0, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END ...................

GridSearchCV(cv=10, estimator=SVC(),
             param_grid={'C': [1.0, 10.0, 100.0, 1000.0],
                         'gamma': [1, 0.1, 0.01], 'kernel': ('rbf', 'linear')},
             verbose=2)

In [74]:
svc_best_param = clf.best_params_
print(" Objective 2 to achieve Best parameter for SVM:", svc_best_param)

 Objective 2 to achieve Best parameter for SVM: {'C': 1.0, 'gamma': 0.01, 'kernel': 'rbf'}


In [75]:
import random
random.seed(101)
predict = clf.predict(testX)
print(classification_report(testY,predict))
print(confusion_matrix(testY, predict))

              precision    recall  f1-score   support

         0.0       0.87      0.85      0.86        40
         1.0       0.71      0.75      0.73        20

    accuracy                           0.82        60
   macro avg       0.79      0.80      0.80        60
weighted avg       0.82      0.82      0.82        60

[[34  6]
 [ 5 15]]


In [76]:
svc_accuracy_score = accuracy_score(testY, predict)
print("Best accuracy for SVM:", svc_accuracy_score)

Best accuracy for SVM: 0.8166666666666667


## Linear Kernel Comparsion on Grid Search CV 7 Features

In [77]:
p22 = {'C':[1.0, 10.0, 100.0, 1000.0],
              'gamma':[1,0.1,0.01]}

In [78]:
import random
random.seed(101)
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV

model = svm.SVC(kernel='linear')
clf22 = GridSearchCV(model, p22, verbose=2 ,cv=10)

In [79]:
model

SVC(kernel='linear')

In [80]:
clf22.fit(trainX, trainY)

Fitting 10 folds for each of 12 candidates, totalling 120 fits
[CV] END .....................................C=1.0, gamma=1; total time=   0.0s
[CV] END .....................................C=1.0, gamma=1; total time=   0.0s
[CV] END .....................................C=1.0, gamma=1; total time=   0.0s
[CV] END .....................................C=1.0, gamma=1; total time=   0.0s
[CV] END .....................................C=1.0, gamma=1; total time=   0.0s
[CV] END .....................................C=1.0, gamma=1; total time=   0.0s
[CV] END .....................................C=1.0, gamma=1; total time=   0.0s
[CV] END .....................................C=1.0, gamma=1; total time=   0.0s
[CV] END .....................................C=1.0, gamma=1; total time=   0.0s
[CV] END .....................................C=1.0, gamma=1; total time=   0.0s
[CV] END ...................................C=1.0, gamma=0.1; total time=   0.0s
[CV] END ...................................C=

[CV] END ................................C=1000.0, gamma=0.1; total time=   0.1s
[CV] END ................................C=1000.0, gamma=0.1; total time=   0.1s
[CV] END ................................C=1000.0, gamma=0.1; total time=   0.3s
[CV] END ................................C=1000.0, gamma=0.1; total time=   0.1s
[CV] END ................................C=1000.0, gamma=0.1; total time=   0.0s
[CV] END ................................C=1000.0, gamma=0.1; total time=   0.1s
[CV] END ................................C=1000.0, gamma=0.1; total time=   0.1s
[CV] END ................................C=1000.0, gamma=0.1; total time=   0.1s
[CV] END ...............................C=1000.0, gamma=0.01; total time=   0.1s
[CV] END ...............................C=1000.0, gamma=0.01; total time=   0.1s
[CV] END ...............................C=1000.0, gamma=0.01; total time=   0.1s
[CV] END ...............................C=1000.0, gamma=0.01; total time=   0.1s
[CV] END ...................

GridSearchCV(cv=10, estimator=SVC(kernel='linear'),
             param_grid={'C': [1.0, 10.0, 100.0, 1000.0],
                         'gamma': [1, 0.1, 0.01]},
             verbose=2)

In [81]:
svc_best_param22 = clf22.best_params_
print(" Objective 2 to achieve Best parameter for SVM:", svc_best_param22)

 Objective 2 to achieve Best parameter for SVM: {'C': 10.0, 'gamma': 1}


In [82]:
import random
random.seed(101)
predict22 = clf22.predict(testX)
print(classification_report(testY,predict22))
print(confusion_matrix(testY, predict22))

              precision    recall  f1-score   support

         0.0       0.87      0.85      0.86        40
         1.0       0.71      0.75      0.73        20

    accuracy                           0.82        60
   macro avg       0.79      0.80      0.80        60
weighted avg       0.82      0.82      0.82        60

[[34  6]
 [ 5 15]]


In [83]:
svc_accuracy_score22 = accuracy_score(testY, predict22)
print("Best accuracy for SVM Linear Kernel with 7 Features:", svc_accuracy_score22)

Best accuracy for SVM Linear Kernel with 7 Features: 0.8166666666666667
