<h3 style='color:violet'>MODEL SELECTION</h3>

In [1]:
# importing dependencies 
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import warnings
warnings.simplefilter('ignore')

In [2]:
# importing model
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Load the heart disease dataset from 'heart.csv' (the path is relative to the
# notebook's working directory) and preview the first rows.
df = pd.read_csv('heart.csv')
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
# Dataset dimensions as (number of rows, number of columns)
df.shape

(303, 14)

In [5]:
# Count the missing (null) values in each column
df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [6]:
# Class balance: number of rows for each value of the 'target' column
df['target'].value_counts()

1    165
0    138
Name: target, dtype: int64

In [7]:
# Split the frame into the label vector ('target') and the feature matrix
# (every remaining column). `columns=` already selects the column axis, so no
# separate `axis` argument is needed.
y = df['target']
X = df.drop(columns='target')

In [8]:
# Preview the feature columns to confirm that 'target' is no longer among them
X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [9]:
# Convert the feature frame and the target series to NumPy arrays.
# NOTE(review): this rebinds X and y, so the `X.head()` cell above only works
# if it is run before this cell (an ndarray has no `.head()`).
X = np.asarray(X)
y = np.asarray(y)

### Model Selection
#### Comparing models with default hyperparameter values using cross-validation

In [10]:
# Candidate classifiers compared with default hyperparameters, apart from the
# arguments set explicitly below.
models = [
    LogisticRegression(max_iter=1000),
    SVC(kernel='linear'),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=0),
]

In [11]:
def compare_models_cv(list_model=None, cv=5):
    """Print the cross-validation scores of each candidate model.

    For every model, ``cross_val_score`` is run on the notebook-level ``X``
    and ``y``; the per-fold scores and their mean (as a percentage rounded to
    two decimals) are printed.

    Parameters
    ----------
    list_model : iterable of estimators, optional
        Models to evaluate. When omitted, the notebook-level ``models`` list
        is used, looked up at call time. ``models`` is rebound later in this
        notebook, so pass the list explicitly to make the comparison
        independent of cell execution order.
    cv : int, default 5
        Forwarded to ``cross_val_score`` as its ``cv`` argument.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # Fall back to the global list only when the caller did not supply one.
    candidates = models if list_model is None else list_model
    for model in candidates:
        cv_score = cross_val_score(model, X, y, cv=cv)
        mean_score = round(cv_score.mean()*100,2)
        print(f'Cross Validation accuracy score for the model {model} = {cv_score}')
        print(f'Accuracy score of the model {model} = {mean_score}%')
        print('-'*100)

In [12]:
# Run the comparison for every estimator currently in `models`
compare_models_cv()

Cross Validation accuracy score for the model LogisticRegression(max_iter=1000) = [0.80327869 0.86885246 0.85245902 0.86666667 0.75      ]
Accuracy score of the model LogisticRegression(max_iter=1000) = 82.83%
----------------------------------------------------------------------------------------------------
Cross Validation accuracy score for the model SVC(kernel='linear') = [0.81967213 0.8852459  0.80327869 0.86666667 0.76666667]
Accuracy score of the model SVC(kernel='linear') = 82.83%
----------------------------------------------------------------------------------------------------
Cross Validation accuracy score for the model KNeighborsClassifier() = [0.60655738 0.6557377  0.57377049 0.73333333 0.65      ]
Accuracy score of the model KNeighborsClassifier() = 64.39%
----------------------------------------------------------------------------------------------------
Cross Validation accuracy score for the model RandomForestClassifier(random_state=0) = [0.85245902 0.90163934 0.819

##### Inference: for the heart disease dataset, the **Random Forest Classifier** has the highest accuracy with default hyperparameters

<h3 style='color:violet'>COMPARING MODELS WITH TUNED HYPERPARAMETERS USING GRIDSEARCHCV</h3>

In [13]:
# Base estimators for the grid search. This rebinds the `models` list defined
# earlier. SVC is created without a kernel argument here; its kernel is one of
# the values searched in the `parameter` grid.
# Order matters: modelSelection pairs these estimators with the entries of
# `parameter` by position.
models = [
    LogisticRegression(max_iter=10000),
    SVC(),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=0)
]

In [18]:
# Hyperparameter grids for GridSearchCV, one entry per model.
# The entries must stay in the same order as the `models` list, because models
# and grids are paired by position.
parameter = {
    # grid for LogisticRegression
    'log_reg_hyperparameters': {'C': [1, 5, 10, 20]},
    # grid for SVC
    'svc_hyperparameters': {
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'C': [1, 5, 10, 20],
    },
    # grid for KNeighborsClassifier
    'KNN_hyperparameters': {'n_neighbors': [3, 5, 10]},
    # grid for RandomForestClassifier
    'random_forest_hyperparameters': {'n_estimators': [10, 20, 50, 100]},
}

In [19]:
# Names of the hyperparameter grids, in the dict's insertion order
model_keys = list(parameter)        

In [29]:
# Accumulates one record per tuned model; re-run this line to start over.
result = []


def modelSelection(list_model, hyperparameters):
    """Grid-search each model over its own hyperparameter grid.

    Models and grids are paired by position: the first model in
    ``list_model`` is tuned with the first grid in ``hyperparameters``
    (dict insertion order), the second with the second, and so on. Each
    search uses ``GridSearchCV`` with ``cv=5`` on the notebook-level ``X``
    and ``y``.

    Parameters
    ----------
    list_model : iterable of estimators
        Models to tune.
    hyperparameters : dict
        Maps a descriptive name to a ``GridSearchCV`` parameter grid. Entries
        must be listed in the same order as ``list_model``; surplus trailing
        grids are ignored.

    Returns
    -------
    None
        One record per model (``'Model used'``, ``'Highest score'``,
        ``'Best parameters'``) is appended to the notebook-level ``result``
        list. Records accumulate across calls.

    Raises
    ------
    ValueError
        If there are fewer grids than models.
    """
    # Use the arguments rather than the notebook globals so the function
    # tunes exactly what the caller passed in.
    candidates = list(list_model)
    grids = list(hyperparameters.values())
    if len(grids) < len(candidates):
        raise ValueError(
            f'got {len(candidates)} models but only {len(grids)} hyperparameter grids'
        )

    for model, params in zip(candidates, grids):
        print(model)
        print(params)
        print('-'*80)
        clf = GridSearchCV(model,params,cv=5)
        clf.fit(X,y)
        result.append({
            'Model used':model,
            'Highest score':clf.best_score_,
            'Best parameters':clf.best_params_
        })

In [30]:
# Tune every estimator in `models` with its grid from `parameter`
modelSelection(models,parameter)

LogisticRegression(max_iter=10000)
{'C': [1, 5, 10, 20]}
--------------------------------------------------------------------------------
SVC()
{'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'C': [1, 5, 10, 20]}
--------------------------------------------------------------------------------
KNeighborsClassifier()
{'n_neighbors': [3, 5, 10]}
--------------------------------------------------------------------------------
RandomForestClassifier(random_state=0)
{'n_estimators': [10, 20, 50, 100]}
--------------------------------------------------------------------------------


In [31]:
# Raw grid-search records collected by modelSelection
result

[{'Model used': LogisticRegression(max_iter=10000),
  'Highest score': 0.8348633879781421,
  'Best parameters': {'C': 5}},
 {'Model used': SVC(),
  'Highest score': 0.8283060109289618,
  'Best parameters': {'C': 1, 'kernel': 'linear'}},
 {'Model used': KNeighborsClassifier(),
  'Highest score': 0.643879781420765,
  'Best parameters': {'n_neighbors': 5}},
 {'Model used': RandomForestClassifier(random_state=0),
  'Highest score': 0.838087431693989,
  'Best parameters': {'n_estimators': 100}}]

In [32]:
# Tabulate the grid-search records, one row per model
pd.DataFrame(result)

Unnamed: 0,Model used,Highest score,Best parameters
0,LogisticRegression(max_iter=10000),0.834863,{'C': 5}
1,SVC(),0.828306,"{'C': 1, 'kernel': 'linear'}"
2,KNeighborsClassifier(),0.64388,{'n_neighbors': 5}
3,RandomForestClassifier(random_state=0),0.838087,{'n_estimators': 100}
