In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Load the dataset; the path is relative to the notebook's working directory.
heart = pd.read_csv('heart.csv')

In [4]:
# Preview the first five rows of the loaded frame.
heart.head(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [5]:
# Count how many rows fall into each value of the 'target' column.
heart['target'].value_counts()

1    165
0    138
Name: target, dtype: int64

In [6]:
# Separate the features (every column except 'target') from the label column.
# `columns=` already names the axis being dropped from, so no `axis` argument
# is needed alongside it.
X = heart.drop(columns='target')
Y = heart['target']

In [7]:
# Convert features and labels to NumPy arrays; these are the arrays passed to
# the cross-validation and grid-search calls below.
X, Y = np.asarray(X), np.asarray(Y)

### Model Selection

### Compare models with default hyperparameters using cross-validation

In [11]:
# Candidate classifiers, evaluated exactly as configured here.
models = [
    LogisticRegression(max_iter=1000),
    SVC(kernel='linear'),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=0),
]


In [14]:
def compare_models_default(model_list=None, cv=3):
    """Print the cross-validated accuracy of each model as currently configured.

    Every estimator is scored with ``cross_val_score`` on the notebook-level
    ``X`` / ``Y`` arrays. For each one, the per-fold scores and their mean
    (as a percentage rounded to two decimals) are printed.

    Parameters
    ----------
    model_list : iterable of estimators, optional
        Models to evaluate. When omitted, the notebook-level ``models`` list
        is used as it is bound at call time, so ``compare_models_default()``
        keeps working as before. Pass the list explicitly to avoid depending
        on whichever cell last rebound ``models``.
    cv : int, default 3
        Forwarded unchanged to ``cross_val_score`` as its ``cv`` argument.

    Returns
    -------
    None
        Results are only printed.
    """
    if model_list is None:
        # Backward-compatible fallback to the global list.
        model_list = models

    for model in model_list:
        cv_score = cross_val_score(model, X, Y, cv=cv)
        # Mean fold accuracy expressed as a percentage.
        mean_acc = round(cv_score.mean() * 100, 2)

        print('Cross Val Acc in ', model, ' = ', cv_score)
        print('Accuracy Score of ', model, ' = ', mean_acc)
        print(' - - - - - - - - - - - - - - - - - - - - - - - ')

In [15]:
# Score every model in `models` with 3-fold cross-validation and print the results.
compare_models_default()

Cross Val Acc in  LogisticRegression(max_iter=1000)  =  [0.85148515 0.87128713 0.8019802 ]
Accuracy Score of  LogisticRegression(max_iter=1000)  =  84.16
 - - - - - - - - - - - - - - - - - - - - - - - 
Cross Val Acc in  SVC(kernel='linear')  =  [0.84158416 0.86138614 0.81188119]
Accuracy Score of  SVC(kernel='linear')  =  83.83
 - - - - - - - - - - - - - - - - - - - - - - - 
Cross Val Acc in  KNeighborsClassifier()  =  [0.62376238 0.6039604  0.66336634]
Accuracy Score of  KNeighborsClassifier()  =  63.04
 - - - - - - - - - - - - - - - - - - - - - - - 
Cross Val Acc in  RandomForestClassifier(random_state=0)  =  [0.84158416 0.85148515 0.77227723]
Accuracy Score of  RandomForestClassifier(random_state=0)  =  82.18
 - - - - - - - - - - - - - - - - - - - - - - - 


### Comparing models across different hyperparameters using GridSearchCV

In [45]:
# Base estimators for the grid search (this rebinds `models`). The values being
# tuned come from `model_hyparams`; models and grids are paired by position, so
# the order here must match the order of the grids.
models = [
    LogisticRegression(max_iter=10000),
    SVC(),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=0),
]

In [46]:
# One parameter grid per model, listed in the same order as `models`.
model_hyparams = {
    # LogisticRegression
    'lr_hyparams': {
        'C': [1, 3, 6, 9],
    },
    # SVC
    'svc_hyparams': {
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'C': [1, 3, 6, 9],
    },
    # KNeighborsClassifier
    'knn_hyparams': {
        'n_neighbors': [2, 4, 6, 9],
    },
    # RandomForestClassifier
    'rf_hyparams': {
        'n_estimators': [10, 20, 50, 100],
    },
}

In [47]:
# Show the names under which the parameter grids are stored.
print(model_hyparams.keys())

dict_keys(['lr_hyparams', 'svc_hyparams', 'knn_hyparams', 'rf_hyparams'])


In [48]:
# Show the parameter grids themselves, in insertion order.
print(model_hyparams.values())

dict_values([{'C': [1, 3, 6, 9]}, {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'C': [1, 3, 6, 9]}, {'n_neighbors': [2, 4, 6, 9]}, {'n_estimators': [10, 20, 50, 100]}])


In [49]:
# Keep the grid names in a list so a grid can be looked up by position.
model_keys = list(model_hyparams)
print(model_keys)

['lr_hyparams', 'svc_hyparams', 'knn_hyparams', 'rf_hyparams']


In [50]:
# Look up a single grid by its name.
model_hyparams['lr_hyparams']

{'C': [1, 3, 6, 9]}

In [51]:
# The SVC grid varies two parameters: the kernel and C.
model_hyparams['svc_hyparams']

{'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'C': [1, 3, 6, 9]}

In [52]:
# Positional access into the list of grid names.
model_keys[1]

'svc_hyparams'

In [53]:
# model_keys holds the keys of model_hyparams as a list, so a grid can be looked up by position

In [54]:
# Fetch a grid via its position in model_keys rather than by its name.
model_hyparams[model_keys[2]]

{'n_neighbors': [2, 4, 6, 9]}

#### Applying GridSearchCV

In [55]:
def model_selection(models_list, hyp_dict, cv=3):
    """Grid-search each model over its parameter grid and tabulate the winners.

    Models and grids are paired by position: the i-th model in ``models_list``
    is tuned over the i-th value of ``hyp_dict`` (in insertion order). The
    grids are taken from ``hyp_dict`` itself, so the function does not depend
    on any separately maintained list of keys. Each search is fitted on the
    notebook-level ``X`` / ``Y`` arrays.

    Parameters
    ----------
    models_list : iterable of estimators
        Base estimators to tune.
    hyp_dict : dict
        Maps an arbitrary name to a parameter grid accepted by
        ``GridSearchCV``; must contain exactly one grid per model.
    cv : int, default 3
        Forwarded unchanged to ``GridSearchCV`` as its ``cv`` argument.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns 'Model Used', 'Highest Score'
        (``best_score_``) and 'Best Params' (``best_params_``).

    Raises
    ------
    ValueError
        If the number of models differs from the number of grids, since the
        positional pairing would otherwise be silently wrong.
    """
    models_list = list(models_list)
    param_grids = list(hyp_dict.values())

    if len(models_list) != len(param_grids):
        raise ValueError(
            'models_list and hyp_dict must have the same length, got '
            f'{len(models_list)} models and {len(param_grids)} parameter grids'
        )

    result = []

    for model, params in zip(models_list, param_grids):
        print(model)
        print(params)
        print('- - - - - - - - - - - - - -')

        classify = GridSearchCV(model, params, cv=cv)
        classify.fit(X, Y)

        result.append({
            'Model Used': model,
            'Highest Score': classify.best_score_,
            'Best Params': classify.best_params_
        })

    # Explicit columns keep the table layout stable even when `result` is empty.
    result_df = pd.DataFrame(result, columns=['Model Used', 'Highest Score', 'Best Params'])

    return result_df

In [56]:
# Run the grid search for every model and display the resulting summary table.
model_selection(models, model_hyparams)

LogisticRegression(max_iter=10000)
{'C': [1, 3, 6, 9]}
- - - - - - - - - - - - - -
SVC()
{'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'C': [1, 3, 6, 9]}
- - - - - - - - - - - - - -
KNeighborsClassifier()
{'n_neighbors': [2, 4, 6, 9]}
- - - - - - - - - - - - - -
RandomForestClassifier(random_state=0)
{'n_estimators': [10, 20, 50, 100]}
- - - - - - - - - - - - - -


Unnamed: 0,Model Used,Highest Score,Best Params
0,LogisticRegression(max_iter=10000),0.841584,{'C': 1}
1,SVC(),0.848185,"{'C': 3, 'kernel': 'linear'}"
2,KNeighborsClassifier(),0.633663,{'n_neighbors': 6}
3,RandomForestClassifier(random_state=0),0.821782,{'n_estimators': 100}
