# **K-Fold cross validation**

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
# loading the csv data to a Pandas DataFrame
heart_data = pd.read_csv('/content/heart.csv')

In [4]:
heart_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [5]:
# number of rows and columns in the dataset
heart_data.shape

(1025, 14)

In [6]:
# checking for missing values
heart_data.isnull().sum()

Unnamed: 0,0
age,0
sex,0
cp,0
trestbps,0
chol,0
fbs,0
restecg,0
thalach,0
exang,0
oldpeak,0


In [7]:
# checking the distribution of Target Variable
heart_data['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
1,526
0,499


1 --> Defective Heart

0 --> Healthy Heart

In [8]:
x=heart_data.drop(columns='target',axis=1)
y=heart_data['target']

In [9]:
print(x)
print(y)

      age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
0      52    1   0       125   212    0        1      168      0      1.0   
1      53    1   0       140   203    1        0      155      1      3.1   
2      70    1   0       145   174    0        1      125      1      2.6   
3      61    1   0       148   203    0        1      161      0      0.0   
4      62    0   0       138   294    1        1      106      0      1.9   
...   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
1020   59    1   1       140   221    0        1      164      1      0.0   
1021   60    1   0       125   258    0        0      141      1      2.8   
1022   47    1   0       110   275    0        0      118      1      1.0   
1023   50    0   0       110   254    0        0      159      0      0.0   
1024   54    1   0       120   188    0        1      113      0      1.4   

      slope  ca  thal  
0         2   2     3  
1         0   0     3  
2  

**Train Test Split**

In [11]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,stratify=y,random_state=3)

In [12]:
print(x.shape,x_train.shape,x_test.shape)

(1025, 13) (820, 13) (205, 13)


# **Comparing the performance of the models**

In [14]:
models=[LogisticRegression(max_iter=1000),SVC(kernel='linear'),KNeighborsClassifier(),RandomForestClassifier()]

In [16]:
def compare_models_train_test():
  for model in models:
     # training the model
    model.fit(x_train, y_train)
    test_data_prediction = model.predict(x_test)
    accuracy=accuracy_score(y_test,test_data_prediction)
    print('Accuracy score of the :',model, '=',accuracy)

In [17]:
compare_models_train_test()

Accuracy score of the : LogisticRegression(max_iter=1000) = 0.8390243902439024
Accuracy score of the : SVC(kernel='linear') = 0.824390243902439
Accuracy score of the : KNeighborsClassifier() = 0.7560975609756098
Accuracy score of the : RandomForestClassifier() = 1.0


# **Cross Validation**


Logistic Regression

In [18]:
cv_score_lr=cross_val_score(LogisticRegression(max_iter=1000),x,y,cv=5)
print(cv_score_lr)
mean_accuracy_lr=sum(cv_score_lr)/len(cv_score_lr)
mean_accuracy_lr=mean_accuracy_lr*100
mean_accuracy_lr=round(mean_accuracy_lr,2)
print(mean_accuracy_lr)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[0.88292683 0.85853659 0.87804878 0.82439024 0.80487805]
84.98


Support Vector Classifier

In [19]:
cv_score_svc=cross_val_score(SVC(kernel='linear'),x,y,cv=5)

In [23]:
print(cv_score_svc)
mean_accuracy_svc=sum(cv_score_svc)/len(cv_score_svc)
mean_accuracy_svc=mean_accuracy_svc*100
mean_accuracy_svc=round(mean_accuracy_svc,2) #only 2 digit tak
print(mean_accuracy_svc)

[0.88292683 0.86829268 0.84390244 0.81463415 0.80487805]
84.29


**Creating a Function to compare the models**

In [24]:
# list of models
models =[LogisticRegression(max_iter=1000), SVC(kernel='linear'), KNeighborsClassifier(), RandomForestClassifier()]

In [25]:
def compare_model_cross_validation():
  for model in models:
    cv_score=cross_val_score(model,x,y,cv=5)
    mean_accuracy=sum(cv_score)/len(cv_score)
    mean_acuracy=mean_accuracy*100
    mean_accuracy=round(mean_accuracy,2)

    print('Cross Validation accuracies for ', model, '=  ', cv_score)
    print('Accuracy % of the ', model, mean_accuracy)
    print('----------------------------------------------')

In [26]:
compare_model_cross_validation()

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Cross Validation accuracies for  LogisticRegression(max_iter=1000) =   [0.88292683 0.85853659 0.87804878 0.82439024 0.80487805]
Accuracy % of the  LogisticRegression(max_iter=1000) 0.85
----------------------------------------------
Cross Validation accuracies for  SVC(kernel='linear') =   [0.88292683 0.86829268 0.84390244 0.81463415 0.80487805]
Accuracy % of the  SVC(kernel='linear') 0.84
----------------------------------------------
Cross Validation accuracies for  KNeighborsClassifier() =   [0.76585366 0.74634146 0.76097561 0.71219512 0.75121951]
Accuracy % of the  KNeighborsClassifier() 0.75
----------------------------------------------
Cross Validation accuracies for  RandomForestClassifier() =   [1.         1.         1.         1.         0.98536585]
Accuracy % of the  RandomForestClassifier() 1.0
----------------------------------------------
