# Chapter 9
# Evaluate the Performance of Machine Learning Algorithms with Resampling

#  9.1 Evaluate Machine Learning Algorithms

**1. Train and Test Sets**

**2. k-fold Cross Validation**

**3. Leave One Out Cross Validation**

**4. Repeated Random Test-Train Splits**

In [22]:
import pandas as pd

In [23]:
# Column labels applied when the CSV is read: eight input columns followed
# by the 'class' label column.
names = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',
]

In [24]:
# Read the CSV into a DataFrame, labelling its columns with `names`.
# header=None is what pandas already assumes when `names` is passed; it is
# spelled out here so the first row is visibly treated as data.
data = pd.read_csv(
    'pima-indians-diabetes.csv',
    header=None,
    names=names,
)

In [25]:
# Bare expression: the notebook renders the DataFrame as this cell's output.
data

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


#  9.2 Split into Train and Test Sets

In [26]:
import warnings

In [27]:
# Install a blanket 'ignore' filter: every warning category is silenced from
# this point on, in all later cells.
warnings.simplefilter('ignore')

In [30]:
 # Single train/test split: fit on one part of the rows, score on the rest.
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import train_test_split

 # The first eight columns are the inputs; the ninth is the label.
 array = data.values
 X, Y = array[:, :8], array[:, 8]

 # Hold out 33% of the rows; the fixed seed keeps the split repeatable.
 seed = 7
 test_size = 0.33
 X_train, X_test, Y_train, Y_test = train_test_split(
     X, Y, random_state=seed, test_size=test_size
 )

 # fit() returns the estimator, so chaining gives the same fitted model.
 model = LogisticRegression().fit(X_train, Y_train)
 result = model.score(X_test, Y_test)
 print(result * 100.0)

78.74015748031496


#  9.3 K-fold Cross Validation

In [38]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
import numpy as np

# Inputs are the first eight columns of `array`; the label is the ninth.
X, Y = array[:, :8], array[:, 8]

seed = 8
num_folds = 5

# Shuffle the rows with a fixed seed before cutting them into folds.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
model = LogisticRegression()
results = cross_val_score(model, X, Y, cv=kfold)

# Report the mean of the per-fold scores as a percentage.
print("Accuracy: ", np.mean(results) * 100.0)

Accuracy:  76.69213139801376


#  9.4 Leave One Out Cross Validation

In [39]:
from sklearn.model_selection import LeaveOneOut

In [40]:
 # Leave-one-out evaluation: the splitter passed as `cv` is a LeaveOneOut.
 X, Y = array[:, :8], array[:, 8]
 model = LogisticRegression()
 loocv = LeaveOneOut()
 results = cross_val_score(model, X, Y, cv=loocv)
 mean_accuracy = results.mean()
 print("Accuracy: ", mean_accuracy * 100.0)

Accuracy:  77.86458333333334


In [41]:
# Bare expression: shows the DataFrame's (rows, columns) dimensions.
data.shape

(768, 9)

In [42]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
import numpy as np

# K-fold with as many folds as there are rows, so every test fold holds
# exactly one sample.
X = array[:, 0:8]
Y = array[:, 8]

# Take the fold count from the data rather than hard-coding the row count,
# so this cell keeps one sample per fold if the dataset size changes.
num_folds = X.shape[0]
seed = 8

kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
model = LogisticRegression()
results = cross_val_score(model, X, Y, cv=kfold)
print("Accuracy: ", results.mean() * 100.0)

Accuracy:  77.86458333333334


# 9.5 Repeated Random Test-Train Splits

In [46]:
 from sklearn.model_selection import ShuffleSplit
 from sklearn.model_selection import cross_val_score
 from sklearn.linear_model import LogisticRegression

In [47]:
 # Inputs and split settings read by the ShuffleSplit cells that follow.
 array = data.values
 X, Y = array[:, :8], array[:, 8]
 n_splits, test_size = 10, 0.33


In [48]:
 # Evaluate the model on n_splits seeded random train/test partitions and
 # print the array of per-split scores.
 seed = 7
 model = LogisticRegression()
 kfold = ShuffleSplit(
     n_splits=n_splits,
     random_state=seed,
     test_size=test_size,
 )
 results = cross_val_score(model, X, Y, cv=kfold)
 print(results)

[0.78740157 0.73622047 0.7992126  0.79133858 0.76377953 0.80708661
 0.72834646 0.79527559 0.76771654 0.72834646]


In [49]:
# Mean of the per-split scores, scaled to a percentage.
print(100.0 * results.mean())

77.04724409448819


In [50]:
 # Repeat the random-split evaluation with 20 splits, same seed and test size.
 n_splits = 20
 seed = 7
 model = LogisticRegression()
 kfold = ShuffleSplit(
     n_splits=n_splits,
     random_state=seed,
     test_size=test_size,
 )
 results = cross_val_score(model, X, Y, cv=kfold)
 print(results)

[0.78740157 0.73622047 0.7992126  0.79133858 0.76377953 0.80708661
 0.72834646 0.79527559 0.76771654 0.72834646 0.77952756 0.7519685
 0.7992126  0.75590551 0.75590551 0.73228346 0.74015748 0.76377953
 0.75590551 0.78346457]


In [51]:
# Mean of the per-split scores, scaled to a percentage.
print(100.0 * results.mean())

76.61417322834646
