# Chapter 9

# Evaluate the Performance of Machine Learning Algorithms with Resampling

## Split into Train and Test Sets

In [1]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
filename = 'diabetes.csv'
data = read_csv(filename)
array = data.values # convert to numpy array
X = array[:,0:8] # first 8 column
Y = array[:,8] # last column
test_size = 0.33
seed = 7
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
model = LogisticRegression(solver='liblinear')
model.fit(X_train, Y_train) # fit the model
result = model.score(X_test, Y_test) # evaluate the model
print("Accuracy: %.3f%%" % (result*100.0)) # print the accuracy

Accuracy: 75.591%


## K-fold Cross Validation

In [4]:
from pandas import read_csv
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
filename = 'diabetes.csv'
data = read_csv(filename)
array = data.values # convert to numpy array
X = array[:,0:8] # first 8 column
Y = array[:,8] # last column
test_size = 0.33
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
model = LogisticRegression(solver='liblinear')
result = cross_val_score(model, X, Y, cv=kfold) # evaluate the model
print("Accuracy: %.3f%% (%.3f%%)" % (result.mean()*100, result.std()*100)) # print the accuracy

Accuracy: 77.086% (5.091%)


## Leave One Out Cross Validation

In [5]:
from pandas import read_csv
from sklearn.model_selection import cross_val_score, LeaveOneOut
from sklearn.linear_model import LogisticRegression
filename = 'diabetes.csv'
data = read_csv(filename)
array = data.values # convert to numpy array
X = array[:,0:8] # first 8 column
Y = array[:,8] # last column
test_size = 0.33
num_folds = 10
seed = 7
loocv = LeaveOneOut() # create the Leave-One-Out cross-validator
model = LogisticRegression(solver='liblinear')
result = cross_val_score(model, X, Y, cv=loocv) # evaluate the model
print("Accuracy: %.3f%% (%.3f%%)" % (result.mean()*100, result.std()*100)) # print the accuracy

Accuracy: 76.823% (42.196%)


## Repeated Random Test-Train Splits

In [7]:
from pandas import read_csv
from sklearn.model_selection import KFold, cross_val_score, ShuffleSplit
from sklearn.linear_model import LogisticRegression
filename = 'diabetes.csv'
data = read_csv(filename)
array = data.values # convert to numpy array
X = array[:,0:8] # first 8 column
Y = array[:,8] # last column
test_size = 0.33
num_folds = 10
seed = 7
kfold = ShuffleSplit(n_splits=num_folds, random_state=seed)
model = LogisticRegression(solver='liblinear')
result = cross_val_score(model, X, Y, cv=kfold) # evaluate the model
print("Accuracy: %.3f%% (%.3f%%)" % (result.mean()*100, result.std()*100)) # print the accuracy

Accuracy: 78.701% (5.392%)
