# Chapter 10

# Machine Learning Algorithm Performance Metrics

## Classification Metrics

### Classification Accuracy

In [3]:
from pandas import read_csv
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
# Evaluation settings: 10 shuffled folds with a fixed seed so the splits are repeatable.
seed = 7
num_folds = 10
scoring = 'accuracy'

# Load the CSV and separate the inputs from the class label.
filename = 'diabetes.csv'
data = read_csv(filename)
array = data.values  # work on the underlying NumPy array
X, Y = array[:, :8], array[:, 8]  # columns 0-7 are the inputs, column index 8 is the target

# Score a logistic regression on every fold, then summarise across folds.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
model = LogisticRegression(solver='liblinear')
result = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)  # one accuracy value per fold
print("Accuracy: %.3f (%.3f)" % (result.mean(), result.std()))  # mean (std) over the folds

Accuracy: 0.771 (0.051)


### Logarithmic Loss

In [4]:
from pandas import read_csv
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
# Cross-validated logarithmic loss of a logistic regression on the diabetes data.
filename = 'diabetes.csv'
data = read_csv(filename)
array = data.values # convert to numpy array
X = array[:,0:8] # first 8 columns are the inputs
Y = array[:,8] # column index 8 is the class label
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True) # shuffled k-fold with a fixed seed
model = LogisticRegression(solver='liblinear')
# scikit-learn scorers follow "higher is better", so this scorer returns the
# log loss negated: values are <= 0 and closer to 0 means a better model.
scoring = 'neg_log_loss'
result = cross_val_score(model, X, Y, cv=kfold, scoring=scoring) # one score per fold
# The label names the metric that was actually computed (log loss, not accuracy).
print("Logloss: %.3f (%.3f)" % (result.mean(), result.std())) # mean (std) over the folds

Accuracy: -0.494 (0.042)


### Area Under ROC Curve

In [5]:
from pandas import read_csv
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
# Cross-validated area under the ROC curve of a logistic regression on the diabetes data.
filename = 'diabetes.csv'
data = read_csv(filename)
array = data.values # convert to numpy array
X = array[:,0:8] # first 8 columns are the inputs
Y = array[:,8] # column index 8 is the class label
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True) # shuffled k-fold with a fixed seed
model = LogisticRegression(solver='liblinear')
scoring = 'roc_auc'
result = cross_val_score(model, X, Y, cv=kfold, scoring=scoring) # one AUC value per fold
# The label names the metric that was actually computed (ROC AUC, not accuracy).
print("AUC: %.3f (%.3f)" % (result.mean(), result.std())) # mean (std) over the folds

Accuracy: 0.826 (0.050)


### Confusion Matrix

In [6]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
# Hold-out settings: 33% of the rows form the test split; the seed fixes the split.
seed = 7
test_size = 0.33

# Load the CSV and separate the inputs from the class label.
filename = 'diabetes.csv'
data = read_csv(filename)
array = data.values  # work on the underlying NumPy array
X, Y = array[:, :8], array[:, 8]  # columns 0-7 are the inputs, column index 8 is the target

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=seed, test_size=test_size
)

# Fit on the training split only, then predict the held-out rows.
model = LogisticRegression(solver='liblinear')
model.fit(X_train, Y_train)
predicted = model.predict(X_test)

# True labels are passed first, predictions second.
matrix = confusion_matrix(Y_test, predicted)
print(matrix)

[[141  21]
 [ 41  51]]


### Classification Report

In [7]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
# Hold-out settings: 33% of the rows form the test split; the seed fixes the split.
seed = 7
test_size = 0.33

# Load the CSV and separate the inputs from the class label.
filename = 'diabetes.csv'
data = read_csv(filename)
array = data.values  # work on the underlying NumPy array
X, Y = array[:, :8], array[:, 8]  # columns 0-7 are the inputs, column index 8 is the target

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=seed, test_size=test_size
)

# Fit on the training split only, then predict the held-out rows.
model = LogisticRegression(solver='liblinear')
model.fit(X_train, Y_train)
predicted = model.predict(X_test)

# Text summary of the test-split predictions (true labels first, predictions second).
report = classification_report(Y_test, predicted)
print(report)

              precision    recall  f1-score   support

         0.0       0.77      0.87      0.82       162
         1.0       0.71      0.55      0.62        92

    accuracy                           0.76       254
   macro avg       0.74      0.71      0.72       254
weighted avg       0.75      0.76      0.75       254



## Regression Metrics

### Mean Absolute Error

In [13]:
from pandas import read_csv
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
# Cross-validated mean absolute error of a linear regression.
# NOTE(review): column index 8 of diabetes.csv is used here as a regression target,
# while the classification cells treat the same column as a class label -- confirm
# that this is the intended dataset for the regression metrics.
filename = 'diabetes.csv'
data = read_csv(filename)
array = data.values # convert to numpy array
X = array[:,0:8] # first 8 columns are the inputs
Y = array[:,8] # column index 8 is the target
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True) # shuffled k-fold with a fixed seed
model = LinearRegression()
# scikit-learn scorers follow "higher is better", so this scorer returns the MAE
# negated: the printed mean is <= 0 and its magnitude is the actual error.
scoring = 'neg_mean_absolute_error'
result = cross_val_score(model, X, Y, cv=kfold, scoring=scoring) # one score per fold
print("MAE: %.3f (%.3f)" % (result.mean(), result.std())) # mean (std) of the negated MAE over the folds

MAE: -0.337 (0.020)


### Mean Squared Error

In [14]:
from pandas import read_csv
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
# Evaluation settings: 10 shuffled folds with a fixed seed so the splits are repeatable.
seed = 7
num_folds = 10
# This scorer reports the mean squared error negated, so the printed mean is <= 0.
scoring = 'neg_mean_squared_error'

# Load the CSV and separate the inputs from the target column.
filename = 'diabetes.csv'
data = read_csv(filename)
array = data.values  # work on the underlying NumPy array
X, Y = array[:, :8], array[:, 8]  # columns 0-7 are the inputs, column index 8 is the target

# Score a linear regression on every fold, then summarise across folds.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
model = LinearRegression()
result = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)  # one score per fold
print("MSE: %.3f (%.3f)" % (result.mean(), result.std()))  # mean (std) over the folds

MSE: -0.163 (0.020)


### $R^2$ Metric

In [15]:
from pandas import read_csv
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
# Cross-validated R^2 (coefficient of determination) of a linear regression.
# NOTE(review): column index 8 of diabetes.csv is used here as a regression target,
# while the classification cells treat the same column as a class label -- confirm
# that this is the intended dataset for the regression metrics.
filename = 'diabetes.csv'
data = read_csv(filename)
array = data.values # convert to numpy array
X = array[:,0:8] # first 8 columns are the inputs
Y = array[:,8] # column index 8 is the target
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True) # shuffled k-fold with a fixed seed
model = LinearRegression()
scoring = 'r2'
result = cross_val_score(model, X, Y, cv=kfold, scoring=scoring) # one R^2 value per fold
print("R^2: %.3f (%.3f)" % (result.mean(), result.std())) # mean (std) of R^2 over the folds

R^2: 0.264 (0.102)
