<a href="https://colab.research.google.com/github/Akif-Mufti/Machine-Learning-with-Python/blob/master/LearningML12Performance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Machine Learning Algorithm
# Performance Metrics

# Classification accuracy
# is the number of correct predictions made as a ratio of all predictions
# made. This is the most common evaluation metric for classification problems; it is also the most
# misused. It is really only suitable when there are an equal number of observations in each class
# (which is rarely the case) and when all predictions and prediction errors are equally important,
# which is often not the case.

# Cross Validation Classification Accuracy
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
# Load the Pima Indians diabetes data. Passing names= labels the columns and
# makes read_csv treat every row of the file as data.
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:, 0:8]  # first eight columns are the input features
Y = array[:, 8]  # ninth column ('class') is the target
# 10 consecutive (unshuffled) folds. random_state is deliberately not passed:
# it has no effect without shuffle=True, and scikit-learn >= 0.24 raises a
# ValueError for that combination.
kfold = KFold(n_splits=10)
# Pin the solver so the result does not depend on the library default, which
# changed from liblinear to lbfgs in scikit-learn 0.22.
model = LogisticRegression(solver='liblinear')
scoring = 'accuracy'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
# Report the mean and standard deviation of the per-fold accuracies.
print("Accuracy: %.3f (%.3f)" % (results.mean(), results.std()))

Accuracy: 0.770 (0.048)




In [5]:
# Logarithmic loss (or logloss) is a performance metric for evaluating the predictions of probabilities
# of membership to a given class. The scalar probability between 0 and 1 can be seen as a measure
# of confidence for a prediction by an algorithm. Predictions that are correct or incorrect are
# rewarded or punished proportionally to the confidence of the prediction.

# Cross Validation Classification LogLoss
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
# Load the Pima Indians diabetes data. Passing names= labels the columns and
# makes read_csv treat every row of the file as data.
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:, 0:8]  # first eight columns are the input features
Y = array[:, 8]  # ninth column ('class') is the target
# 10 consecutive (unshuffled) folds. random_state is deliberately not passed:
# it has no effect without shuffle=True, and scikit-learn >= 0.24 raises a
# ValueError for that combination.
kfold = KFold(n_splits=10)
# Pin the solver so the result does not depend on the library default, which
# changed from liblinear to lbfgs in scikit-learn 0.22.
model = LogisticRegression(solver='liblinear')
# The 'neg_log_loss' scorer returns the log loss negated so that larger is
# better; the printed mean is therefore negative and closer to 0 is better.
scoring = 'neg_log_loss'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
# Report the mean and standard deviation of the per-fold scores.
print("Logloss: %.3f (%.3f)" % (results.mean(), results.std()))

Logloss: -0.493 (0.047)




In [8]:
# Area under ROC Curve (or AUC for short) is a performance metric for binary classification
# problems. The AUC represents a model's ability to discriminate between positive and negative
# classes. An area of 1.0 represents a model that made all predictions perfectly. An area of
# 0.5 represents a model that is as good as random. ROC can be broken down into sensitivity
# and specificity. A binary classification problem is really a trade-off between sensitivity and
# specificity.

# Cross Validation Classification ROC AUC
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
# Load the Pima Indians diabetes data. Passing names= labels the columns and
# makes read_csv treat every row of the file as data.
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:, 0:8]  # first eight columns are the input features
Y = array[:, 8]  # ninth column ('class') is the target
# 10 consecutive (unshuffled) folds. random_state is deliberately not passed:
# it has no effect without shuffle=True, and scikit-learn >= 0.24 raises a
# ValueError for that combination.
kfold = KFold(n_splits=10)
# Pin the solver so the result does not depend on the library default, which
# changed from liblinear to lbfgs in scikit-learn 0.22.
model = LogisticRegression(solver='liblinear')
scoring = 'roc_auc'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
# Report the mean and standard deviation of the per-fold AUC values.
print("AUC: %.3f (%.3f)" % (results.mean(), results.std()))

AUC: 0.824 (0.041)




In [9]:
# The confusion matrix is a handy presentation of the accuracy of a model with two or more
# classes. The table presents predictions on the x-axis and actual outcomes on the y-axis. The
# cells of the table are the number of predictions made by a machine learning algorithm. For
# example, a machine learning algorithm can predict 0 or 1 and each prediction may actually have
# been a 0 or 1. Predictions for 0 that were actually 0 appear in the cell for prediction = 0 and
# actual = 0, whereas predictions for 0 that were actually 1 appear in the cell for prediction = 0
# and actual = 1. And so on.

# Classification Confusion Matrix (single train/test split)
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
# Load the Pima Indians diabetes data. Passing names= labels the columns and
# makes read_csv treat every row of the file as data.
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:, 0:8]  # first eight columns are the input features
Y = array[:, 8]  # ninth column ('class') is the target
# Hold out 33% of the rows for evaluation; the fixed seed makes the split
# repeatable.
test_size = 0.33
seed = 7
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=test_size, random_state=seed)
# Pin the solver so the result does not depend on the library default, which
# changed from liblinear to lbfgs in scikit-learn 0.22.
model = LogisticRegression(solver='liblinear')
model.fit(X_train, Y_train)
predicted = model.predict(X_test)
# confusion_matrix(y_true, y_pred): rows are the actual classes, columns are
# the predicted classes.
matrix = confusion_matrix(Y_test, predicted)
print(matrix)

[[141  21]
 [ 41  51]]




In [10]:
# Classication Report
# The scikit-learn library provides a convenience report when working on classication prob-
# lems to give you a quick idea of the accuracy of a model using a number of measures. The
# classification report() function displays the precision, recall, F1-score and support for each class

# Classification Report (single train/test split)
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
# Load the Pima Indians diabetes data. Passing names= labels the columns and
# makes read_csv treat every row of the file as data.
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:, 0:8]  # first eight columns are the input features
Y = array[:, 8]  # ninth column ('class') is the target
# Hold out 33% of the rows for evaluation; the fixed seed makes the split
# repeatable.
test_size = 0.33
seed = 7
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=test_size, random_state=seed)
# Pin the solver so the result does not depend on the library default, which
# changed from liblinear to lbfgs in scikit-learn 0.22.
model = LogisticRegression(solver='liblinear')
model.fit(X_train, Y_train)
predicted = model.predict(X_test)
# Text summary of per-class metrics, computed on the held-out rows only.
report = classification_report(Y_test, predicted)
print(report)

              precision    recall  f1-score   support

         0.0       0.77      0.87      0.82       162
         1.0       0.71      0.55      0.62        92

    accuracy                           0.76       254
   macro avg       0.74      0.71      0.72       254
weighted avg       0.75      0.76      0.75       254





In [12]:
# Regression Metrics
#  Mean Absolute Error.
#  Mean Squared Error.
#  R2.

# The Mean Absolute Error (or MAE) is the average of the absolute differences between predictions
# and actual values. It gives an idea of how wrong the predictions were. The measure gives an
# idea of the magnitude of the error, but no idea of the direction (e.g. over or under predicting).

# Cross Validation Regression MAE
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
# Load the housing data. Passing names= labels the columns and makes read_csv
# treat every row of the file as data.
filename = 'housing.csv'
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO',
         'B', 'LSTAT', 'MEDV']
# sep=r'\s+' splits on runs of whitespace; it is the documented equivalent of
# delim_whitespace=True, which newer pandas releases deprecate.
dataframe = read_csv(filename, sep=r'\s+', names=names)
array = dataframe.values
X = array[:, 0:13]  # first thirteen columns are the input features
Y = array[:, 13]  # last column ('MEDV') is the target
# 10 consecutive (unshuffled) folds. random_state is deliberately not passed:
# it has no effect without shuffle=True, and scikit-learn >= 0.24 raises a
# ValueError for that combination.
kfold = KFold(n_splits=10)
model = LinearRegression()
# The 'neg_mean_absolute_error' scorer returns the MAE negated so that larger
# is better; the printed mean is therefore negative.
scoring = 'neg_mean_absolute_error'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
# Report the mean and standard deviation of the per-fold scores.
print("MAE: %.3f (%.3f)" % (results.mean(), results.std()))
# A value of 0 indicates no error or perfect predictions.

MAE: -4.005 (2.084)


In [13]:
# Mean Squared Error
# The Mean Squared Error (or MSE) is much like the mean absolute error in that it provides a
# gross idea of the magnitude of error. Taking the square root of the mean squared error converts
# the units back to the original units of the output variable and can be meaningful for description
# and presentation. This is called the Root Mean Squared Error (or RMSE).

# Cross Validation Regression MSE
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
# Load the housing data. Passing names= labels the columns and makes read_csv
# treat every row of the file as data.
filename = 'housing.csv'
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO',
         'B', 'LSTAT', 'MEDV']
# sep=r'\s+' splits on runs of whitespace; it is the documented equivalent of
# delim_whitespace=True, which newer pandas releases deprecate.
dataframe = read_csv(filename, sep=r'\s+', names=names)
array = dataframe.values
X = array[:, 0:13]  # first thirteen columns are the input features
Y = array[:, 13]  # last column ('MEDV') is the target
num_folds = 10
# Consecutive (unshuffled) folds, now sized by num_folds (previously assigned
# but unused). random_state is deliberately not passed: it has no effect
# without shuffle=True, and scikit-learn >= 0.24 raises a ValueError for that
# combination.
kfold = KFold(n_splits=num_folds)
model = LinearRegression()
# The 'neg_mean_squared_error' scorer returns the MSE negated so that larger
# is better; the printed mean is therefore negative.
scoring = 'neg_mean_squared_error'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
# Report the mean and standard deviation of the per-fold scores.
print("MSE: %.3f (%.3f)" % (results.mean(), results.std()))

MSE: -34.705 (45.574)


In [0]:
# R2 Metric
# The R2 (or R Squared) metric provides an indication of the goodness of fit of a set of predictions
# to the actual values. In statistical literature this measure is called the coefficient of determination.
# This is a value between 0 and 1 for no-fit and perfect fit respectively (it can also be negative
# when the model fits worse than always predicting the mean). The example below
# provides a demonstration of calculating the mean R2 for a set of predictions.

In [14]:
# Cross Validation Regression R^2
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
# Load the housing data. Passing names= labels the columns and makes read_csv
# treat every row of the file as data.
filename = 'housing.csv'
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO',
         'B', 'LSTAT', 'MEDV']
# sep=r'\s+' splits on runs of whitespace; it is the documented equivalent of
# delim_whitespace=True, which newer pandas releases deprecate.
dataframe = read_csv(filename, sep=r'\s+', names=names)
array = dataframe.values
X = array[:, 0:13]  # first thirteen columns are the input features
Y = array[:, 13]  # last column ('MEDV') is the target
# 10 consecutive (unshuffled) folds. random_state is deliberately not passed:
# it has no effect without shuffle=True, and scikit-learn >= 0.24 raises a
# ValueError for that combination.
kfold = KFold(n_splits=10)
model = LinearRegression()
scoring = 'r2'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
# Report the mean and standard deviation of the per-fold R^2 values.
print("R^2: %.3f (%.3f)" % (results.mean(), results.std()))

R^2: 0.203 (0.595)
