### Evaluating a Model with Cross Validation and Scoring Parameter

- Reference: Scikit-Learn documentation, section 3.3 "Metrics and Scoring"


#### Cross Validation

##### 1. Classification Problem

In [2]:
# Imports
import numpy as np
import pandas as pd


In [3]:
# Load the heart-disease dataset from the local data directory
heart_disease = pd.read_csv("data/heart-disease.csv")

# Preview the first rows so a fresh run actually displays the table
heart_disease.head()


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier

# Seed NumPy's global RNG (the classifier below is given no random_state of its own)
np.random.seed(42)

# Label is the "target" column; features are every other column
y = heart_disease["target"]
X = heart_disease.drop(columns="target")

# Random forest classifier built from 100 trees
clf = RandomForestClassifier(n_estimators=100)


In [5]:
# Re-seed NumPy's global RNG before cross-validating
np.random.seed(42)

# 5-fold cross-validation with scoring=None: the estimator's own default
# scoring metric is used (accuracy for classification models)
cv_acc = cross_val_score(estimator=clf, X=X, y=y, scoring=None, cv=5)
cv_acc

array([0.81967213, 0.90163934, 0.83606557, 0.78333333, 0.78333333])

In [6]:
# Mean accuracy over the folds, scaled to a percentage and shown with two decimals
mean_acc_pct = cv_acc.mean() * 100
print(f"The cross-validated accuracy is: {mean_acc_pct:.2f}")

The cross-validated accuracy is: 82.48


In [7]:
# Same run as before, but naming the metric explicitly instead of scoring=None
np.random.seed(42)

cv_acc = cross_val_score(estimator=clf, X=X, y=y, scoring="accuracy", cv=5)
cv_acc

array([0.81967213, 0.90163934, 0.83606557, 0.78333333, 0.78333333])

In [8]:
# Mean accuracy over the folds, scaled to a percentage and shown with two decimals
mean_acc_pct = cv_acc.mean() * 100
print(f"The cross-validated accuracy is: {mean_acc_pct:.2f}")

The cross-validated accuracy is: 82.48


In [10]:
# Precision: same 5-fold cross-validation, now scored with "precision"
# instead of accuracy
np.random.seed(42)
cv_precision = cross_val_score(estimator=clf, X=X, y=y, scoring="precision", cv=5)
cv_precision

array([0.82352941, 0.93548387, 0.84848485, 0.79411765, 0.76315789])

In [12]:
# Average the per-fold precision scores and report the mean
print(f"The cross-validated precision is: {cv_precision.mean()}")

The cross-validated precision is: 0.8329547346025924


In [13]:
# Recall: same 5-fold cross-validation, scored with "recall"
np.random.seed(42)
cv_recall = cross_val_score(estimator=clf, X=X, y=y, scoring="recall", cv=5)
cv_recall  # the recorded recall scores average slightly higher than the precision scores

array([0.84848485, 0.87878788, 0.84848485, 0.81818182, 0.87878788])

In [15]:
# Average the per-fold recall scores and report the mean
print(f"The cross-validated recall is: {cv_recall.mean()}")

The cross-validated recall is: 0.8545454545454545


In [16]:
# F1: same 5-fold cross-validation, scored with "f1"
np.random.seed(42)
cv_f1 = cross_val_score(estimator=clf, X=X, y=y, scoring="f1", cv=5)
cv_f1

array([0.8358209 , 0.90625   , 0.84848485, 0.80597015, 0.81690141])

In [17]:
# Average the per-fold F1 scores and report the mean
print(f"The cross-validated F1 is: {cv_f1.mean()}")

The cross-validated F1 is: 0.8426854603423346


##### 2. Scoring Parameter used in Regression Problem

In [19]:
# Importing data from Scikit-Learn
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()

# Build the feature frame first; at this point it has no target column yet
housing_df = pd.DataFrame(data=housing["data"], columns=housing["feature_names"])

# Append the label as a "target" column so features and label share one frame
housing_df["target"] = housing["target"]
housing_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [20]:
from sklearn.ensemble import RandomForestRegressor

# Seed NumPy's global RNG (the regressor below is given no random_state of its own)
np.random.seed(42)

# Label is the "target" column; features are every other column
y = housing_df["target"]
X = housing_df.drop(columns="target")

# Random forest regressor with library-default settings
model = RandomForestRegressor()


In [28]:
np.random.seed(42)

# scoring=None: the regressor's default scoring metric is used, which is the
# coefficient of determination (R^2). cv=3 was chosen to keep the run time down.
cv_r2 = cross_val_score(estimator=model, X=X, y=y, scoring=None, cv=3)
cv_r2

array([0.62159677, 0.72076221, 0.62136792])

In [29]:
# Average the per-fold R^2 scores and report the mean
print(f"The cross validated R2 is: {cv_r2.mean()}")

The cross validated R2 is: 0.6545756342466266


In [30]:
# Explained variance: 3-fold cross-validation scored with "explained_variance"
np.random.seed(42)
cv_exp_var = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring="explained_variance",
    cv=3,
)
cv_exp_var

array([0.64617128, 0.72763381, 0.65918398])

In [31]:
# Average the per-fold explained-variance scores and report the mean
mean_exp_var = cv_exp_var.mean()
print(f"The cross-validated explained_variance is: {mean_exp_var}")

The cross-validated explained_variance is: 0.6776630222189493


In [39]:
# Mean squared error: 5-fold cross-validation scored with
# "neg_mean_squared_error" (the recorded scores are all negative)
np.random.seed(42)
cv_mse = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring="neg_mean_squared_error",
    cv=5,
)
cv_mse

array([-0.51906307, -0.34788294, -0.37112854, -0.44980156, -0.4626866 ])

In [33]:
# Report the mean of the per-fold scores from the "neg_mean_squared_error" scorer.
# The value is the negated MSE (higher is better), so the label says "negative"
# and names the metric in full.
print(f"The cross-validated negative mean squared error is: {np.mean(cv_mse)}")

The cross-validate mean absolute is: -0.4613720134531419


In [38]:
# Mean absolute error: 5-fold cross-validation scored with
# "neg_mean_absolute_error" (the recorded scores are all negative)
np.random.seed(42)
cv_mae = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring="neg_mean_absolute_error",
    cv=5,
)
cv_mae

array([-0.54255936, -0.40903449, -0.43716367, -0.46911343, -0.47319069])

In [37]:
# Report the mean of the per-fold scores from the "neg_mean_absolute_error" scorer.
# The previous label said "mse" although the value printed is computed from cv_mae;
# the value is the negated MAE (higher is better), so the label says "negative".
print(f"The cross-validated negative mean absolute error is: {np.mean(cv_mae)}")

The cross-validate mse is: -0.4835493810852714


#### Further reading: Scikit-Learn documentation, section 3.3.1 "The Scoring Parameter"