In [3]:
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
import numpy as np
import pandas as pd

In [4]:
housing = fetch_california_housing()
housing

housing_df = pd.DataFrame(housing['data'],columns = housing['feature_names'])
housing_df['target'] = housing['target']
housing_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [5]:
np.random.seed(42)

X = housing_df.drop('target',axis=1)
y = housing_df['target']

X_train,X_test, y_train,y_test = train_test_split(X,y,test_size = 0.2)


mdl = RandomForestRegressor()
mdl.fit(X_train,y_train)

### 4.2.2 Regression model evaluation metrics

Link - https://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics

The ones we are going to cover are:
 1. R^2 (Pronounced r-squared) or coefficient of determination
 2. Mean absolute error (MAE)
 3. Mean squared error (MSE)

**R2 Score**

What R2 score does is, compare output to the mean of the target and value can be ranged fromm negative infinity  to 1


In [6]:
y_preds =  mdl.predict(X_test)
r2_score(y_true=y_test,y_pred=y_preds)

0.8066196804802649

**Mean Absolute Error (MAE)**

MAE  is the average of absolute differences between predictions and actual values.

It gives you an idea of How wrong your model predictionns are.

Data and MAE is in **same scale**

In [7]:
# MAE 

y_preds = mdl.predict(X_test)
mae = mean_absolute_error(y_test,y_preds)

mae

0.3265721842781009

**Mean Squared Error (MSE)**

MSE is the mean of the square of the errors between actual and predicted values. MSE in **squared scale**. So you can  use **Root mean squired error**

In [16]:
# MSE


y_preds = mdl.predict(X_test)
mse = mean_squared_error(y_test,y_preds)
mse

0.2534678520824551

### R2 score (using Cross validation)

In [13]:
np.random.seed(42)

cv_score =  cross_val_score(mdl,X,y,cv=5,scoring=None), # If scoring is None,Model's default 
#scoring evaluation metric is used. (in our case, it is r2 score)
cv_score

(array([0.51682354, 0.70280719, 0.74200859, 0.61659773, 0.68247339]),)

In [20]:
np.mean(cv_score)

0.6521420895559876

You can verify it here


In [24]:
np.random.seed(42)
r2 =  cross_val_score(mdl,X,y,cv=5,scoring='r2')
r2

array([0.51682354, 0.70280719, 0.74200859, 0.61659773, 0.68247339])

In [25]:
np.mean(r2)

0.6521420895559876

### MAE (using Cross validation)

In [22]:
np.random.seed(42)
mae =  cross_val_score(mdl,X,y,cv=5,scoring='neg_mean_absolute_error')
mae

array([-0.54255936, -0.40903449, -0.43716367, -0.46911343, -0.47319069])

In [23]:
np.mean(mae)

-0.4662123287693799

### MSE (using Cross validation)

In [26]:
np.random.seed(42)
mse =  cross_val_score(mdl,X,y,cv=5,scoring='neg_mean_squared_error')
mse

array([-0.51906307, -0.34788294, -0.37112854, -0.44980156, -0.4626866 ])

In [28]:
np.mean(mse)

-0.43011254261460774