To evaluate a regression model we have three metrics:
1. R^2 (R-squared, or coefficient of determination)
2. Mean Absolute Error (MAE)
3. Mean Squared Error (MSE)

## the R-squared metric

In [1]:
# Load the Hungarian chickenpox dataset from the local CSV file
# and preview its first five rows.
import pandas as pd

chickenpox = pd.read_csv(filepath_or_buffer='data/hungary_chickenpox.csv')
chickenpox.head(5)

Unnamed: 0,Date,BUDAPEST,BARANYA,BACS,BEKES,BORSOD,CSONGRAD,FEJER,GYOR,HAJDU,...,JASZ,KOMAROM,NOGRAD,PEST,SOMOGY,SZABOLCS,TOLNA,VAS,VESZPREM,ZALA
0,03/01/2005,168,79,30,173,169,42,136,120,162,...,130,57,2,178,66,64,11,29,87,68
1,10/01/2005,157,60,30,92,200,53,51,70,84,...,80,50,29,141,48,29,58,53,68,26
2,17/01/2005,96,44,31,86,93,30,93,84,191,...,64,46,4,157,33,33,24,18,62,44
3,24/01/2005,163,49,43,126,46,39,52,114,107,...,63,54,14,107,66,50,25,21,43,31
4,31/01/2005,122,78,53,87,103,34,95,131,172,...,61,49,11,124,63,56,7,47,85,60


In [2]:
# Create a 'total' column holding the number of cases summed across all
# county columns for each row (one row per reporting date).
#
# * numeric_only=True skips the string 'Date' column explicitly; recent
#   pandas versions no longer drop non-numeric columns silently in a
#   row-wise sum and raise a TypeError instead.
# * Dropping an already existing 'total' column first (errors='ignore'
#   makes this a no-op on the first run) keeps the cell idempotent, so
#   re-running it does not add the previous total into the new one.
chickenpox['total'] = (
    chickenpox
    .drop(columns='total', errors='ignore')
    .sum(axis=1, numeric_only=True)
)
chickenpox.head()

Unnamed: 0,Date,BUDAPEST,BARANYA,BACS,BEKES,BORSOD,CSONGRAD,FEJER,GYOR,HAJDU,...,KOMAROM,NOGRAD,PEST,SOMOGY,SZABOLCS,TOLNA,VAS,VESZPREM,ZALA,total
0,03/01/2005,168,79,30,173,169,42,136,120,162,...,57,2,178,66,64,11,29,87,68,1807
1,10/01/2005,157,60,30,92,200,53,51,70,84,...,50,29,141,48,29,58,53,68,26,1407
2,17/01/2005,96,44,31,86,93,30,93,84,191,...,46,4,157,33,33,24,18,62,44,1284
3,24/01/2005,163,49,43,126,46,39,52,114,107,...,54,14,107,66,50,25,21,43,31,1255
4,31/01/2005,122,78,53,87,103,34,95,131,172,...,49,11,124,63,56,7,47,85,60,1478


In [3]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Seed NumPy's global random state: neither the split nor the forest is
# given its own random_state below, so both draw from this global state.
np.random.seed(10)

# Features: every column except the target ('total') and the 'Date' strings.
# NOTE(review): the remaining feature columns are the per-county counts that
# 'total' was summed from, so the target is fully determined by the features.
X = chickenpox.drop(columns=['total', 'Date'])
# Target: the total number of cases per row.
y = chickenpox['total']

# Hold out 15% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)

# fit() returns the fitted estimator itself, so it can be assigned directly.
model = RandomForestRegressor().fit(X_train, y_train)
# score() on a regressor reports R^2 for the given data.
model.score(X_test, y_test)

0.952292070918584

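R-squared compares your model's predictions with the true test labels: it measures the proportion of the variance in the target that the model explains. A perfect model scores 1.0, a model that always predicts the mean of the target scores 0.0, and a model that does worse than that scores below 0. For regressors, `model.score()` returns R-squared by default.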

In [6]:
# predicting the target for the held-out test features
y_preds = model.predict(X_test)

In [12]:
# importing the R-squared score function
from sklearn.metrics import r2_score

# Predict the target for the held-out features, then score those
# predictions against the true test labels.
y_preds = model.predict(X_test)
r2_score(y_true=y_test, y_pred=y_preds)

0.952292070918584

In [13]:
# Sanity check: scoring the test labels against themselves means every
# "prediction" is exact, so the result is the perfect score of 1.0 (100%).
r2_score(y_true=y_test, y_pred=y_test)

1.0

## the mean absolute error (MAE)

In [14]:
# peek at the first five predicted totals
y_preds[:5]

array([ 312.18, 1584.71,  425.48,  849.62, 1465.71])

In [16]:
# peek at the first five true totals from the test set
y_test.head(5)

252     377
68     2003
519     286
129     748
24     1571
Name: total, dtype: int64

In [27]:
# Side-by-side table of the true values, the predictions and the absolute
# difference between them. Despite its name, `mae` holds this per-sample
# table; the averaged metric is computed from its 'difference' column.
mae = pd.DataFrame(
    {
        'test values': y_test,
        'predicted values': y_preds,
        'difference': np.abs(y_test - y_preds),
    }
)
mae.head()

Unnamed: 0,test values,predicted values,difference
252,377,312.18,64.82
68,2003,1584.71,418.29
519,286,425.48,139.48
129,748,849.62,101.62
24,1571,1465.71,105.29


In [31]:
# MAE by hand: the mean of the per-sample absolute differences
mae['difference'].mean()

74.91556962025318

In [32]:
# The same mean absolute error, computed with scikit-learn's helper.
from sklearn.metrics import mean_absolute_error

mean_absolute_error(y_true=y_test, y_pred=y_preds)

74.91556962025318

In [39]:
# Mean squared error (MSE) by hand: square each per-sample difference,
# then take the mean of the squares.
squared = mae['difference'] ** 2
squared.mean()

13788.70591772152

In [40]:
# The same mean squared error, computed with scikit-learn's helper.
from sklearn.metrics import mean_squared_error

mean_squared_error(y_true=y_test, y_pred=y_preds)

13788.70591772152

In [45]:
from sklearn.model_selection import cross_val_score

# Re-seed NumPy's global random state before cross-validating, since the
# model is not given its own random_state.
np.random.seed(10)
# 5-fold cross-validation over the full dataset, scored with R^2;
# the result holds one score per fold.
cv_r2 = cross_val_score(estimator=model, X=X, y=y, scoring='r2', cv=5)

In [46]:
# average the per-fold R^2 scores into a single cross-validated score
cv_r2.mean()

0.9299662823728638