<a href="https://colab.research.google.com/github/BartekPodgorski/Python/blob/main/data-science-training/06_scikit_learn/03_regression_metrics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

In [2]:
# Seed NumPy's legacy global RNG so that Restart & Run All reproduces the same
# synthetic data; every later np.random.randn call draws from this stream.
np.random.seed(42)

# Simulated ground truth: 50 draws from a normal distribution (mean 100, std 20).
y_true = 100 + 20 * np.random.randn(50)
y_true

array([126.53546339,  69.39251745, 111.90342018, 112.20656353,
       120.18040372,  92.83222322,  98.48806043, 112.81667453,
       122.19483739,  82.13765787, 107.47860864,  72.28065195,
        88.65404744,  94.83838493,  77.41756481, 110.1551305 ,
        82.47901391, 111.54097236,  47.45293537,  48.3280406 ,
       126.10763628,  70.8092179 , 104.86572787,  94.63599951,
        98.78580331, 107.46023845,  74.45600462, 138.869637  ,
       109.10957655, 108.25842741,  84.95104836, 111.24795034,
        93.3898378 , 124.85540302,  80.47791505,  59.35182316,
        72.41019863,  79.78216851,  80.39780854,  92.88738896,
        99.80353709,  81.13073779, 106.04848121, 103.31327496,
       118.60874016, 101.52396987,  92.0829811 , 114.85574097,
        96.69710898,  94.68739081])

In [3]:
# Simulated predictions: the ground truth plus Gaussian noise with std 10.
# The sample count (50) has to match the length of y_true for the sum to line up.
y_pred = y_true + 10 * np.random.randn(50)
y_pred

array([118.90709202,  66.3848043 , 112.99171826, 110.76740605,
       120.05920586,  93.25640031, 103.8680424 , 107.68996284,
        89.58327014,  61.90556401, 125.74296782,  66.08386553,
        95.21310462,  71.46619577,  78.14332235, 114.14230156,
        70.55933623, 115.14465443,  43.58158431,  68.08094474,
       128.5761433 ,  80.88751289,  94.72473608,  76.76506966,
       100.00833048, 114.02435336,  94.96829289, 158.77587771,
       104.70398171, 109.19193867,  94.00535184, 113.56355778,
        84.13616511, 124.55414804,  95.01075999,  55.45076751,
        63.85028196,  70.87517501,  61.20730835,  97.35175896,
       100.69105662,  88.22299747,  98.9180022 ,  88.42466094,
       112.88324692, 110.04809435,  72.6014743 , 109.62523063,
       104.71690675, 128.51701443])

In [4]:
# Put targets and predictions side by side so they can be inspected together.
results = pd.DataFrame({'y_true': y_true, 'y_pred': y_pred})
results.head()

Unnamed: 0,y_true,y_pred
0,126.535463,118.907092
1,69.392517,66.384804
2,111.90342,112.991718
3,112.206564,110.767406
4,120.180404,120.059206


In [5]:
# Residual per sample: positive when the prediction is below the target.
results['error'] = results['y_true'] - results['y_pred']
results.head()

Unnamed: 0,y_true,y_pred,error
0,126.535463,118.907092,7.628371
1,69.392517,66.384804,3.007713
2,111.90342,112.991718,-1.088298
3,112.206564,110.767406,1.439157
4,120.180404,120.059206,0.121198


In [6]:
def plot_regression_results(y_true, y_pred):
    """Show a scatter plot of predicted values against true values.

    Each sample is drawn as a marker at (y_true, y_pred). A straight line from
    (lo, lo) to (hi, hi), where lo/hi are the smallest/largest values found in
    either input, marks where perfect predictions would fall.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, same length as ``y_true``.

    Returns
    -------
    None
        The figure is rendered via ``fig.show()``.
    """
    results = pd.DataFrame({'y_true': y_true, 'y_pred': y_pred})

    # Named lo/hi rather than min/max so the Python builtins are not shadowed.
    lo = results[['y_true', 'y_pred']].min().min()
    hi = results[['y_true', 'y_pred']].max().max()

    points = go.Scatter(x=results['y_true'], y=results['y_pred'], mode='markers')
    # Diagonal y = x spanning the full data range: the "perfect prediction" line.
    identity_line = go.Scatter(x=[lo, hi], y=[lo, hi])
    layout = go.Layout(showlegend=False, width=800, height=500,
                       xaxis_title='y_true',
                       yaxis_title='y_pred',
                       title='Regression results')

    fig = go.Figure(data=[points, identity_line], layout=layout)
    fig.show()
# Plot the synthetic predictions against their targets.
plot_regression_results(y_true, y_pred)

In [8]:
# Regenerate a larger synthetic sample (1000 points) using the same
# distributions as before, then inspect how the residuals are distributed.
y_true = 100 + 20 * np.random.randn(1000)
y_pred = y_true + 10 * np.random.randn(1000)

results = (
    pd.DataFrame({'y_true': y_true, 'y_pred': y_pred})
    .assign(error=lambda frame: frame['y_true'] - frame['y_pred'])
)

px.histogram(results, x='error', nbins=50, width=800)

In [19]:
def mean_absolute_error(y_true, y_pred):
    """Return the mean absolute error (MAE) between targets and predictions.

    MAE = sum(|y_true - y_pred|) / n, where n = len(y_true).

    Parameters
    ----------
    y_true : array-like
        Observed target values (list, tuple, NumPy array, pandas Series, ...).
    y_pred : array-like
        Predicted values; compared with ``y_true`` position by position.

    Returns
    -------
    float
        The mean absolute error.

    Raises
    ------
    ValueError
        If ``y_true`` is empty.
    """
    # Coerce to float arrays so plain Python sequences work, not only ndarrays.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if y_true.size == 0:
        # Without this guard the division below would silently produce NaN.
        raise ValueError('y_true must contain at least one sample')
    return np.abs(y_true - y_pred).sum() / len(y_true)

# Evaluate MAE on the arrays currently bound to y_true / y_pred.
mean_absolute_error(y_true, y_pred)

8.273925147052026

In [18]:
# Cross-check against scikit-learn. Note that this import rebinds the name
# `mean_absolute_error`, replacing the hand-written function of the same name
# defined elsewhere in this notebook.
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_true, y_pred)

8.273925147052026

In [15]:
def mean_squared_error(y_true, y_pred):
    """Return the mean squared error (MSE) between targets and predictions.

    MSE = sum((y_true - y_pred)^2) / n, where n = len(y_true).

    Parameters
    ----------
    y_true : array-like
        Observed target values (list, tuple, NumPy array, pandas Series, ...).
    y_pred : array-like
        Predicted values; compared with ``y_true`` position by position.

    Returns
    -------
    float
        The mean squared error.

    Raises
    ------
    ValueError
        If ``y_true`` is empty.
    """
    # Coerce to float arrays so plain Python sequences work, not only ndarrays.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if y_true.size == 0:
        # Without this guard the division below would silently produce NaN.
        raise ValueError('y_true must contain at least one sample')
    residuals = y_true - y_pred
    return (residuals ** 2).sum() / len(y_true)

# Evaluate MSE on the arrays currently bound to y_true / y_pred.
mean_squared_error(y_true, y_pred)

108.49512559575727

In [16]:
# Cross-check against scikit-learn. Note that this import rebinds the name
# `mean_squared_error`, replacing the hand-written function of the same name
# defined elsewhere in this notebook.
from sklearn.metrics import mean_squared_error

mean_squared_error(y_true, y_pred)

108.49512559575727

In [20]:
def root_mean_squared_error(y_true, y_pred):
    """Return the root mean squared error (RMSE) between targets and predictions.

    RMSE = sqrt(sum((y_true - y_pred)^2) / n), where n = len(y_true). Unlike
    MSE, the result is expressed in the same units as the target.

    Parameters
    ----------
    y_true : array-like
        Observed target values (list, tuple, NumPy array, pandas Series, ...).
    y_pred : array-like
        Predicted values; compared with ``y_true`` position by position.

    Returns
    -------
    float
        The root mean squared error.

    Raises
    ------
    ValueError
        If ``y_true`` is empty.
    """
    # Coerce to float arrays so plain Python sequences work, not only ndarrays.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if y_true.size == 0:
        # Without this guard the division below would silently produce NaN.
        raise ValueError('y_true must contain at least one sample')
    mean_squared = ((y_true - y_pred) ** 2).sum() / len(y_true)
    return np.sqrt(mean_squared)

# Evaluate RMSE on the arrays currently bound to y_true / y_pred.
root_mean_squared_error(y_true, y_pred)

10.416099346480777

In [21]:
# RMSE obtained as the square root of MSE; this uses whichever
# `mean_squared_error` (hand-written or scikit-learn) is currently bound.
np.sqrt(mean_squared_error(y_true, y_pred))

10.416099346480777

In [24]:
def max_error(y_true, y_pred):
    """Return the largest absolute difference between targets and predictions.

    max_error = max(|y_true - y_pred|)

    Parameters
    ----------
    y_true : array-like
        Observed target values (list, tuple, NumPy array, pandas Series, ...).
    y_pred : array-like
        Predicted values; compared with ``y_true`` position by position.

    Returns
    -------
    float
        The worst-case absolute error.

    Raises
    ------
    ValueError
        If ``y_true`` is empty.
    """
    # Coerce to float arrays so plain Python sequences work, not only ndarrays.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if y_true.size == 0:
        # A maximum over zero elements is undefined; fail with a clear message.
        raise ValueError('y_true must contain at least one sample')
    return np.abs(y_true - y_pred).max()
# Evaluate the worst-case error on the arrays currently bound to y_true / y_pred.
max_error(y_true, y_pred)

38.732567783221455

In [25]:
# Cross-check against scikit-learn. Note that this import rebinds the name
# `max_error`, replacing the hand-written function of the same name defined
# elsewhere in this notebook.
from sklearn.metrics import max_error

max_error(y_true, y_pred)

38.732567783221455

In [26]:
def r2_score(y_true, y_pred):
    """Return the coefficient of determination (R^2).

    R^2 = 1 - SS_res / SS_tot, with
    SS_res = sum((y_true - y_pred)^2) and
    SS_tot = sum((y_true - mean(y_true))^2).

    When ``y_true`` is constant, SS_tot is zero and the ratio is undefined.
    NumPy does not raise ``ZeroDivisionError`` in that case (it yields inf/NaN
    with a RuntimeWarning), so the degenerate case is handled explicitly,
    following scikit-learn's default convention: 1.0 for perfect predictions,
    0.0 otherwise.

    Parameters
    ----------
    y_true : array-like
        Observed target values (list, tuple, NumPy array, pandas Series, ...).
    y_pred : array-like
        Predicted values; compared with ``y_true`` position by position.

    Returns
    -------
    float
        The R^2 score (at most 1.0; can be negative for poor fits).

    Raises
    ------
    ValueError
        If ``y_true`` is empty.
    """
    # Coerce to float arrays so plain Python sequences work, not only ndarrays.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if y_true.size == 0:
        raise ValueError('y_true must contain at least one sample')

    numerator = ((y_true - y_pred) ** 2).sum()
    denominator = ((y_true - y_true.mean()) ** 2).sum()

    if denominator == 0:
        # Constant target: avoid returning -inf / NaN from a division by zero.
        return 1.0 if numerator == 0 else 0.0
    return 1 - numerator / denominator

In [27]:
# Evaluate R^2 on the arrays currently bound to y_true / y_pred.
r2_score(y_true, y_pred)

0.7344457755155217