In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import minimize, curve_fit

# Goodness of fit

Let's have a look at these data points (this time with uncertainties):

In [None]:
# Measurements: abscissa values, observed values, and the uncertainties on y.
x_data = np.arange(1.0, 10.0)  # 1.0, 2.0, ..., 9.0
y_data = np.array([2.7, 3.9, 5.5, 5.8, 6.5, 6.3, 6.7, 6.2, 6.0])
yerr_data = np.array([0.3, 0.5, 0.7, 0.6, 0.4, 0.3, 0.7, 0.8, 0.5])

In [None]:
# Draw the measurements as points with vertical error bars.
plt.errorbar(x=x_data, y=y_data, yerr=yerr_data, fmt="ko")

And fit a line again:

In [None]:
def f_linear(x, a, b):
    """Straight-line model: slope ``a`` times ``x`` plus intercept ``b``."""
    scaled = a * x
    return scaled + b

In [None]:
# Weighted fit of the straight line; yerr_data is passed as sigma and, with
# absolute_sigma=True, is used as absolute uncertainties on y_data.
pfit, pcov = curve_fit(
    f_linear,
    x_data,
    y_data,
    sigma=yerr_data,
    absolute_sigma=True,
)

In [None]:
# Measurements with error bars, overlaid with the fitted straight line.
plt.errorbar(x=x_data, y=y_data, yerr=yerr_data, fmt="ko")
plt.plot(x_data, f_linear(x_data, pfit[0], pfit[1]))

That doesn't look very good. How can we quantify the quality of this fit? We look at our $\chi^2$ statistic:

In [None]:
def f_chi2(f, params, x, y, yerr):
    """Return the chi-squared statistic of model ``f`` with respect to the data.

    Evaluates ``sum(((f(x, *params) - y) / yerr) ** 2)``.

    Parameters
    ----------
    f : callable
        Model function, called as ``f(x, *params)``.
    params : sequence
        Model parameters, unpacked after ``x`` in the call to ``f``.
    x : scalar or array-like
        Handed to ``f`` unchanged.
    y : scalar or array-like
        Observed values, broadcast against the model output.
    yerr : scalar or array-like
        Uncertainties on ``y`` used to normalise the residuals.

    Returns
    -------
    numpy floating scalar
        The sum of the squared normalised residuals.
    """
    # np.asarray / np.sum make the function work for a single data point
    # (Python scalars have no ``.sum()``) and for models or data given as
    # plain lists, not only for numpy arrays.
    model = np.asarray(f(x, *params))
    pulls = (model - np.asarray(y)) / np.asarray(yerr)
    return np.sum(pulls ** 2)

In [None]:
# chi-squared of the straight-line fit
f_chi2(f=f_linear, params=pfit, x=x_data, y=y_data, yerr=yerr_data)

Reminder: As a rule of thumb, the number of degrees of freedom "ndf" (number of data points - number of parameters) should be roughly equal to the $\chi^2$ statistic.

In [None]:
# ndf = number of data points minus number of fitted parameters
x_data.size - pfit.size

So this rule of thumb already indicates that this is not a very good fit. We can be even more quantitative. The rule comes from the fact that the $\chi^2$ statistic follows a [Chi-squared distribution](https://en.wikipedia.org/wiki/Chi-squared_distribution) (which has ndf as its expectation value) if we assume the data points follow a normal distribution. Using this we can calculate a p-value that answers the question "how often would we get a value of $\chi^2$ at least this high in repeated experiments, given that our function describes the data?".

SciPy provides implementations of the common probability distributions, among them the chi-squared distribution. What we want to calculate is

$$p = \int\limits_{\chi^2_\mathrm{min}}^{\infty}f(\chi^2, \mathrm{ndf})\mathrm{d}\chi^2 = 1 - F(\chi^2_\mathrm{min}, \mathrm{ndf})$$

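where $F(\chi^2_\mathrm{min}, \mathrm{ndf})$ is the cumulative distribution function of a chi-squared distribution, which we can calculate using `scipy.stats.chi2.cdf`. (SciPy also provides the survival function `scipy.stats.chi2.sf`, which evaluates $1 - F$ directly and stays accurate for very small p-values.)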

In [None]:
import scipy.stats

In [None]:
def chi2_pvalue(chi2, ndf):
    """Return the p-value of a chi-squared statistic.

    This is the upper-tail probability ``1 - F(chi2, ndf)`` of a chi-squared
    distribution with ``ndf`` degrees of freedom, i.e. the probability of
    observing a value at least as large as ``chi2``.

    Parameters
    ----------
    chi2 : float or array-like
        Observed value(s) of the chi-squared statistic.
    ndf : int or array-like
        Number of degrees of freedom.

    Returns
    -------
    float or ndarray
        The p-value(s), between 0 and 1.
    """
    # Use the survival function instead of ``1 - cdf``: for bad fits the cdf
    # rounds to exactly 1.0 and the subtraction would return 0.0, losing the
    # (tiny but non-zero) p-value to floating-point cancellation.
    return scipy.stats.chi2.sf(chi2, ndf)

In [None]:
# p-value of the straight-line fit
chi2_pvalue(
    chi2=f_chi2(f_linear, pfit, x_data, y_data, yerr_data),
    ndf=len(x_data) - len(pfit),
)

which is rather low, again indicating a bad fit!

<div class="alert alert-block alert-success">
    <b>Exercise</b> Fit a quadratic function to the data. What is $\chi^2 / \mathrm{ndf}$ now? What is the p-value?
</div>

In [None]:
def f_quadratic(x, a, b, c):
    """Second-order polynomial model ``a * x**2 + b * x + c``."""
    quadratic_term = a * x ** 2
    linear_term = b * x
    return quadratic_term + linear_term + c

In [None]:
# Weighted fit of the quadratic model, using yerr_data as absolute uncertainties.
pfit_q, pcov_q = curve_fit(
    f_quadratic,
    x_data,
    y_data,
    sigma=yerr_data,
    absolute_sigma=True,
)

In [None]:
# Measurements with error bars, overlaid with the fitted parabola evaluated
# on a fine grid of 100 points between x = 1 and x = 9.
plt.errorbar(x=x_data, y=y_data, yerr=yerr_data, fmt="ko")
x = np.linspace(start=1, stop=9, num=100)
plt.plot(x, f_quadratic(x, pfit_q[0], pfit_q[1], pfit_q[2]))

In [None]:
# chi-squared, degrees of freedom and reduced chi-squared of the quadratic fit.
chi2 = f_chi2(f_quadratic, pfit_q, x_data, y_data, yerr_data)
# Take the parameter count from the fit result instead of hard-coding 3, so
# ndf stays correct if the model changes.
ndf = len(x_data) - len(pfit_q)
chi2, ndf, chi2 / ndf

In [None]:
# p-value of the quadratic fit
chi2_pvalue(chi2=chi2, ndf=ndf)

In [None]:
def BoxMuller(u1, u2):
    """Box–Muller transform (basic form).

    Maps the two inputs ``u1`` and ``u2`` to the pair
    ``sqrt(-2 ln u1) * cos(2 pi u2)`` and ``sqrt(-2 ln u1) * sin(2 pi u2)``.
    Two numbers (or arrays) go in, two come out.

    Note that ``u1 == 0`` leads to ``ln(0)`` and therefore to non-finite output.
    """
    radius = np.sqrt(-2 * np.log(u1))
    angle = 2 * np.pi * u2
    return radius * np.cos(angle), radius * np.sin(angle)

In [None]:
# Draw the uniform inputs from a seeded generator so that "Restart & Run All"
# reproduces the same histogram and fit results.
rng = np.random.default_rng(42)
u1 = rng.random(10000)
u2 = rng.random(10000)
# BoxMuller returns two arrays; merge them into a single sample.
g = np.concatenate(BoxMuller(u1, u2))
hist, edges = np.histogram(g, bins=20)
centers = 0.5 * (edges[:-1] + edges[1:])
# Empty bins would get an uncertainty of sqrt(0) = 0, which makes the weighted
# residuals in the fit infinite. Drop them so every remaining bin has a
# non-zero error.
filled = hist > 0
hist, centers = hist[filled], centers[filled]
errors = np.sqrt(hist)
plt.errorbar(centers, hist, yerr=errors, fmt="ko")

In [None]:
def gaussian(x, a, sigma, mu):
    """Gaussian with mean ``mu`` and width ``sigma``, scaled by the factor ``a``.

    Returns ``a / (sqrt(2 pi) * sigma) * exp(-0.5 * ((x - mu) / sigma) ** 2)``.
    """
    amplitude = a / (np.sqrt(2 * np.pi) * sigma)
    z = (x - mu) / sigma
    return amplitude * np.exp(-0.5 * z ** 2)

In [None]:
# Weighted fit of the Gaussian to the histogram; the start values p0 are given
# in the order of the model parameters (a, sigma, mu).
pfit, pcov = curve_fit(
    gaussian,
    centers,
    hist,
    p0=(1000, 1, 0),
    sigma=errors,
    absolute_sigma=True,
)

In [None]:
# Histogram contents with error bars, overlaid with the fitted Gaussian
# evaluated on a fine grid of 100 points between -5 and 5.
plt.errorbar(x=centers, y=hist, yerr=errors, fmt="ko")
x = np.linspace(start=-5, stop=5, num=100)
plt.plot(x, gaussian(x, pfit[0], pfit[1], pfit[2]))

In [None]:
# Reduced chi-squared of the Gaussian fit. Reuse the f_chi2 helper rather than
# repeating its formula, and take the parameter count from the fit result
# instead of hard-coding 3.
ndf = len(centers) - len(pfit)
chi2 = f_chi2(gaussian, pfit, centers, hist, errors)
chi2 / ndf

In [None]:
# p-value of the Gaussian fit to the histogram
chi2_pvalue(chi2=chi2, ndf=ndf)

In [None]:
# Fit a normal distribution directly to the unbinned sample `g` with scipy;
# the result can be compared with the sample mean / standard deviation
# computed in the next cell.
scipy.stats.norm.fit(g)

In [None]:
# Mean and standard deviation of the generated sample
np.mean(g), np.std(g)