In [None]:
#import numpy as np
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from numpy.polynomial.polynomial import polyfit
from sklearn import linear_model

# Noisy Curve

In [None]:
def noisy_curve(coeffs, x= None, interval=[-2,2], noise=None, nsamples=100):
    """Evaluate the polynomial sum_i coeffs[i] * x**i, optionally with Gaussian noise on y.

    Parameters
    ----------
    coeffs : sequence of numbers
        Polynomial coefficients in increasing order of degree
        (coeffs[0] is the constant term).
    x : array-like supporting ``**``, optional
        Abscissas where the polynomial is evaluated. When None, ``nsamples``
        points are drawn uniformly in ``interval``.
    interval : pair of numbers
        (low, high) bounds of the uniform draw; only used when ``x`` is None.
        The default list is only read, never mutated.
    noise : pair (mu, sigma), optional
        Mean and standard deviation of the Gaussian noise added to y.
        When None, no noise is added.
    nsamples : int
        Number of points drawn when ``x`` is None.

    Returns
    -------
    (x, y) : the abscissas and the (possibly noisy) polynomial values.
    """
    if x is None:
        x = np.random.uniform(low=interval[0], high=interval[1], size=nsamples)
    if noise is None:
        vnoise = 0
    else:
        mu, sigma = noise
        # Size the noise after x (not nsamples) so that an explicitly supplied x
        # of any length can be combined with noise.
        vnoise = np.random.normal(loc=mu, scale=sigma, size=np.shape(x))
    y = 0
    for power, coeff in enumerate(coeffs):
        y = y + coeff * x**power
    return x, y+vnoise

In [None]:
# Seed NumPy's global RNG so that the sampled data, and every stochastic step
# that follows in a Restart & Run All, is reproducible.
np.random.seed(0)

# Ground-truth polynomial y = 2 + x + x**3 (coefficients in increasing degree)
coeffs = [2, 1., 0,1]
# 100 points drawn uniformly in [-3, 1.5], with Gaussian noise (mu=0, sigma=2) on y
x, y = noisy_curve(coeffs, interval = [-3,1.5], noise=[0.,2])

In [None]:
#| label: nparaboloa
#| fig-cap: Cubic polynomial with Gaussian noise
# Noisy samples as markers, with the noise-free polynomial drawn on top.
fig = go.Figure()
fig.add_scatter(x=x, y=y, mode="markers", name='data',
                hovertemplate='x:%{x:.2f}'
                +'<br>y:%{y:.2f}</br><extra></extra>')
# Regular grid over the sampled range; with noise=None this is the exact curve
x1 = np.linspace(x.min(),x.max(),num=50)
x1, y1 = noisy_curve(coeffs,x=x1)
fig.add_scatter(x=x1, y=y1, mode="lines",name='Ground Truth')
fig.update_layout(width=800,height=400)
fig.show()

# Loss Function: Mean Squared Error

In [None]:
def MSE(x,y,coeffs):
    """Mean squared error between y and the polynomial `coeffs` evaluated at x."""
    # noisy_curve without noise returns (x, polynomial values); only the values matter
    _, y_model = noisy_curve(coeffs, x=x)
    residual = y - y_model
    return np.mean(residual**2)

# Stochastic Gradient Descent

In [None]:
def gradMSE(x,y,coeffs):
    """Gradient of the mean squared error with respect to each polynomial coefficient.

    Component i is mean(-2 * (y - y_model) * x**i), where y_model is the
    polynomial `coeffs` evaluated at x. Returns an array of len(coeffs) values.
    """
    _, y_model = noisy_curve(coeffs, x=x)
    # derivative of the squared residual with respect to the model value
    dloss = -2*(y - y_model)
    return np.array([np.mean(dloss * x**power) for power in range(len(coeffs))])

def sgd_epoch(x, y, coeffs0, bs=25, eta=1E-3):
    """Run one epoch of mini-batch gradient descent on the polynomial coefficients.

    The sample indices are shuffled and split into batches of `bs` points
    (the last batch holds the remainder when x.size is not a multiple of bs).
    Each batch produces one update: coeffs <- coeffs - eta * gradMSE(batch).

    Parameters
    ----------
    x, y : 1-D arrays of the same size (the whole dataset).
    coeffs0 : starting coefficients, in increasing order of degree.
    bs : batch size (positive integer).
    eta : learning rate.

    Returns
    -------
    vec : array of n+1 losses, where vec[k] is the MSE on the full dataset of vec_c[k].
    vec_c : array (n+1, len(coeffs0)); row 0 is coeffs0 and row k holds the
        coefficients after k updates. n is the number of batches.

    Raises
    ------
    ValueError if bs is smaller than 1.
    """
    if bs < 1:
        raise ValueError("bs must be a positive integer")

    mask = np.arange(x.size)
    np.random.shuffle(mask)
    # number of batches = ceil(x.size / bs): a trailing partial batch is still used
    n = -(-x.size // bs)

    vec, vec_c = np.zeros(n+1), np.zeros((n+1,len(coeffs0)))
    vec[0], vec_c[0,:] = MSE(x,y,coeffs0), coeffs0

    for i in range(n):
        # slicing past the end simply yields the (shorter) remainder batch
        m = mask[i*bs:(i+1)*bs]
        gg = gradMSE(x[m], y[m], vec_c[i,:])
        vec_c[i+1,:] = vec_c[i,:]-eta*gg
        # record the loss of the *updated* coefficients so vec and vec_c stay aligned
        vec[i+1] = MSE(x,y,vec_c[i+1,:])
    return vec, vec_c

def sgd(x, y, coeffs0, niter=int(1E3), bs=25, eta=1E-3):
    """Run `niter` epochs of mini-batch gradient descent and return the full history.

    Each epoch is delegated to sgd_epoch, starting from the last coefficients of
    the previous one. The last entry of an epoch duplicates the first entry of
    the next, so it is dropped for every epoch but the final one.

    Parameters
    ----------
    x, y : 1-D arrays of the same size (the whole dataset).
    coeffs0 : starting coefficients, in increasing order of degree.
    niter : number of epochs.
    bs : batch size forwarded to sgd_epoch.
    eta : learning rate forwarded to sgd_epoch.

    Returns
    -------
    vec : list of losses, one per recorded step.
    vec_c : array with one row of coefficients per recorded step (same length as vec).
    """
    if niter <= 0:
        # No epoch requested: the history is just the starting point.
        return [float(MSE(x, y, coeffs0))], np.array([coeffs0], dtype=float)

    cc = coeffs0
    losses, chunks = [], []
    for i in range(niter):
        v, vec_cn = sgd_epoch(x, y, cc, bs=bs, eta=eta)
        if i < niter-1:
            # the final state of this epoch reappears as the start of the next one
            losses.extend(v[:-1].tolist())
            chunks.append(vec_cn[:-1,:])
        else:
            losses.extend(v.tolist())
            chunks.append(vec_cn)
        cc = vec_cn[-1,:]
    # Concatenate once at the end instead of rebuilding the history every epoch
    return losses, np.concatenate(chunks, axis=0)

In [None]:
# Start from 5 small random coefficients (Gaussian, standard deviation 1e-2)
coeffs0 = np.random.normal(size=5, scale=1E-2)
# 10^4 epochs of mini-batch gradient descent with learning rate 1e-5
vec, vec_c = sgd(
    x, y, coeffs0,
    niter=int(1E4),
    eta=1E-5,
)

In [None]:
# Loss recorded by sgd at every update, with explicit title and axis labels so
# the figure can be read on its own.
px.scatter(y=vec).update_layout(title_text='SGD loss history',
                                xaxis_title='update step',
                                yaxis_title='MSE')

In [None]:



# Noisy samples together with the polynomial given by the last SGD coefficients.
fig = go.Figure()
fig.add_scatter(x=x, y=y, mode="markers", name='data',
                hovertemplate='x:%{x:.2f}'
                +'<br>y:%{y:.2f}</br><extra></extra>')
x1 = np.linspace(x.min(),x.max(),num=50)
# vec_c[-1,:] holds the coefficients after the final update
x1, y1 = noisy_curve(vec_c[-1,:],x=x1)
# This curve is the fitted model, not the generating polynomial, so label it as such
fig.add_scatter(x=x1, y=y1, mode="lines",name='SGD fit')
fig.update_layout(width=800,height=400)
fig.show()

In [None]:
# Animate the fitted curve along the SGD history, one frame every `step` updates.
step = 100
x1 = np.linspace(x.min(),x.max(),num=50)

# Each frame redraws the first trace (the line) with the coefficients of update i*step;
# the title must report the loss of that same update, i.e. vec[i*step] (not vec[i]).
frames = [go.Frame(data=[go.Scatter(x=x1, y=noisy_curve(vec_c[i*step,:], x= x1)[1],mode='lines')],
                   layout=go.Layout(title_text=f'step:{i*step}, MSE:{vec[i*step]:.2f}'))
          for i in range(len(vec)//step)]

buttons = [dict(label="Play",method="animate",
                args=[None, {"frame": {"duration": 100, "redraw": True},
                             "fromcurrent": True, 
                             "transition": {"duration": 300,"easing": "quadratic-in-out"}}]),
           dict(label="Pause",method="animate",
                args=[[None], {"frame": {"duration": 0, "redraw": False},"mode": "immediate","transition": {"duration": 0}}]),
          dict(label="Restart",method="animate",
                args=[None,{"frame": {"duration": 100, "redraw": True}}])]

# Initial state: the curve for the starting coefficients plus the (static) data points
Fig = go.Figure(
    data=[go.Scatter(x=x1, y= noisy_curve(vec_c[0,:],x=x1)[1],mode='lines',name = 'line',
                     hovertemplate='x:%{x:.2f}'+'<br>y:%{y:.2f}</br><extra></extra>'),
          go.Scatter(x=x, y=y, mode="markers", name='data',
                hovertemplate='x:%{x:.2f}'
                +'<br>y:%{y:.2f}</br><extra></extra>')],
    layout=go.Layout(
        # fixed axis ranges so the view does not rescale between frames
        xaxis=dict(range=[x.min()-2, x.max()+2], autorange=False),       
        yaxis=dict(range=[y.min()-2, y.max()+2], autorange=False),
        updatemenus=[dict(
            type="buttons",
            buttons=buttons)]
    ),
    frames= frames
)

Fig.show()

# Overfitting

In [None]:
# Least-squares cubic fit of the noisy data (coefficients in increasing degree)
cc = polyfit(x, y, deg=3)
# First row: fitted coefficients; second row: coefficients used to generate the data
print(np.vstack((cc, coeffs)))
# MSE of the fit, then MSE of the generating polynomial, on the same data
print(MSE(x, y, cc), '\n', MSE(x, y, coeffs))

[[2.19030876 1.31982335 0.03581608 0.96047028]
 [2.         1.         0.         1.        ]]
3.81488262607257 
 3.929760213594528


In [None]:
# Fit polynomials of degree 1..npoly-1 on the first ndata points and score
# each fit on those points (training) and on the remaining ones (validation).
vec_cc, mse_t, mse_v = [], [], []

npoly = 20
ndata = 50
for deg in range(1, npoly):
    fit = polyfit(x[:ndata], y[:ndata], deg=deg)
    vec_cc.append(fit)   # stored at index deg-1
    mse_t.append(MSE(x[:ndata], y[:ndata], fit))
    mse_v.append(MSE(x[ndata:], y[ndata:], fit))

In [None]:
# Training and validation MSE against the polynomial degree
fig = go.Figure()
fig.add_trace(go.Scatter(x=np.arange(1, npoly), y=mse_t,
                         mode='lines+markers', name='training'))
# the validation curve starts hidden and is revealed from the legend
fig.add_trace(go.Scatter(x=np.arange(1, npoly), y=mse_v,
                         mode='lines+markers', name='validation',
                         visible='legendonly'))
fig.update_layout(yaxis_range=[0, 10])

In [None]:
# Training points together with a selection of the fitted polynomials.
fig = go.Figure()
fig.add_scatter(x=x[:ndata], y=y[:ndata], mode="markers", name='data',
                hovertemplate='x:%{x:.2f}'
                +'<br>y:%{y:.2f}</br><extra></extra>')
x1 = np.linspace(x.min(),x.max(),num=50)

# degrees to draw; the fit of degree k is stored in vec_cc[k-1]
poly = [1, 2, 3, 4, 6, 8, 10, 19]
for i,k in enumerate(poly):
    # Show the first curve and leave the others to be toggled from the legend.
    # (Testing k == 0 could never be true because the degrees start at 1.)
    visible = True if i == 0 else 'legendonly'
    x1, y1 = noisy_curve(vec_cc[k-1],x=x1)
    fig.add_scatter(x=x1, y=y1, mode="lines",name=f'{k}th degree', visible=visible)
# clamp the y axis to the data range: high-degree fits can swing far outside it
fig.update_layout(width=800, height=400, yaxis_range=[y.min(),y.max()])
fig.show()

# More data

In [None]:
# Larger dataset drawn from the same polynomial, interval and noise as before
nsamples = 1000
xn, yn = noisy_curve(
    coeffs,
    interval=[-3, 1.5],
    noise=[0., 2],
    nsamples=nsamples,
)

In [None]:
# Same degree scan on the larger dataset: the first 80% of the points are used
# for fitting, the remaining 20% for validation.
vec_cc, mse_t, mse_v = [], [], []

npoly = 20
ndata = int(0.8*nsamples)

for deg in range(1, npoly):
    fit = polyfit(xn[:ndata], yn[:ndata], deg=deg)
    vec_cc.append(fit)   # stored at index deg-1
    mse_t.append(MSE(xn[:ndata], yn[:ndata], fit))
    mse_v.append(MSE(xn[ndata:], yn[ndata:], fit))

In [None]:
# Training and validation MSE against the polynomial degree (larger dataset)
fig = go.Figure()
fig.add_trace(go.Scatter(x=np.arange(1, npoly), y=mse_t,
                         mode='lines+markers', name='training'))
# the validation curve starts hidden and is revealed from the legend
fig.add_trace(go.Scatter(x=np.arange(1, npoly), y=mse_v,
                         mode='lines+markers', name='validation',
                         visible='legendonly'))
fig.update_layout(yaxis_range=[0, 10])

In [None]:
# Training points of the larger dataset together with a selection of the fits.
fig = go.Figure()
fig.add_scatter(x=xn[:ndata], y=yn[:ndata], mode="markers", name='data',
                hovertemplate='x:%{x:.2f}'
                +'<br>y:%{y:.2f}</br><extra></extra>')
# The grid and the axis range follow the dataset actually plotted here (xn, yn),
# not the smaller x, y sample used in the earlier cells.
x1 = np.linspace(xn.min(),xn.max(),num=50)

# degrees to draw; the fit of degree k is stored in vec_cc[k-1]
poly = [1, 2, 3, 4, 6, 8, 10, 19]
for i,k in enumerate(poly):
    # Show the first curve and leave the others to be toggled from the legend.
    # (Testing k == 0 could never be true because the degrees start at 1.)
    visible = True if i == 0 else 'legendonly'
    x1, y1 = noisy_curve(vec_cc[k-1],x=x1)
    fig.add_scatter(x=x1, y=y1, mode="lines",name=f'{k}th degree', visible=visible)
fig.update_layout(width=800, height=400, yaxis_range=[yn.min(),yn.max()])
fig.show()

# Regularization

In [None]:
def poly_cond(x, n):
    """Build the matrix whose columns are x**1, x**2, ..., x**n.

    x is a 1-D array; the result is a float array of shape (x.size, n).
    There is no constant (x**0) column.
    """
    powers = np.zeros((x.size, n))
    for col in range(n):
        # column `col` holds the power col+1
        powers[:, col] = x ** (col + 1)
    return powers

In [None]:
#| output: false
vec_cc = []
mse_t = []
mse_v = []

npoly = 20
ndata = 50
for i in np.arange(1,npoly):
    matx = poly_cond(x[:ndata],i)
    reg = linear_model.Ridge(alpha=0.5)
    reg.fit(matx,y[:ndata])
    c = np.insert(reg.coef_,0,reg.intercept_)
    vec_cc.append(c)
    mse_t.append(MSE(x[:ndata], y[:ndata],vec_cc[i-1]))
    mse_v.append(MSE(x[ndata:], y[ndata:],vec_cc[i-1]))


Ill-conditioned matrix (rcond=9.3118e-17): result may not be accurate.


Ill-conditioned matrix (rcond=1.04722e-17): result may not be accurate.


Ill-conditioned matrix (rcond=1.15966e-18): result may not be accurate.


Ill-conditioned matrix (rcond=1.25794e-19): result may not be accurate.



In [None]:
# Training and validation MSE against the polynomial degree (ridge fits)
fig = go.Figure()
fig.add_trace(go.Scatter(x=np.arange(1, npoly), y=mse_t,
                         mode='lines+markers', name='training'))
# the validation curve starts hidden and is revealed from the legend
fig.add_trace(go.Scatter(x=np.arange(1, npoly), y=mse_v,
                         mode='lines+markers', name='validation',
                         visible='legendonly'))
fig.update_layout(yaxis_range=[0, 10])

In [None]:
# Training points together with a selection of the ridge-regularized fits.
fig = go.Figure()
fig.add_scatter(x=x[:ndata], y=y[:ndata], mode="markers", name='data',
                hovertemplate='x:%{x:.2f}'
                +'<br>y:%{y:.2f}</br><extra></extra>')
x1 = np.linspace(x.min(),x.max(),num=50)

# degrees to draw; the fit of degree k is stored in vec_cc[k-1]
poly = [1, 2, 3, 4, 6, 8, 10, 19]
for i,k in enumerate(poly):
    # Show the first curve and leave the others to be toggled from the legend.
    # (Testing k == 0 could never be true because the degrees start at 1.)
    visible = True if i == 0 else 'legendonly'
    x1, y1 = noisy_curve(vec_cc[k-1],x=x1)
    fig.add_scatter(x=x1, y=y1, mode="lines",name=f'{k}th degree', visible=visible)
# clamp the y axis to the data range so large excursions of a fit do not hide the data
fig.update_layout(width=800, height=400, yaxis_range=[y.min(),y.max()])
fig.show()