In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import linear_model
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from sklearn.learning_curve import learning_curve
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

# Use seaborn for plotting defaults
import seaborn as sns; sns.set()

from IPython.html.widgets import interact

##  Bias-Variance Tradeoff

Let us work with a 1-D problem so that we can easily visualize the data and the model. These results generalize easily to higher-dimensional datasets. Let us start by creating a simple nonlinear function that we'd like to fit:

In [None]:
def test_func(x, err=0.5):
    y = 10 - 1. / (x + 0.1)
    if err > 0:
        y = np.random.normal(y, err)
    return y

Now let's create a function that samples `N` data points from this function:

In [None]:
def make_data(N=40, error=1.0, random_seed=1):
    """Draw a reproducible noisy sample from ``test_func``.

    Parameters
    ----------
    N : int, default 40
        Number of points to draw.
    error : float, default 1.0
        Noise level forwarded to ``test_func``.
    random_seed : int, default 1
        Seed applied to NumPy's global random state before sampling.

    Returns
    -------
    (X, y) where ``X`` is an ``(N, 1)`` column of uniform draws from [0, 1)
    and ``y`` holds the corresponding ``test_func`` values.
    """
    # Seeding first makes both the inputs and the noise repeatable.
    np.random.seed(random_seed)
    inputs = np.random.random(N)[:, None]
    targets = test_func(inputs.ravel(), error)
    return inputs, targets

Now let us create one dataset and plot it:

In [None]:
# Draw a single 40-point noisy sample and look at it.
X, y = make_data(N=40, error=1)
plt.scatter(X[:, 0], y)

Let's use a built-in linear regression function to compute the simplest linear fit to this data and then plot the predictions on some `X_test` sample data:

In [None]:
# Dense evaluation grid; it extends slightly past the sampled [0, 1) range.
X_test = np.linspace(-0.1, 1.1, 500).reshape(-1, 1)

# Straight-line least-squares fit to the current sample.
model = linear_model.LinearRegression()
model.fit(X, y)
y_test = model.predict(X_test)

# Error measured on the same points the model was trained on.
train_mse = metrics.mean_squared_error(model.predict(X), y)

plt.ylim([-2,14])
plt.scatter(X.ravel(), y)
plt.plot(X_test.ravel(), y_test, lw=3)
plt.title("mean squared error: {0:.3g}".format(train_mse));

Now let us resample and fit this linear model a few times to see the variance of our predicted function:

In [None]:
# Create the straight-line model in this cell so the result does not depend
# on whichever `model` an earlier (or out-of-order) cell left in the kernel.
model = linear_model.LinearRegression()

average_mse = 0.0   # running sum of training MSEs; divided by n_sims for the title
n_samples = 40
n_sims = 20

for i in range(n_sims):
    # A different seed per iteration gives a fresh training sample.
    X, y = make_data(n_samples, error=1, random_seed=i)
    model.fit(X, y)
    y_test = model.predict(X_test)

    # Overlay every sample and every fitted line on the same axes.
    plt.ylim([-2,14])
    plt.scatter(X.ravel(), y, alpha=0.2)
    plt.plot(X_test.ravel(), y_test, alpha=0.4, lw=3)
    average_mse += metrics.mean_squared_error(model.predict(X), y)

plt.title("average mean squared error: {0:.3g}".format(average_mse/n_sims));

Clearly this model is not a good choice. We say that this model is biased, or that it under-fits the data. However, the variance of our estimates is quite small, since most of the estimates lie close to each other.

Let's try to improve on this by creating a more complicated model. We can do this by adding degrees of freedom and computing a polynomial regression over the inputs:

In [None]:
def PolynomialRegression(degree=2, **kwargs):
    """Build a polynomial-regression pipeline.

    The pipeline first expands the inputs with ``PolynomialFeatures(degree)``
    and then fits a ``LinearRegression`` on the expanded features.

    Parameters
    ----------
    degree : int, default 2
        Degree handed to ``PolynomialFeatures``.
    **kwargs
        Forwarded unchanged to ``linear_model.LinearRegression``.
    """
    expansion = PolynomialFeatures(degree)
    regressor = linear_model.LinearRegression(**kwargs)
    return make_pipeline(expansion, regressor)

Now we'll use this to fit a quadratic curve to the data repeatedly and compute the average of the errors:

In [None]:
model = PolynomialRegression(degree=2)

n_samples = 40
n_sims = 20

# Collect the training MSE of every resampled fit, then total them up.
train_mses = []
for i in range(n_sims):
    # Each seed yields a different training sample.
    X, y = make_data(n_samples, error=1, random_seed=i)
    model.fit(X, y)
    y_test = model.predict(X_test)

    plt.ylim([-2,14])
    plt.scatter(X.ravel(), y, alpha=0.2)
    plt.plot(X_test.ravel(), y_test, alpha=0.4, lw=3)
    train_mses.append(metrics.mean_squared_error(model.predict(X), y))

average_mse = sum(train_mses)   # sum over simulations; averaged in the title
plt.title("average mean squared error: {0:.3g}".format(average_mse/n_sims));

We see that we have reduced the mean squared error and that this model fits the data well. Perhaps we should just keep increasing the complexity of our model. Would that be better?

In [None]:
model = PolynomialRegression(degree=10)

average_mse = 0.0   # running sum of training MSEs; averaged in the title
n_samples, n_sims = 40, 20

for i in range(n_sims):
    # Each seed yields a different training sample.
    X, y = make_data(n_samples, error=1, random_seed=i)
    y_test = model.fit(X, y).predict(X_test)
    fit_mse = metrics.mean_squared_error(model.predict(X), y)
    average_mse += fit_mse

    plt.ylim([-2,14])
    plt.scatter(X.ravel(), y, alpha=0.2)
    plt.plot(X_test.ravel(), y_test, alpha=0.4, lw=3)

plt.title("average mean squared error: {0:.3g}".format(average_mse/n_sims));

When we increase the degree to this extent, it's clear that the resulting fit no longer reflects the true underlying distribution and is more sensitive to the noise in the training data. We are in a situation called over-fitting, where the bias is very low but the variance of our estimators is very high.

Just for fun, let's use IPython's interact capability to explore this interactively:

In [None]:
def plot_fit(degree=1, Npts=50):
    """Fit a polynomial of the given degree to a fresh sample and plot it.

    Parameters
    ----------
    degree : int, default 1
        Degree of the polynomial regression.
    Npts : int, default 50
        Number of training points drawn with ``make_data``.

    Draws the training points, the fitted curve on a dense grid, and puts the
    training mean squared error in the title.
    """
    X, y = make_data(Npts, error=1)
    X_test = np.linspace(-0.1, 1.1, 500)[:, None]

    model = PolynomialRegression(degree=degree)
    model.fit(X, y)
    y_test = model.predict(X_test)

    plt.scatter(X.ravel(), y)
    plt.plot(X_test.ravel(), y_test)
    plt.ylim(-4, 14)
    plt.title("mean squared error: {0:.2f}".format(metrics.mean_squared_error(model.predict(X), y)))

# (min, max) tuples request sliders. A two-element list would be read by
# ipywidgets as a dropdown offering only those two values.
interact(plot_fit, degree=(1, 30), Npts=(2, 100));

### Effects of `n_samples`

Now let us investigate the effect of increasing and reducing the sample size (the size of our training data set) in each of these situations: under-fitting, optimal fitting, and over-fitting.

For the case with high bias, let us think about what will happen. Increasing the number of points will lead to more or less the same straight line, so the error on data the algorithm has not seen will plateau after a certain number of data points. The same holds for the training error: it will start small and end up close to the test error. The learning curves will therefore look like the ones below, with high values of the error, and increasing the training data size will not by itself help:

![](../data/lc-hb.png)

Whereas if our algorithm is suffering from high variance, then getting more training data is likely to help:

![](../data/lc-hv.png)

Let us now try to simulate these curves:

In [None]:
def plot_with_err(x, data, **kwargs):
    """Plot the row-wise mean of ``data`` against ``x`` with a shaded band.

    Parameters
    ----------
    x : array-like
        Horizontal coordinates, one per row of ``data``.
    data : array
        Values whose mean and standard deviation are taken along axis 1.
    **kwargs
        Forwarded to ``plt.plot`` for the mean line (e.g. ``label``).

    The band spans one standard deviation either side of the mean and reuses
    the colour of the mean line.
    """
    center = data.mean(1)
    spread = data.std(1)
    mean_line = plt.plot(x, center, '-', **kwargs)[0]
    plt.fill_between(x, center - spread, center + spread,
                     facecolor=mean_line.get_color(),
                     edgecolor='none', alpha=0.2)
    
def rms_error(model, X, y):
    """Root-mean-square error of ``model.predict(X)`` against ``y``.

    The ``(model, X, y)`` signature lets this be passed as a ``scoring``
    callable, as ``plot_learning_curve`` does.
    """
    residuals = y - model.predict(X)
    mean_square = np.mean(residuals ** 2)
    return np.sqrt(mean_square)

def plot_learning_curve(degree=3):
    """Plot training and validation rms error against training-set size.

    A polynomial regression of the given ``degree`` is evaluated with 5-fold
    cross-validation on the module-level ``X`` and ``y`` at 20 training-set
    fractions between 5% and 100%.

    Parameters
    ----------
    degree : int, default 3
        Degree of the polynomial regression being evaluated.
    """
    fractions = np.linspace(0.05, 1, 20)
    estimator = PolynomialRegression(degree)
    N_train, val_train, val_test = learning_curve(estimator, X, y,
                                                  train_sizes=fractions,
                                                  cv=5, scoring=rms_error)

    # Mean curve with a +/- one standard deviation band across the folds.
    plot_with_err(N_train, val_train, label='training scores')
    plot_with_err(N_train, val_test, label='validation scores')

    plt.xlabel('Training Set Size')
    plt.ylabel('rms error')
    plt.ylim(0, 3)
    plt.xlim(5, 80)
    plt.legend()

In [None]:
# High-bias case: a straight line evaluated on a 200-point sample.
degree = 1
X, y = make_data(N=200, error=1.0, random_seed=degree)
plot_learning_curve(degree=degree)

This shows a typical learning curve: for very few training points, there is a large separation between the training and test error. As the data set size increases, the training and testing errors converge and plateau out. 

It is easy to see that, in this plot, if you'd like to reduce the MSE down to the nominal value of 1.0 (which is the inherent noise we added), then adding more samples will never get you there.

What about now for the other extreme of high variance and low bias?

In [None]:
# High-variance case: a degree-9 polynomial evaluated on a 200-point sample.
degree = 9
X, y = make_data(N=200, error=1.0, random_seed=degree)
plot_learning_curve(degree=degree)

Here we see that by adding more model complexity, we've managed to lower the level of convergence to an rms error of 1.0! But the convergence only happens for large amounts of training data.

So we see that:

* you can cause the lines to converge by adding more points or by simplifying the model
* you can bring the convergence error down only by increasing the complexity of the model

Thus these curves can give you hints about how you might improve a sub-optimal model. If the curves are already close together, you need more model complexity. If the curves are far apart, you might also improve the model by adding more data.

## Wage Data


In [None]:
# Load the wage data, using the `id` column as the row index, and preview it.
wage = pd.read_csv("../data/Wage.csv", index_col='id')
wage.head()

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20,8))

# One panel per predictor: every observation in grey, with the mean wage of
# each age / year group drawn on top. Each panel is drawn exactly once.
for ax, column, label in zip(axes, ['age', 'year'], ['Age', 'Year']):
    group_mean = wage.groupby(by=column).wage.mean()
    ax.plot(group_mean.index, group_mean.values, '-', color='b', lw=3, label='Mean')
    ax.scatter(wage[column], wage.wage, color='#cccccc')
    ax.set_xlabel(label)
    ax.set_ylabel('Wage')
    # `remove_border` is not defined in this notebook; seaborn's despine
    # strips the top and right spines instead.
    sns.despine(ax=ax)

# Wage distribution per education level (pandas draws this on a new figure).
wage.boxplot('wage', by='education');

## Sales Data

In [None]:
# Load the advertising data, using the `id` column as the row index, and
# preview the first five rows.
data = pd.read_csv('../data/Advertising.csv', index_col='id')
data.head(5)

In [None]:
# Simple linear regression of Sales on the single TV column.
regr = linear_model.LinearRegression()
regr.fit(X=data[['TV']], y=data['Sales'])

In [None]:
# Evaluate the fitted line at TV = 0, 1, ..., 299 and draw it over the data.
tv = np.arange(0,300)
fitted_sales = regr.intercept_ + regr.coef_ * tv
plt.plot(tv, fitted_sales, color='b')

plt.scatter(data.TV, data.Sales, color='k')
plt.xlabel('TV')
plt.ylabel('Sales')

In [None]:
# Residual variance estimate: residual sum of squares divided by the residual
# degrees of freedom. The fitted intercept is an estimated parameter too, so
# it is counted alongside the feature columns.
n_obs, n_features = data[['TV']].shape
n_params = n_features + int(regr.fit_intercept)
sse = np.sum((regr.predict(data[['TV']]) - data.Sales) ** 2, axis=0) / float(n_obs - n_params)

In [None]:
# Standard errors of the fitted coefficients, ordered [intercept, TV].
# The original comprehension iterated over range(1, 1), which is empty, so
# `se` was always an empty array.
# Assumes `regr` was fitted with an intercept (the LinearRegression default),
# hence the leading column of ones in the design matrix.
design = np.column_stack([np.ones(len(data)), data[['TV']].values])
residuals = data.Sales.values - regr.predict(data[['TV']])
# Unbiased residual variance: RSS / (n - number of estimated parameters).
sigma2 = np.sum(residuals ** 2) / float(design.shape[0] - design.shape[1])
se = np.sqrt(np.diagonal(sigma2 * np.linalg.inv(np.dot(design.T, design))))

In [None]:
# Residual sum of squares of the TV-only fit.
((regr.predict(data[['TV']]) - data.Sales) ** 2).sum(axis=0)

In [None]:
import statsmodels.api as sm
from patsy import dmatrices

In [None]:
 # Build the response and design matrices from the advertising frame.
 # The frame is named `data`; `df` is not defined anywhere in this notebook.
 y, X = dmatrices('Sales ~ TV + Radio + Newspaper', data=data, return_type='dataframe')

In [None]:
 # Ordinary least squares with `y` as the response and `X` as the design matrix.
 mod = sm.OLS(endog=y, exog=X)

In [None]:
 # Fit the OLS model built above and keep the returned results in `res`.
 res = mod.fit() 

In [None]:
# Function-call form of print: valid on Python 3 and consistent with the
# print(corr) call later in the notebook.
print(res.summary())

In [None]:
import scipy.stats as stats

# Correlation between Radio spend and Sales under three different measures.
# Each scipy function returns (statistic, p-value); only the statistic is kept.
corr = {}
for measure_name, measure in (('pearson', stats.pearsonr),
                              ('spearman', stats.spearmanr),
                              ('kendall', stats.kendalltau)):
    corr[measure_name], _ = measure(data.Radio, data.Sales)

print(corr)