In [1]:
%%javascript
// Replace the output area's `_should_scroll` hook with one that always answers
// `false`, whatever line count it is given.
// NOTE(review): presumably this keeps long cell outputs from being collapsed
// into a scrollable box — confirm against the notebook front end in use.
IPython.OutputArea.prototype._should_scroll = (lines) => false;

<IPython.core.display.Javascript object>

<h1>Import dependencies</h1>

In [2]:
# Math modules
import numpy as np
import numpy.random as rnd
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Default label font sizes: axis labels at 14, x/y tick labels at 12.
mpl.rcParams.update({
    "axes.labelsize": 14,
    "xtick.labelsize": 12,
    "ytick.labelsize": 12,
})

# Scikit-Learn ≥0.20 is required
import re  # stdlib; only needed to parse the version string below
import sklearn
# Compare the version numerically. A plain string comparison is lexicographic,
# so e.g. "0.9" >= "0.20" would be True and the check would wrongly pass.
# Only the leading "<major>.<minor>" digits are parsed, so suffixes such as
# "rc1" or ".dev0" do not break the check.
_sklearn_version = tuple(
    int(part) for part in re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups()
)
assert _sklearn_version >= (0, 20), "Scikit-Learn >= 0.20 is required"
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

<hr style="border-top: 3px solid Black;">

<div style="font-size:16px; border:1px solid black; padding:10px">
        <center><h3>Learning Curves: Spilling the "Tea" on over or under performing ML Models.</h3></center>
<ul>
    <li>Models that overfit the training data will not generalize well with new data.</li><br>
    <li>Models that underfit the data will perform poorly on both the training and validation datasets.</li><br>
    <li>Learning curves can further help diagnose whether either of these is a problem with the model used.</li><br>
    <li>Learning curves plot the following:
        <ul>
            <li>x-axis: the size of the training set data</li>
            <li>y-axis: the root mean square error (loss, or error in predictions).</li>
        </ul>
    </li>        
</ul>        
</div>

In [3]:
def plot_learning_curves(model, X, y):
    """Plot training and validation RMSE against the training-set size.

    The data is split 80/20 into a training and a validation subset using
    ``train_test_split`` with ``random_state=10``. For every size ``m`` from 1
    up to and including the full training subset, ``model`` is refit on the
    first ``m`` training samples; its RMSE on those ``m`` samples and on the
    whole validation subset is recorded. Both curves are then drawn with
    ``matplotlib.pyplot`` (i.e. on the current figure/axes).

    Parameters
    ----------
    model : object
        Must provide ``fit(X, y)`` and ``predict(X)``. It is refit on every
        iteration, so after the call it is left fitted on the full training
        subset.
    X, y : sliceable sequences
        Features and targets. They are passed to ``train_test_split`` and the
        resulting training parts are sliced with ``[:m]``.

    Returns
    -------
    None
    """
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=10)

    # Go up to len(X_train) inclusive so the last point of the curve is the
    # model trained on every available training sample.
    train_sizes = list(range(1, len(X_train) + 1))

    train_errors, val_errors = [], []
    for m in train_sizes:
        model.fit(X_train[:m], y_train[:m])
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val)
        train_errors.append(mean_squared_error(y_train[:m], y_train_predict))
        val_errors.append(mean_squared_error(y_val, y_val_predict))

    # Pass the sizes explicitly: without x values pyplot would use the 0-based
    # list index, shifting the "Training set size" axis by one.
    plt.plot(train_sizes, np.sqrt(train_errors), "r-+", linewidth=2, label="train")
    plt.plot(train_sizes, np.sqrt(val_errors), "b-", linewidth=3, label="val")
    plt.legend(loc="upper right", fontsize=14)
    plt.xlabel("Training set size", fontsize=14)
    plt.ylabel("RMSE", fontsize=14)