# Lecture 16 Predicting Numerical Values: Getting Started with Regression (Part II)

## 16.1 Setup

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

%matplotlib inline 

In [2]:
from sklearn import (datasets,
                     linear_model,
                     metrics,
                     model_selection as skms,
                     neighbors)

In [3]:
import warnings

# Hide every warning so the lecture output stays uncluttered.
warnings.simplefilter('ignore')
# Seed NumPy's global RNG; the random searches below draw from it.
np.random.seed(seed=42)

In [4]:
# Render floats in pandas output right-aligned in 20 columns, with
# thousands separators and four decimal places.
pd.set_option('display.float_format', '{:20,.4f}'.format)

## 16.2 Optimization

In [5]:
# Toy target values; the cells below search for the single constant
# that predicts them with the smallest sum of squared errors.
tgt = np.array((3, 5, 8, 10, 12, 15))

In [6]:
# random guessing
# Draw candidate constants uniformly between the smallest and largest
# target, score each by its sum of squared errors (SSE) against tgt,
# and keep the candidate with the lowest SSE.
num_guesses = 10
guesses = [np.random.uniform(low=tgt.min(), high=tgt.max())
           for _ in range(num_guesses)]
# (sse, guess) pairs: tuples compare on SSE first, so min() picks
# the best-scoring guess.
results = [(np.sum((tgt - guess)**2), guess) for guess in guesses]
best_guess = min(results)[1]
best_guess

7.49448142616835

In [7]:
# random step, if it's good
# Start from a random constant, then repeatedly nudge it by a fixed
# amount in a randomly chosen direction. A nudge is kept only when it
# lowers the sum of squared errors (SSE); otherwise it is discarded.
num_steps = 100
step_size = 0.05

best_guess = np.random.uniform(low=tgt.min(), high=tgt.max())
best_sse = np.sum((tgt - best_guess)**2)

for step in range(num_steps):
    direction = np.random.choice([+1, -1])
    new_guess = best_guess + (direction * step_size)
    new_sse = np.sum((new_guess - tgt)**2)
    if new_sse < best_sse:
        best_guess, best_sse = new_guess, new_sse
print(best_guess)

5.097013931549623


In [8]:
# smart steps
# Start from a random constant. On every iteration evaluate both
# neighbours (one step down, one step up) and move to whichever has the
# lower sum of squared errors (SSE). Stop as soon as neither neighbour
# improves on the current best, or when the step budget runs out.
num_steps = 100
step_size = 0.02

best_guess = np.random.uniform(low=tgt.min(), high=tgt.max())
best_sse = np.sum((tgt - best_guess)**2)
print('start:', best_guess)

# The two candidate moves are the same on every iteration.
offsets = np.array([-1, +1]) * step_size

for step in range(num_steps):
    guesses = best_guess + offsets
    # Column j holds the squared errors of every target against guess j.
    sses = ((tgt.reshape(-1, 1) - guesses)**2).sum(axis=0)
    guess_idx = np.argmin(sses)

    improved = sses[guess_idx] < best_sse
    if not improved:
        break
    best_guess = guesses[guess_idx]
    best_sse = sses[guess_idx]

print('end:', best_guess)

start: 6.256188381286751
end: 8.256188381286709


In [9]:
# 7.5, 5.1, 8.3
# Those are the (rounded) answers from the three searches above. The
# constant that minimises the sum of squared errors is the mean of the
# targets, shown here for comparison.
tgt.mean()

8.833333333333334

## 16.3 Learning Performance of Regressors

In [10]:
# stand alone code
# Compares k-nearest-neighbour regressors (several values of k) with
# ordinary linear regression on the diabetes dataset, scoring each by
# root-mean-squared error (RMSE) on a held-out test split.
import numpy as np  # np.sqrt is used below; import it so this cell really is stand-alone

from sklearn import (datasets,
                     linear_model,
                     model_selection as skms,
                     metrics,
                     neighbors)

# dataset
diabetes = datasets.load_diabetes()

# tts
# Hold out 25% of the rows for testing. No random_state is passed, so the
# split draws from NumPy's global RNG: the exact scores depend on any
# earlier np.random.seed(...) call and on the random draws made before
# this cell runs.
tts =  skms.train_test_split(diabetes.data,
                             diabetes.target,
                             test_size=.25)
(diabetes_train, diabetes_test,
 diabetes_train_tgt, diabetes_test_tgt) = tts

# define some models
models = {'3-NN': neighbors.KNeighborsRegressor(n_neighbors=3),
          '5-NN': neighbors.KNeighborsRegressor(n_neighbors=5),
          '10-NN': neighbors.KNeighborsRegressor(n_neighbors=10),
          '20-NN': neighbors.KNeighborsRegressor(n_neighbors=20),
          'linreg' : linear_model.LinearRegression()}

# Fit every model on the training split and report its test-set RMSE
# (square root of the mean squared error, so it is in the target's units).
for name, model in models.items():
    fit   = model.fit(diabetes_train, diabetes_train_tgt)
    preds = fit.predict(diabetes_test)
    score = np.sqrt(metrics.mean_squared_error(diabetes_test_tgt,
                                               preds))
    print("{:>6s} : {:0.2f}".format(name,score))

  3-NN : 55.20
  5-NN : 53.01
 10-NN : 53.30
 20-NN : 52.80
linreg : 48.71
