In [2]:
import os
from datetime import datetime

import pandas as pd
import wget

Loading data

In [3]:
# Fetch the helper modules and the dataset this notebook relies on.
#
# wget.download() does not overwrite an existing file: it saves a numbered
# copy instead (e.g. 'seattleWeather_1948-2017 (5).csv'). Re-running this cell
# unguarded therefore piles up duplicates while the later cells keep reading
# and importing the first copy. Downloading only what is missing keeps the
# cell safe to re-run.
BASE_URL = 'https://raw.githubusercontent.com/MicrosoftDocs/mslearn-introduction-to-machine-learning/main/'

for remote_path in (
    'graphing.py',
    'microsoft_custom_linear_regressor.py',
    'Data/seattleWeather_1948-2017.csv',
):
    # Files are stored in the working directory under their base name,
    # which is what the later read_csv/import statements expect.
    local_name = os.path.basename(remote_path)
    if not os.path.exists(local_name):
        wget.download(BASE_URL + remote_path, out=local_name)

'seattleWeather_1948-2017 (5).csv'

In [4]:
# Read the weather records, parsing the `date` column as datetimes.
dataset = pd.read_csv('seattleWeather_1948-2017.csv', parse_dates=['date'])

# Models need numeric inputs, so express every date as a fractional year:
# the calendar year plus the day of the year scaled by a nominal year length.
# For example, 1948.5 is halfway through the year 1948.
# NOTE(review): 365.5 is used as the year length here; 365.25 would be the
# usual average -- confirm this value is intended before changing it.
dataset['year'] = dataset.date.dt.year + dataset.date.dt.dayofyear / 365.5

# For the sake of this exercise, only February 1 of these years is kept.
desired_dates = [
    datetime(year, 2, 1)
    for year in (1950, 1960, 1970, 1980, 1990, 2000, 2010, 2017)
]

# .copy() detaches the filtered rows from the full frame.
dataset = dataset.loc[dataset.date.isin(desired_dates)].copy()

dataset.head()

Unnamed: 0,date,amount_of_precipitation,max_temperature,min_temperature,rain,year
762,1950-02-01,0.0,27,1,False,1950.087551
4414,1960-02-01,0.15,52,44,True,1960.087551
8067,1970-02-01,0.0,50,42,False,1970.087551
11719,1980-02-01,0.37,54,36,True,1980.087551
15372,1990-02-01,0.08,45,37,True,1990.087551


Comparing two functions

In [5]:
import numpy as np

def sum_of_square_differences(estimate, actual):
    """Return the sum of squared differences between estimates and labels.

    Each difference is squared before summing, so every error contributes a
    non-negative amount and larger errors are weighted more heavily.

    Args:
        estimate: Model predictions (NumPy arrays in this notebook); must
            support element-wise subtraction with ``actual``.
        actual: The true label values.

    Returns:
        The total of ``(estimate - actual) ** 2`` as computed by ``np.sum``.
    """
    residuals = estimate - actual
    squared_residuals = residuals ** 2
    return np.sum(squared_residuals)

def sum_of_absolute_difference(estimate, actual):
    """Return the sum of absolute differences between estimates and labels.

    Each difference contributes its magnitude, so the total grows linearly
    with the size of every individual error.

    Args:
        estimate: Model predictions (NumPy arrays in this notebook); must
            support element-wise subtraction with ``actual``.
        actual: The true label values.

    Returns:
        The total of ``|estimate - actual|`` as computed by ``np.sum``.
    """
    residuals = estimate - actual
    magnitudes = np.absolute(residuals)
    return np.sum(magnitudes)

Let's test both functions on a small example.

In [6]:
# Two true labels, and an estimate that misses each of them by exactly 1.
actual_label = np.array([1, 3])
model_estimate = np.array([2, 2])

# Evaluate both cost functions on the same estimate/label pair.
print(
    'SSD: ',
    sum_of_square_differences(model_estimate, actual_label),
)
print(
    'SAD: ',
    sum_of_absolute_difference(model_estimate, actual_label),
)

SSD:  2
SAD:  2


We have an error of 1 for each estimate, and both methods have returned the same error.

What happens if we distribute these errors differently? Let's pretend that we estimated the first value perfectly but were off by 2 for the second value:

In [8]:
# Same labels as before, but now the first estimate is exact and the
# second one is off by 2.
actual_label = np.array([1, 3])
model_estimate = np.array([1, 1])

# Evaluate both cost functions on the same estimate/label pair.
print(
    "SSD:",
    sum_of_square_differences(model_estimate, actual_label),
)
print(
    "SAD:",
    sum_of_absolute_difference(model_estimate, actual_label),
)

SSD: 4
SAD: 2


Difference in action
Let's fit the model.

In [12]:
from microsoft_custom_linear_regressor import MicrosoftCustomLinearRegressor
import graphing


# Fit the regressor on year -> minimum temperature, using the sum of
# square differences as the cost function.
model = MicrosoftCustomLinearRegressor().fit(
    X=dataset.year,
    y=dataset.min_temperature,
    cost_function=sum_of_square_differences,
)

# Plot the data and pass the fitted model's predict method as the trendline.
graphing.scatter_2D(
    dataset,
    label_x='year',
    label_y='min_temperature',
    trendline=model.predict,
)

Now let's fit the model using the sum of absolute differences (SAD) instead.

In [14]:
# Fit the regressor on year -> minimum temperature, this time using the sum
# of absolute differences as the cost function.
model = MicrosoftCustomLinearRegressor().fit(
    X=dataset.year,
    y=dataset.min_temperature,
    cost_function=sum_of_absolute_difference,
)

# Plot the data and pass the fitted model's predict method as the trendline.
graphing.scatter_2D(
    dataset,
    label_x='year',
    label_y='min_temperature',
    trendline=model.predict,
)