In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from google.colab import drive
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Mount Google Drive at /content/gdrive; the CSV files loaded below are read from it.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
# Column dtypes handed to read_csv. 'id', 'zipcode' and 'date' are read as
# strings so that values such as ids with a leading zero are kept verbatim.
dtype_dict = {
  'bathrooms': float,
  'waterfront': int,
  'sqft_above': int,
  'sqft_living15': float,
  'grade': int,
  'yr_renovated': int,
  'price': float,
  'bedrooms': float,
  'zipcode': str,
  'long': float,
  'sqft_lot15': float,
  'sqft_living': float,
  'floors': float,
  'condition': int,
  'lat': float,
  'date': str,
  'sqft_basement': int,
  'yr_built': int,
  'id': str,
  'sqft_lot': int,
  'view': int,
}

# Drive is mounted at the absolute path /content/gdrive, so anchor the data
# folder there instead of relying on the current working directory.
DATA_DIR = '/content/gdrive/My Drive/uwml'

kc_house_data_small = pd.read_csv(f'{DATA_DIR}/kc_house_data_small.csv', dtype=dtype_dict)
testing = pd.read_csv(f'{DATA_DIR}/kc_house_data_small_test.csv', dtype=dtype_dict)
training = pd.read_csv(f'{DATA_DIR}/kc_house_data_small_train.csv', dtype=dtype_dict)
validation = pd.read_csv(f'{DATA_DIR}/kc_house_data_validation.csv', dtype=dtype_dict)

In [4]:
# Take a copy so the loaded kc_house_data_small frame itself is left untouched.
sales = kc_house_data_small.copy()
# Show the frame as the cell output.
sales

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.00,1180.0,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340.0,5650.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570.0,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.7210,-122.319,1690.0,7639.0
2,5631500400,20150225T000000,180000.0,2.0,1.00,770.0,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720.0,8062.0
3,2487200875,20141209T000000,604000.0,4.0,3.00,1960.0,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360.0,5000.0
4,1954400510,20150218T000000,510000.0,3.0,2.00,1680.0,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800.0,7503.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8698,0844000965,20140626T000000,224000.0,3.0,1.75,1500.0,11968,1.0,0,0,3,6,1500,0,2014,0,98010,47.3095,-122.002,1320.0,11303.0
8699,7852140040,20140825T000000,507250.0,3.0,2.50,2270.0,5536,2.0,0,0,3,8,2270,0,2003,0,98065,47.5389,-121.881,2270.0,5731.0
8700,9834201367,20150126T000000,429000.0,3.0,2.00,1490.0,1126,3.0,0,0,3,8,1490,0,2014,0,98144,47.5699,-122.288,1400.0,1230.0
8701,3448900210,20141014T000000,610685.0,4.0,2.50,2520.0,6023,2.0,0,0,3,9,2520,0,2014,0,98056,47.5137,-122.167,2520.0,6023.0


In [5]:
# Function to create regression inputs
def get_numpy_data(df, features, output):
  """Build the design matrix and target vector for a regression problem.

  Args:
    df: pandas DataFrame containing the feature columns and the output column.
    features: sequence of column names to use as features.
    output: name of the target column.

  Returns:
    Tuple (x, y). x is a 2-D numpy array whose first column is the constant 1
    (intercept term) followed by the requested feature columns in the given
    order; y is a 1-D numpy array with the values of the output column.
  """
  x_list = ['constant'] + list(features)
  # assign() returns a new frame, so the caller's DataFrame does not gain a
  # 'constant' column as a side effect of calling this function.
  x = df.assign(constant=1)[x_list].to_numpy()
  y = df[output].to_numpy()
  return (x, y)

In [6]:
# Normalise features function
def normalize_features(features):
  """Scale every column of a feature matrix to unit Euclidean (L2) norm.

  Args:
    features: 2-D numeric array, one column per feature.

  Returns:
    Tuple (normalized_features, norms). norms holds the 2-norm of each input
    column; normalized_features is the input with each column divided by its
    norm. A column whose norm is 0 (all zeros) is returned as all zeros rather
    than NaN; its entry in norms is still 0.
  """
  norms = np.linalg.norm(features, axis=0)
  # Dividing an all-zero column by its norm would be 0/0 -> NaN. Use 1 as the
  # divisor for those columns so they simply remain zero.
  safe_norms = np.where(norms == 0, 1.0, norms)
  normalized_features = features / safe_norms
  return (normalized_features, norms)

In [7]:
# Test norm function
# 3x3 matrix holding 1..9 as floats, used to eyeball the normalisation.
test = np.arange(1., 10.).reshape(3, 3)
nms = normalize_features(test)
# One value per line: the input, the normalised matrix, then the column norms.
print(test, nms[0], nms[1], sep='\n')

[[1. 2. 3.]
 [4. 5. 6.]
 [7. 8. 9.]]
[[0.12309149 0.20739034 0.26726124]
 [0.49236596 0.51847585 0.53452248]
 [0.86164044 0.82956136 0.80178373]]
[ 8.1240384   9.64365076 11.22497216]


In [8]:
# Step 
# Features
# Names of the 17 house attributes used as model inputs.
features = [
  'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
  'waterfront', 'view', 'condition', 'grade',
  'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
  'lat', 'long', 'sqft_living15', 'sqft_lot15',
]

In [9]:
# Step 5
# Numpy arrays of training, test and validation sets
# Un-normalised feature matrices and 'price' targets for each data split.
np_train_x_raw, np_train_y = get_numpy_data(df=training, features=features, output='price')
np_valid_x_raw, np_valid_y = get_numpy_data(df=validation, features=features, output='price')
np_test_x_raw, np_test_y = get_numpy_data(df=testing, features=features, output='price')

In [10]:
# Step 6
# Normalisation
# Column norms are computed on the training set only; the validation and test
# matrices are then scaled by those same training norms.
train_x, train_norms = normalize_features(np_train_x_raw)
valid_x, test_x = (raw / train_norms for raw in (np_valid_x_raw, np_test_x_raw))

In [11]:
# Step 7
# First test row and tenth training row, one per line.
print(test_x[0], train_x[9], sep='\n')

[ 0.01345102  0.01551285  0.01807473  0.01759212  0.00160518  0.017059
  0.          0.05102365  0.0116321   0.01564352  0.01362084  0.02481682
  0.01350306  0.          0.01345387 -0.01346922  0.01375926  0.0016225 ]
[ 0.01345102  0.01163464  0.00602491  0.0083488   0.00050756  0.01279425
  0.          0.          0.01938684  0.01390535  0.0096309   0.
  0.01302544  0.          0.01346821 -0.01346251  0.01195898  0.00156612]


In [12]:
# Step 8 / Q. 1
def eucl_dist(j, q):
  """Return the Euclidean distance between the vectors j and q."""
  delta = j - q
  return np.sqrt(np.square(delta).sum())

In [13]:
# Distance between the first test row and the tenth training row.
eucl_dist(test_x[0], train_x[9])

0.05972359371398078

In [14]:
# Step 9 / Q. 2
# Distances from the first test row to each of the first 10 training rows.
dist = [eucl_dist(test_x[0], train_row) for train_row in train_x[0:10]]
print(dist)


[0.06027470916295592, 0.08546881147643746, 0.06149946435279315, 0.05340273979294363, 0.05844484060170442, 0.059879215098128345, 0.05463140496775461, 0.055431083236146074, 0.052383627840220305, 0.05972359371398078]


In [15]:
# Step 9 / Q. 2
# Print each distance next to the index of its training row; enumerate
# supplies the index instead of a hand-maintained counter.
for counter, train_row in enumerate(train_x[0:10]):
  dist = eucl_dist(test_x[0], train_row)
  print(counter, ' : ', dist)

0  :  0.06027470916295592
1  :  0.08546881147643746
2  :  0.06149946435279315
3  :  0.05340273979294363
4  :  0.05844484060170442
5  :  0.059879215098128345
6  :  0.05463140496775461
7  :  0.055431083236146074
8  :  0.052383627840220305
9  :  0.05972359371398078


In [16]:
# Step 12
# Broadcasting subtracts the single query row test_x[0] from every row of train_x.
diff = train_x - test_x[0]

In [17]:
# Second difference row, number of rows, and the sum of the last row; each
# '\n' item reproduces the blank lines between the three values.
print(diff[1], '\n', len(diff), '\n', diff[-1].sum(), sep='\n')

[ 0.00000000e+00 -3.87821276e-03 -4.51868214e-03 -2.26610387e-03
  7.19763456e-04  0.00000000e+00  0.00000000e+00 -5.10236549e-02
  0.00000000e+00 -3.47633726e-03  1.30705004e-03 -1.45830788e-02
 -1.91048898e-04  6.65082271e-02  4.23090220e-05  6.16364736e-06
 -2.89330197e-03  1.47606982e-03]


5527


-0.09343399874654643


In [18]:
# The row-wise sum of squares taken from the full matrix should agree with the
# same quantity computed directly on row 15 (up to floating-point rounding).
print(np.sum(diff**2, axis=1)[15], np.sum(diff[15]**2), sep='\n')

0.0033070590284564457
0.0033070590284564453


In [19]:
# Step 14
# Vectorised Euclidean distance from the first test row to every training row.
distances = np.sqrt(((train_x - test_x[0])**2).sum(axis=1))

distances[100]

0.023708232416678195

In [20]:
# Step 15
def compute_distances(features_instances, features_query):
  """Euclidean distance from one query vector to every row of a matrix.

  Args:
    features_instances: 2-D array, one row per instance.
    features_query: 1-D array with the same number of columns.

  Returns:
    1-D array holding one distance per row of features_instances.
  """
  offsets = features_instances - features_query
  squared = offsets**2
  return np.sqrt(np.sum(squared, axis=1))

In [21]:
# Step 16 / Q. 3
# Distances from the third test row to all training rows.
dist = compute_distances(train_x, test_x[2])
# Training-row indices ordered from closest to farthest.
dist_index = dist.argsort()
# The first entry is the single nearest neighbour.
dist_index_smallest = dist_index[0]
dist_index_smallest

382

In [22]:
# Step 17 / Q. 4
# Training target (price) of the nearest neighbour found above.
np_train_y[dist_index_smallest]

249000.0

In [23]:
# Step 18
def k_nearest_neighbours(k, feature_train, feature_query):
  """Return the indices of the k training rows closest to the query.

  Args:
    k: number of neighbours to return.
    feature_train: 2-D array of training features, one row per instance.
    feature_query: 1-D feature vector of the query point.

  Returns:
    Array of row indices into feature_train, ordered nearest first.
  """
  ranked = np.argsort(compute_distances(feature_train, feature_query))
  return ranked[:k]

In [24]:
# Step 19 / Q. 5
# Indices of the 4 training rows nearest to the third test row.
k_nearest_neighbours(k=4, feature_train=train_x, feature_query=test_x[2])

array([ 382, 1149, 4087, 3142])

In [25]:
# Step 20
def predict_output_of_query(k, feature_train, output_train, feature_query):
  """Predict the output for one query as the mean over its k nearest neighbours.

  Args:
    k: number of neighbours to average.
    feature_train: 2-D array of training features.
    output_train: 1-D array of training outputs, aligned with feature_train.
    feature_query: 1-D feature vector of the query point.

  Returns:
    Mean of output_train over the k nearest training rows.
  """
  nearest = k_nearest_neighbours(k, feature_train, feature_query)
  return output_train[nearest].mean()

In [26]:
# Step 21 / Q. 6
# Average training price of the 4 nearest neighbours of the third test row.
predict_output_of_query(k=4, feature_train=train_x, output_train=np_train_y, feature_query=test_x[2])

413987.5

In [27]:
# Step 22
def predict_output(k, feature_train, output_train, feature_query):
  """Predict outputs for every row of a query matrix with k-nearest neighbours.

  Args:
    k: number of neighbours to average for each query row.
    feature_train: 2-D array of training features.
    output_train: 1-D array of training outputs, aligned with feature_train.
    feature_query: 2-D array of query features, one row per query.

  Returns:
    List with one prediction per row of feature_query, in row order.
  """
  return [
    predict_output_of_query(k, feature_train, output_train, feature_query[row])
    for row in range(feature_query.shape[0])
  ]

In [28]:
# Step 23 / Q. 7
# 10-nearest-neighbour predictions for the first 10 test rows.
preds = predict_output(k=10, feature_train=train_x, output_train=np_train_y, feature_query=test_x[0:10])
preds

[881300.0,
 431860.0,
 460595.0,
 430200.0,
 766750.0,
 667420.0,
 350032.0,
 512800.7,
 484000.0,
 457235.0]

In [29]:
# Step 24

def get_residual_sum_of_squares(yhat, y):
  """Return the residual sum of squares between predictions and targets.

  Args:
    yhat: predicted values (numpy array or plain sequence such as a list).
    y: observed values, same length as yhat (numpy array or plain sequence).

  Returns:
    Sum over all elements of (yhat - y) squared.
  """
  # Coerce both inputs to arrays so the subtraction also works when both are
  # plain lists (list - list is not defined in Python).
  e = np.asarray(yhat) - np.asarray(y)
  RSS = np.sum(e * e)
  return RSS

# Sweep k = 1..15 and report the validation RSS for each k, in units of 1e12.
for k in range(1, 16):
  yhat = predict_output(k, train_x, np_train_y, valid_x)
  rss = get_residual_sum_of_squares(yhat, np_valid_y)
  print(k, ' : ', rss/1e12)

1  :  105.453830251561
2  :  83.4450735040255
3  :  72.69209601920257
4  :  71.94672165209168
5  :  69.8465174197186
6  :  68.89954435318083
7  :  68.3419734500511
8  :  67.3616787354915
9  :  68.3727279589761
10  :  69.33504866855674
11  :  69.52385521559883
12  :  69.04996958724617
13  :  70.01125450826369
14  :  70.90869886903434
15  :  71.10692838594515


In [30]:
# Step 25 / Q. 8
# k=8 gave the lowest validation RSS in the sweep over k = 1..15, so it is the
# value used for the test-set predictions.
yhat = predict_output(k=8, feature_train=train_x, output_train=np_train_y, feature_query=test_x)
# RSS of those predictions against the test-set prices.
get_residual_sum_of_squares(yhat=yhat, y=np_test_y)

133118823551516.81