In [None]:
import pandas as pd
import numpy as np

In [None]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':float, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}
sales = pd.read_csv('kc_house_data_small.csv.zip', dtype = dtype_dict)
train = pd.read_csv('kc_house_data_small_train.csv.zip', dtype = dtype_dict)
test = pd.read_csv('kc_house_data_small_test.csv.zip', dtype = dtype_dict)
valid = pd.read_csv('kc_house_data_small_validation.csv.zip', dtype = dtype_dict)

In [None]:
def get_numpy_data(data, features, output):
    data['constant'] = 1 # add a constant column to a dataframe
    # prepend variable 'constant' to the features list
    features = ['constant'] + features
    # select the columns of dataframe given by the ‘features’ list into the SFrame ‘features_sframe’

    # this will convert the features_sframe into a numpy matrix with GraphLab Create >= 1.7!!
    features_matrix = data[features].as_matrix(columns=None)
    # assign the column of data_sframe associated with the target to the variable ‘output_sarray’

    # this will convert the SArray into a numpy array:
    output_array = data[output].as_matrix(columns=None) 
    return(features_matrix, output_array)

In [None]:
def normalize_features(features):
    norms = np.linalg.norm(features, axis=0)
    normalized_features = features / norms
    return (normalized_features,norms)

In [None]:
feats = ['bedrooms','bathrooms','sqft_living',  'sqft_lot',  'floors','waterfront',  'view',  'condition',  'grade',  'sqft_above',  'sqft_basement','yr_built',  'yr_renovated',  'lat','long','sqft_living15','sqft_lot15']
train_feat_mat , train_out = get_numpy_data(train,feats,output)
test_feat_mat , test_out = get_numpy_data(test,feats,output)
valid_feat_mat , valid_out = get_numpy_data(valid,feats,output)

In [None]:
print(test_feat_mat[0])
print(train_feat_mat[9])

In [None]:
train_normalized , norms = normalize_features(train_feat_mat)
test_normalized = test_feat_mat / norms
valid_normalized = valid_feat_mat / norms

In [None]:
print(train_normalized[9])
print(test_normalized[0])

In [None]:
distance10_1 = np.sqrt(np.sum((train_normalized[9] - test_normalized[0])**2))

In [None]:
distance10_1

In [None]:
min_dist = (1e99,-1)
for i in range(10):
    distance = np.sqrt(np.sum((train_normalized[i] - test_normalized[0])**2))
    if (distance < min_dist[0]):
        min_dist = (distance,i)
    else:
        continue
min_dist

In [None]:
diff = train_normalized[:] - test_normalized[0]

In [None]:
distances = np.sqrt(np.sum(diff**2,axis=1))

In [None]:
distances[100]

In [101]:
def compute_distances(query):
    diff = train_normalized - valid_normalized[query]
    distances = np.sqrt(np.sum(diff**2,axis=1))
    return distances

In [None]:
dist_3 = compute_distances(2)

In [None]:
dist_3

In [None]:
min(dist_3)

In [67]:
np.argmin(dist_3)

382

In [68]:
train_out[382]

249000.0

In [103]:
def fst_NN (query):
    dists = compute_distances(query)
    return np.argmin(dists)

In [102]:
def K_NN(query,k):
    dist = compute_distances(query)
    return np.argsort(dist,axis=0)[:k] # return np.argsort(dist,axis=0)[:k]

In [74]:
K_NN(2,4)

array([ 382, 1149, 4087, 3142], dtype=int64)

In [104]:
def predict_output_of_query(k,query):
    k_ind = K_NN(query,k)
    prediction = np.sum(train_out[k_ind])/k
    return prediction

In [78]:
predict_output_of_query(4,2)

413987.5

In [105]:
def predict_out(k,feats_query):
    query = feats_query.shape[0]
    predicted = list()
    for i in range(query):
        predicted.append(predict_output_of_query(k,i))
    return predicted

In [106]:
np.argmin(predict_out(10,test_normalized[0:10]))

9

In [108]:
K_results_errors = list()
for k in range (1,2):
    valid_outputs = predict_out(k,valid_normalized)
    error = sum((valid_outputs - valid_out)**2)
    K_results_errors.append(error)
K_results_errors

[105453830251561.0]