#### Import Libraries

In [109]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

#### Import Data

In [3]:
# Column name -> dtype mapping passed to pd.read_csv so that every CSV is
# parsed with the same schema; 'id', 'zipcode' and 'date' are kept as strings.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': float,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

In [4]:
# Read the four house-data CSVs; dtype_dict fixes the dtype of every column
# so all frames share one schema.
# NOTE(review): train/validation/test are presumably splits of the small
# dataset, judging by the file names — confirm against the data source.
sales = pd.read_csv('kc_house_data_small.csv', dtype=dtype_dict)
train = pd.read_csv('kc_house_data_small_train.csv', dtype=dtype_dict)
validation = pd.read_csv('kc_house_data_validation.csv', dtype=dtype_dict)
test = pd.read_csv('kc_house_data_small_test.csv', dtype=dtype_dict)

In [5]:
# Preview the first rows of each frame, one after another.
print(
    sales.head(),
    train.head(),
    validation.head(),
    test.head(),
    sep='\n',
)

           id             date     price  bedrooms  bathrooms  sqft_living  \
0  7129300520  20141013T000000  221900.0       3.0       1.00       1180.0   
1  6414100192  20141209T000000  538000.0       3.0       2.25       2570.0   
2  5631500400  20150225T000000  180000.0       2.0       1.00        770.0   
3  2487200875  20141209T000000  604000.0       4.0       3.00       1960.0   
4  1954400510  20150218T000000  510000.0       3.0       2.00       1680.0   

   sqft_lot  floors  waterfront  view     ...      grade  sqft_above  \
0      5650     1.0           0     0     ...          7        1180   
1      7242     2.0           0     0     ...          7        2170   
2     10000     1.0           0     0     ...          6         770   
3      5000     1.0           0     0     ...          7        1050   
4      8080     1.0           0     0     ...          8        1680   

   sqft_basement  yr_built  yr_renovated  zipcode      lat     long  \
0              0      1955 

#### Function to convert Data to Array

In [6]:
def get_numpy_data(data, features, output):
    """Build a feature matrix (with intercept column) and a target array.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table. It is NOT modified by this function.
    features : sequence of str
        Names of the columns to use as predictors.
    output : str or list of str
        Target column label(s); forwarded unchanged to ``data.loc``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(features_matrix, output_array)``. ``features_matrix`` starts with
        an all-ones column (the intercept term) followed by the requested
        feature columns in the given order. ``output_array`` holds the values
        of ``data.loc[:, output]``.
    """
    # assign() returns a new DataFrame, so the caller's frame does not gain a
    # stray 'constant' column as a side effect of this call.
    with_intercept = data.assign(constant=1)

    # Intercept column first, then the requested features.
    feature_columns = ['constant'] + list(features)

    # Convert the selected pandas columns into plain numpy arrays.
    features_matrix = np.array(with_intercept.loc[:, feature_columns])
    output_array = np.array(data.loc[:, output])
    return (features_matrix, output_array)


#### Function to normalize features

In [7]:
def normalize_features(features):
    """Scale each column of ``features`` to unit Euclidean (L2) norm.

    Parameters
    ----------
    features : array-like, 2-D
        Matrix whose columns are normalised independently.

    Returns
    -------
    tuple
        ``(normalized_features, norms)`` where ``norms[j]`` is
        ``sqrt(sum_i features[i, j] ** 2)``. Columns whose norm is zero are
        returned unchanged (all zeros) instead of NaN. ``norms`` still reports
        the true value 0 for them, so callers that divide other data by
        ``norms`` must handle zero entries themselves.
    """
    norms = np.linalg.norm(features, axis=0)  # sqrt(f1^2 + f2^2 + ... + fn^2) per column

    # An all-zero column has norm 0 and 0/0 would produce NaN; divide such
    # columns by 1 so they simply stay zero.
    safe_norms = np.where(norms == 0, 1.0, norms)

    normalized_features = features / safe_norms
    return (normalized_features, norms)

#### Convert Data to Array

In [8]:
# Predictor columns used by the model; 'price' is the target column.
features = [
    'bathrooms',
    'waterfront',
    'sqft_above',
    'sqft_living15',
    'grade',
    'yr_renovated',
    'bedrooms',
    'long',
    'sqft_lot15',
    'sqft_living',
    'floors',
    'condition',
    'lat',
    'sqft_basement',
    'yr_built',
    'sqft_lot',
    'view',
]
output = ['price']

# Training set as numpy arrays (feature matrix includes the intercept column).
features_train, output_train = get_numpy_data(
    data=train, features=features, output=output
)

In [9]:
# Same conversion for the validation and test frames.
features_validation, output_validation = get_numpy_data(
    data=validation, features=features, output=output
)
features_test, output_test = get_numpy_data(
    data=test, features=features, output=output
)

#### Normalize Data

In [10]:
# Normalise the training features; `norms` (one value per column) is kept so
# the other datasets can be scaled with the same factors.
(features_train_norm, norms) = normalize_features(features=features_train)

print(features_train_norm, norms, sep='\n')

[[0.01345102 0.00602491 0.         ... 0.01333931 0.00181386 0.        ]
 [0.01345102 0.01355605 0.         ... 0.01331201 0.00232495 0.        ]
 [0.01345102 0.00602491 0.         ... 0.0131892  0.00321036 0.        ]
 ...
 [0.01345102 0.01506227 0.         ... 0.01366682 0.00177726 0.        ]
 [0.01345102 0.01506227 0.         ... 0.01374187 0.0019336  0.        ]
 [0.01345102 0.01506227 0.         ... 0.01370776 0.00036309 0.        ]]
[7.43437960e+01 1.65977596e+02 6.32455532e+00 1.45365513e+05
 1.55531640e+05 5.75318173e+02 2.99361461e+04 2.57850732e+02
 9.08552952e+03 2.46533053e+06 1.67688695e+05 1.17240138e+02
 2.57906960e+02 3.53588887e+03 3.90863965e+04 1.46559338e+05
 3.11491224e+06 5.87962584e+01]


In [11]:
# Scale test and validation features by the *training* norms so that all
# three sets live on the same scale.
features_test_norm = np.divide(features_test, norms)
features_validation_norm = np.divide(features_validation, norms)

print(features_test_norm, features_validation_norm, sep='\n')

[[0.01345102 0.01807473 0.         ... 0.01350306 0.00160518 0.05102365]
 [0.01345102 0.00602491 0.         ... 0.0130732  0.00138046 0.        ]
 [0.01345102 0.01054359 0.         ... 0.01329154 0.00204821 0.        ]
 ...
 [0.01345102 0.01355605 0.         ... 0.01370776 0.00032457 0.        ]
 [0.01345102 0.02108718 0.         ... 0.01374187 0.00179042 0.        ]
 [0.01345102 0.01054359 0.         ... 0.01374187 0.00384216 0.        ]]
[[0.01345102 0.01054359 0.         ... 0.01328472 0.00159876 0.        ]
 [0.01345102 0.01506227 0.         ... 0.01361223 0.00202253 0.        ]
 [0.01345102 0.01054359 0.         ... 0.0131619  0.00160518 0.        ]
 ...
 [0.01345102 0.01506227 0.         ... 0.01370094 0.00032746 0.        ]
 [0.01345102 0.00903736 0.         ... 0.01370776 0.00056759 0.        ]
 [0.01345102 0.01204982 0.         ... 0.01374187 0.00036149 0.        ]]


#### Qn 1

In [12]:
# Show the query house (first test row) and the 10th training house.
print(features_test_norm[0], features_train_norm[9], sep='\n')

[ 0.01345102  0.01807473  0.          0.01362084  0.01375926  0.01564352
  0.          0.01551285 -0.01346922  0.0016225   0.01759212  0.017059
  0.0116321   0.01345387  0.02481682  0.01350306  0.00160518  0.05102365]
[ 0.01345102  0.00602491  0.          0.0096309   0.01195898  0.01390535
  0.          0.01163464 -0.01346251  0.00156612  0.0083488   0.01279425
  0.01938684  0.01346821  0.          0.01302544  0.00050756  0.        ]


In [13]:
# Euclidean distance between the first test row (the query house) and
# training row 9: square the feature-wise differences, sum, take the root.
np.sqrt(np.sum((features_train_norm[9] - features_test_norm[0]) ** 2))

0.05972359371398078

In [24]:
# Scratch check: sum of the signed feature differences between the last
# training row and the first test row. This is not a distance — nothing is
# squared, so positive and negative differences can cancel.

np.sum(features_train_norm[-1] - features_test_norm[0])

-0.09343399874654644

#### Qn 2

In [45]:
# Euclidean distance from the first test row to each of the first 10
# training rows, in row order.
dist = [
    np.sqrt(np.sum((features_train_norm[i] - features_test_norm[0]) ** 2))
    for i in range(10)
]

In [46]:
# Position (0-9) of the smallest distance, i.e. which of the first 10
# training rows is closest to the query.
np.argmin(dist)

8

#### Distance Function

In [48]:
def compute_distances(features_instances, features_query):
    """Return the Euclidean distance from a query point to every instance.

    Parameters
    ----------
    features_instances : indexable sequence of feature vectors
        Typically a 2-D numpy array, one row per instance.
    features_query : feature vector
        The point the distances are measured from.

    Returns
    -------
    list
        ``distances[i]`` is the distance between ``features_instances[i]``
        and ``features_query``; empty when there are no instances.
    """
    return [
        np.sqrt(np.sum((features_instances[row] - features_query) ** 2))
        for row in range(len(features_instances))
    ]

#### Qn 3

In [53]:
# Distances from the third test row (index 2) to every training row.
dist_qn3 = compute_distances(
    features_instances=features_train_norm,
    features_query=features_test_norm[2],
)

In [55]:
# Index of the training row with the smallest distance to the query.
np.argmin(dist_qn3)

382

#### Qn 4

In [56]:
# Target value of training row 382 — the index hard-coded here is the one
# reported by np.argmin(dist_qn3) in the previous cell's output.
output_train[382]

array([249000.])

#### Qn 5

In [57]:
def k_nearest_neighbors(k, feature_train, features_query):
    """Find the indices of the ``k`` training rows closest to a query point.

    Parameters
    ----------
    k : int
        Number of neighbours to return.
    feature_train : indexable sequence of feature vectors
        Typically a 2-D numpy array, one row per training instance.
    features_query : feature vector
        The point whose neighbours are wanted.

    Returns
    -------
    numpy.ndarray
        Row indices into ``feature_train`` ordered from nearest to farthest
        (Euclidean distance), truncated to the first ``k`` entries.
    """
    query_distances = [
        np.sqrt(np.sum((feature_train[row] - features_query) ** 2))
        for row in range(len(feature_train))
    ]

    # argsort yields row indices ordered by increasing distance.
    return np.argsort(query_distances)[:k]

In [58]:
# The 4 training rows nearest to the third test row.
k_nearest_neighbors(
    k=4,
    feature_train=features_train_norm,
    features_query=features_test_norm[2],
)

array([ 382, 1149, 4087, 3142], dtype=int64)

#### Qn 6

In [62]:
def predict_output_of_query(k, features_train, output_train, features_query):
    """Predict the output for one query point as the mean of its k nearest neighbours.

    Parameters
    ----------
    k : int
        Number of neighbours to average over; must be at least 1.
    features_train : indexable sequence of feature vectors
        Training features, one row per instance.
    output_train : indexable sequence
        Training outputs aligned with ``features_train``.
    features_query : feature vector
        The point to predict for.

    Returns
    -------
    Same kind of value as ``output_train[i]``
        The arithmetic mean of the neighbours' outputs.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, or if no neighbours are available
        (empty training set).
    """
    if k < 1:
        raise ValueError("k must be at least 1, got {!r}".format(k))

    k_neighbors = k_nearest_neighbors(k, features_train, features_query)

    # When k exceeds the number of training rows, fewer than k neighbours come
    # back; average over what was actually returned rather than dividing by k.
    neighbor_count = len(k_neighbors)
    if neighbor_count == 0:
        raise ValueError("cannot predict: no training instances available")

    total_price = sum(output_train[item] for item in k_neighbors)
    prediction = total_price / neighbor_count

    return prediction

In [63]:
# Average output of the 4 nearest training rows for the third test row.
predict_output_of_query(
    k=4,
    features_train=features_train_norm,
    output_train=output_train,
    features_query=features_test_norm[2],
)

array([413987.5])

#### Qn 7

In [130]:
def predict_output(k, features_train, output_train, features_query):
    """Predict outputs for several query points with k-nearest-neighbour averaging.

    Parameters
    ----------
    k : int
        Number of neighbours used for each prediction.
    features_train, output_train
        Training features and their aligned outputs.
    features_query : iterable of feature vectors
        One entry per point to predict.

    Returns
    -------
    list
        One prediction per query point, in the same order, each produced by
        ``predict_output_of_query``.
    """
    return [
        predict_output_of_query(k, features_train, output_train, query_point)
        for query_point in features_query
    ]

In [131]:
# 10-nearest-neighbour predictions for the first 10 test rows.
prediction_multi_query_point = predict_output(
    k=10,
    features_train=features_train_norm,
    output_train=output_train,
    features_query=features_test_norm[:10],
)

In [128]:
# Display the predictions computed for the first 10 test rows.
prediction_multi_query_point

[array([881300.]),
 array([431860.]),
 array([460595.]),
 array([430200.]),
 array([766750.]),
 array([667420.]),
 array([350032.]),
 array([512800.7]),
 array([484000.]),
 array([457235.])]

#### Qn 8

In [135]:
# Validation-set predictions for every k from 1 to 15; entry j corresponds
# to k = j + 1.
k_validation_pred = [
    predict_output(k, features_train_norm, output_train, features_validation_norm)
    for k in range(1, 16)
]

In [136]:
# Sanity check: one list of predictions per value of k that was tried.
len(k_validation_pred)

15

In [139]:
# Validation error for each k. Despite the variable name, each entry is a
# mean squared error (RSS divided by the number of rows); the ranking of k
# values is the same either way.
# mean_squared_error is documented as (y_true, y_pred), so the true outputs
# go first. The value is unchanged for MSE, but the call now matches the API.
k_mean_rss = [
    mean_squared_error(output_validation, item)
    for item in k_validation_pred
]

In [144]:
# k_mean_rss[j] is the validation error for k = j + 1 (k ran over
# range(1, 16)), so shift the argmin position by one to recover k itself.
best_k = np.argmin(k_mean_rss) + 1

best_k

8

In [145]:
# Predict every test row with the k that minimised the validation error.
test_pred = predict_output(
    k=best_k,
    features_train=features_train_norm,
    output_train=output_train,
    features_query=features_test_norm,
)

In [146]:
# Residual sum of squares on the test set: mean squared error multiplied by
# the number of test rows. Arguments follow the documented
# (y_true, y_pred) order of mean_squared_error.
test_rss = mean_squared_error(output_test, test_pred) * len(output_test)

# Reported in units of 1e14 for readability.
test_rss / 1e14

1.331188235515168