In [24]:
import pandas as pd
import numpy as np
dc_listings = pd.read_csv("dc_airbnb.csv")

In [28]:
dc_listings["price"] = dc_listings["price"].str.replace("$","").str.replace(",","").astype(float)
dc_listings = dc_listings.drop(["cleaning_fee","security_deposit"], axis = 1)
cols = ["host_response_rate","host_acceptance_rate", 
        "room_type","latitude","longitude","city","zipcode","state"]
dc_listings = dc_listings.drop(cols, axis = 1)
dc_listings.dropna(axis = 0, inplace = True)

In [29]:
dc_listings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3671 entries, 0 to 3722
Data columns (total 9 columns):
host_listings_count    3671 non-null int64
accommodates           3671 non-null int64
bedrooms               3671 non-null float64
bathrooms              3671 non-null float64
beds                   3671 non-null float64
price                  3671 non-null float64
minimum_nights         3671 non-null int64
maximum_nights         3671 non-null int64
number_of_reviews      3671 non-null int64
dtypes: float64(4), int64(5)
memory usage: 286.8 KB


In [31]:
normalize_listing = (dc_listings -dc_listings.mean())/dc_listings.std()

In [34]:
normalize_listing["price"] = dc_listings["price"]
normalize_listing.isnull().sum()

host_listings_count    0
accommodates           0
bedrooms               0
bathrooms              0
beds                   0
price                  0
minimum_nights         0
maximum_nights         0
number_of_reviews      0
dtype: int64

# Holdout Validation

In [35]:
total_rows = normalize_listing.shape[0]

In [36]:
split_one = normalize_listing[:int(total_rows*.5)]
split_two = normalize_listing[int(total_rows*.5):]


In [37]:
features = ["accommodates","bedrooms","bathrooms","beds"]
target = "price"

In [43]:

from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

knn = KNeighborsRegressor(algorithm='brute')
knn.fit(split_one[features],split_one[target])
predictions = knn.predict(split_two[features])

print(predictions)
mse = mean_squared_error(split_two[target], predictions)
rmse1 = np.sqrt(mse)
rmse1

[206.6 107.4 116.  ... 198.2 116.  141.8]


123.34966187617752

In [44]:
knn = KNeighborsRegressor(algorithm='brute')
knn.fit(split_two[features],split_two[target])
predictions = knn.predict(split_one[features])



mse = mean_squared_error(split_one[target], predictions)
rmse2 = np.sqrt(mse)
rmse2

123.184893355288

In [45]:
mean_rmse = np.mean([rmse1,rmse2])
mean_rmse

123.26727761573275

In [46]:
total_rows/5

734

In [47]:
total_rows

3671

0-734

734 - 1468

1468 - 2202

2202 - 2936

2936 :

In [48]:
normalize_listing.loc[:734,"fold"] = 1
normalize_listing.loc[734:1468,"fold"] = 2
normalize_listing.loc[1468:2202,"fold"] = 3
normalize_listing.loc[2202:2936,"fold"] = 4
normalize_listing.loc[2936:,"fold"] = 5

In [49]:
normalize_listing["fold"].value_counts()

5.0    784
4.0    728
3.0    721
1.0    720
2.0    718
Name: fold, dtype: int64

In [50]:
features = ["accommodates","bedrooms","bathrooms","beds"]
target = "price"

In [51]:
folds = [1,2,3,4,5]

rmses = []
for i in folds:
    train = normalize_listing[normalize_listing["fold"] != i]
    test = normalize_listing[normalize_listing["fold"] == i]
    knn = KNeighborsRegressor(algorithm='brute')
    knn.fit(train[features],train[target])
    predictions = knn.predict(test[features])
    mse = mean_squared_error(test[target], predictions)
    rmse = np.sqrt(mse)
    rmses.append(rmse)

In [53]:
mean_rmse = np.mean(rmses)
mean_rmse

116.9745639445794

In [54]:
from sklearn.model_selection import KFold, cross_val_score

In [55]:
features = ["accommodates","bedrooms","bathrooms","beds"]
target = "price"
kf = KFold(n_splits=5, shuffle = True, random_state=1)
knn = KNeighborsRegressor()
mse = cross_val_score(knn, normalize_listing[features],normalize_listing[target],
                     scoring = "neg_mean_squared_error",cv = kf)
mse = np.abs(mse)
rmse = np.sqrt(mse)
mean_rmse = np.mean(rmse)
mean_rmse
mse

array([13286.19847619, 10691.00239782, 18208.94485014, 13772.38599455,
       19043.51514986])

In [None]:
rmses = []
for train,test in kf.split(normalize_listing.index):
    train_data = normalize_listing.loc[train]
    test_data = normalize_listing.loc[test]
    knn = KNeighborsRegressor(algorithm='brute')
    knn.fit(train_data[features],train_data[target])
    predictions = knn.predict(test_data[features])
    mse = mean_squared_error(test_data[target], predictions)
    rmse = np.sqrt(mse)
    rmses.append(rmse)

In [None]:
fold = [2,3,5,10,15]

features = ["accommodates","bedrooms","bathrooms","beds"]
target = "price"

fold_mrmse = {}
for f in fold:
    kf = KFold(n_splits=f, shuffle = True, random_state=1)
    knn = KNeighborsRegressor()
    mse = cross_val_score(knn, normalize_listing[features],normalize_listing[target],
                         scoring = "neg_mean_squared_error",cv = kf)
    mse = np.abs(mse)
    rmse = np.sqrt(mse)
    mean_rmse = np.mean(rmse)
    fold_mrmse[f] = mean_rmse

In [None]:
fold_mrmse