In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, HuberRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import LinearSVR
from sklearn.metrics import r2_score, make_scorer, mean_squared_error, median_absolute_error, mean_absolute_error
from xgboost import XGBRegressor

In [2]:
# DATA CLEANING


# Load the dataset
dataset = pd.read_csv('data/AB_NYC_2019.csv')

# Drop unwanted columns
dataset.drop('id', axis=1, inplace=True)
dataset.drop('name', axis=1, inplace=True)
dataset.drop('host_id', axis=1, inplace=True)
dataset.drop('host_name', axis=1, inplace=True)
dataset.drop('number_of_reviews', axis=1, inplace=True)
dataset.drop('last_review', axis=1, inplace=True)
dataset.drop('reviews_per_month', axis=1, inplace=True)
dataset.drop('latitude', axis=1, inplace=True)
dataset.drop('longitude', axis=1, inplace=True)

# Remove price = 0
dataset = dataset[dataset.price > 0].copy()

# Standardize ratio attributes
dataset[['minimum_nights', 'calculated_host_listings_count', 'availability_365']] = StandardScaler().fit_transform(dataset[['minimum_nights', 'calculated_host_listings_count', 'availability_365']])

# Log price
dataset['log_price'] = np.log1p(dataset['price'])

# One hot encoding for the categorical features
room_type_dummies = pd.get_dummies(dataset['room_type'], prefix="is_room")
neighbourhood_dummies = pd.get_dummies(dataset['neighbourhood'], prefix="is_neighbourhood")
preprocessed_dataset = pd.concat([dataset[['minimum_nights','calculated_host_listings_count',
       'availability_365', 'neighbourhood_group']], room_type_dummies, neighbourhood_dummies, dataset[['price', 'log_price']]], axis=1)

# Dataset division neighbourhood group:['Brooklyn', 'Manhattan', 'Queens', 'Staten Island', 'Bronx']
datasets = {}
datasets['brooklyn'] = preprocessed_dataset[preprocessed_dataset.neighbourhood_group == 'Brooklyn'].copy().drop('neighbourhood_group', axis=1)[preprocessed_dataset.price < 300].copy()
datasets['manhattan'] = preprocessed_dataset[preprocessed_dataset.neighbourhood_group == 'Manhattan'].copy().drop('neighbourhood_group', axis=1)[preprocessed_dataset.price < 400].copy()
datasets['queens'] = preprocessed_dataset[preprocessed_dataset.neighbourhood_group == 'Queens'].copy().drop('neighbourhood_group', axis=1)[preprocessed_dataset.price < 120].copy()
datasets['staten_island'] = preprocessed_dataset[preprocessed_dataset.neighbourhood_group == 'Staten Island'].copy().drop('neighbourhood_group', axis=1)[preprocessed_dataset.price < 120].copy()
datasets['bronx'] = preprocessed_dataset[preprocessed_dataset.neighbourhood_group == 'Bronx'].copy().drop('neighbourhood_group', axis=1)[preprocessed_dataset.price < 100].copy()

# # plots
# sb.distplot(preprocessed_dataset['price'], color='blue')
# plt.title('Skewed price')
# plt.show()

# sb.distplot(preprocessed_dataset['log_price'], color='green')
# plt.show()



In [3]:
# DIMENSIONALILTY REDUCTION

sets = {}

for name, preprocessed_dataset in datasets.items():
    print(name)
    # Principle Component Analysis
    X = preprocessed_dataset.copy().drop('price', axis=1).drop('log_price', axis=1)
    y = preprocessed_dataset['log_price']

    print(X.shape, y.shape)
    pca = PCA()
    pca.fit(X)

    cutoff_variance = 0.98 # 0.95
    cutoff = 0
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
    for i in range(len(cumulative_variance)):
        if cumulative_variance[i] > cutoff_variance:
            cutoff = i
            break

    print('cutoff =', cutoff)

    pca = PCA(n_components=cutoff)
    pcaX = pca.fit_transform(X)
    pcaX.shape

    # get the training and testing sets (PCA)
    sets[name] = {}
    X_train, X_test, y_train, y_test = train_test_split(pcaX, y, test_size=0.25, random_state=101)
    sets[name]['X_train'] = X_train
    sets[name]['X_test'] = X_test
    sets[name]['y_train'] = y_train
    sets[name]['y_test'] = y_test
    print(sets.keys())
#     print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

manhattan
(20208, 227) (20208,)
cutoff = 20
dict_keys(['manhattan'])
queens
(4350, 227) (4350,)
cutoff = 32
dict_keys(['manhattan', 'queens'])
staten_island
(290, 227) (290,)
cutoff = 30
dict_keys(['manhattan', 'queens', 'staten_island'])
bronx
(822, 227) (822,)
cutoff = 39
dict_keys(['manhattan', 'queens', 'bronx', 'staten_island'])
brooklyn
(19182, 227) (19182,)
cutoff = 28
dict_keys(['manhattan', 'queens', 'bronx', 'brooklyn', 'staten_island'])


In [4]:
results = {}
train_predictions = {}
test_predictions = {}
for room in sets:
    rows = []
    train_preds = []
    test_preds = []

    models = [XGBRegressor(objective='reg:squarederror', subsample=1.0, colsample_bytree=1.0, eta=0.05,eval_metric='rmse',max_depth=7,min_child_weight=7)]
    # models = [XGBRegressor(objective='reg:squarederror'), LinearRegression(), Ridge(), Lasso(alpha=1e-4), RandomForestRegressor()]
    for model in models:

        model.fit(sets[room]['X_train'], sets[room]['y_train'])

        y_train_pred = model.predict(sets[room]['X_train'])
        train_preds.append(y_train_pred)

        y_pred = model.predict(sets[room]['X_test'])
        test_preds.append(y_pred)

        rows.append([np.sqrt(mean_squared_error(np.expm1(sets[room]['y_test']), np.expm1(y_pred))),
                   r2_score(sets[room]['y_test'], y_pred),
                   median_absolute_error(np.expm1(sets[room]['y_test']), np.expm1(y_pred)),
                   mean_absolute_error(np.expm1(sets[room]['y_test']), np.expm1(y_pred)),
                   np.sqrt(mean_squared_error(np.expm1(sets[room]['y_train']), np.expm1(y_train_pred))),
                   r2_score(sets[room]['y_train'], y_train_pred),
                   median_absolute_error(np.expm1(sets[room]['y_train']), np.expm1(y_train_pred)),
                   mean_absolute_error(np.expm1(sets[room]['y_train']), np.expm1(y_train_pred))])
    results[room] = rows
    train_predictions[room] = train_preds
    test_predictions[room] = test_preds
    print(room, 'done')

  if getattr(data, 'base', None) is not None and \


manhattan done


  if getattr(data, 'base', None) is not None and \


queens done


  if getattr(data, 'base', None) is not None and \


bronx done


  if getattr(data, 'base', None) is not None and \


brooklyn done
staten_island done


  if getattr(data, 'base', None) is not None and \


In [8]:
print(np.expm1(sets['bronx']['y_test'].max()), np.expm1(sets['bronx']['y_test'].min()))
print(np.expm1(test_predictions['bronx'][0].max()), np.expm1(test_predictions['bronx'][0].min()))

pd.DataFrame(results['bronx'], columns=['Test RMSE', 'Test R2', 'Test MedianAE', 'Test MeanAE', 'Train RMSE', 'Train R2', 'Train MedianAE', 'Train MeanAE'])


99.00000000000004 10.000000000000002
96.32472 19.341097


Unnamed: 0,Test RMSE,Test R2,Test MedianAE,Test MeanAE,Train RMSE,Train R2,Train MedianAE,Train MeanAE
0,15.583014,0.35182,10.789948,12.425139,6.169827,0.909736,3.109474,4.29947


In [9]:
print(np.expm1(sets['manhattan']['y_test'].max()), np.expm1(sets['manhattan']['y_test'].min()))
print(np.expm1(test_predictions['manhattan'][0].max()), np.expm1(test_predictions['manhattan'][0].min()))

pd.DataFrame(results['manhattan'], columns=['Test RMSE', 'Test R2', 'Test MedianAE', 'Test MeanAE', 'Train RMSE', 'Train R2', 'Train MedianAE', 'Train MeanAE'])

398.9999999999999 10.000000000000002
322.3077 34.92592


Unnamed: 0,Test RMSE,Test R2,Test MedianAE,Test MeanAE,Train RMSE,Train R2,Train MedianAE,Train MeanAE
0,58.268342,0.564014,27.787445,40.963099,53.120491,0.660342,24.161488,36.800979


In [10]:
print(np.expm1(sets['brooklyn']['y_test'].max()), np.expm1(sets['brooklyn']['y_test'].min()))
print(np.expm1(test_predictions['brooklyn'][0].max()), np.expm1(test_predictions['brooklyn'][0].min()))

pd.DataFrame(results['brooklyn'], columns=['Test RMSE', 'Test R2', 'Test MedianAE', 'Test MeanAE', 'Train RMSE', 'Train R2', 'Train MedianAE', 'Train MeanAE'])

298.99999999999994 10.000000000000002
212.58353 25.328117


Unnamed: 0,Test RMSE,Test R2,Test MedianAE,Test MeanAE,Train RMSE,Train R2,Train MedianAE,Train MeanAE
0,40.810858,0.580073,18.441288,27.970079,36.1704,0.669295,16.197704,24.743869


In [11]:
print(np.expm1(sets['queens']['y_test'].max()), np.expm1(sets['queens']['y_test'].min()))
print(np.expm1(test_predictions['queens'][0].max()), np.expm1(test_predictions['queens'][0].min()))

pd.DataFrame(results['queens'], columns=['Test RMSE', 'Test R2', 'Test MedianAE', 'Test MeanAE', 'Train RMSE', 'Train R2', 'Train MedianAE', 'Train MeanAE'])

118.99999999999997 10.000000000000002
102.24492 25.70593


Unnamed: 0,Test RMSE,Test R2,Test MedianAE,Test MeanAE,Train RMSE,Train R2,Train MedianAE,Train MeanAE
0,18.836538,0.293638,11.450283,14.506634,13.581437,0.673228,8.143417,10.306921


In [12]:
print(np.expm1(sets['staten_island']['y_test'].max()), np.expm1(sets['staten_island']['y_test'].min()))
print(np.expm1(test_predictions['staten_island'][0].max()), np.expm1(test_predictions['staten_island'][0].min()))

pd.DataFrame(results['staten_island'], columns=['Test RMSE', 'Test R2', 'Test MedianAE', 'Test MeanAE', 'Train RMSE', 'Train R2', 'Train MedianAE', 'Train MeanAE'])

114.99999999999999 20.0
106.97508 25.987278


Unnamed: 0,Test RMSE,Test R2,Test MedianAE,Test MeanAE,Train RMSE,Train R2,Train MedianAE,Train MeanAE
0,19.482937,0.36408,12.763657,14.507064,4.238535,0.977062,1.601185,2.632404
