In [1]:
import os
import numpy as np
import pandas as pd

import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import model_from_json
from keras import regularizers

# Set the Keras backend's default float type to float32.
tf.keras.backend.set_floatx('float32')

In [2]:
# Read in data.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk; chunked inference is what produces the mixed-type
# DtypeWarning on this file.
df = pd.read_csv('/content/drive/MyDrive/AB_US_2020.csv', low_memory=False)
df.shape

  interactivity=interactivity, compiler=compiler, result=result)


(226030, 17)

In [3]:
# Inspect the available column names.
df.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'city'],
      dtype='object')

In [4]:
# Remove the columns that are not used as model inputs or as the target.
columns = [
    'id',
    'neighbourhood_group',
    'last_review',
    'reviews_per_month',
    'name',
    'host_name',
    'latitude',
    'longitude',
    'number_of_reviews',
    'host_id',
    'calculated_host_listings_count',
]

df = df.drop(columns=columns)
df.shape

(226030, 6)

In [5]:
# Display the frame that remains after dropping the unused columns.
df

Unnamed: 0,neighbourhood,room_type,price,minimum_nights,availability_365,city
0,28804,Private room,60,1,0,Asheville
1,28801,Entire home/apt,470,1,288,Asheville
2,28801,Entire home/apt,75,30,298,Asheville
3,28806,Entire home/apt,90,1,0,Asheville
4,28801,Private room,125,30,0,Asheville
...,...,...,...,...,...,...
226025,"Downtown, Chinatown, Penn Quarters, Mount Vern...",Entire home/apt,104,1,99,Washington D.C.
226026,"Brookland, Brentwood, Langdon",Entire home/apt,151,2,300,Washington D.C.
226027,"Shaw, Logan Circle",Entire home/apt,240,2,173,Washington D.C.
226028,"Kalorama Heights, Adams Morgan, Lanier Heights",Entire home/apt,60,21,362,Washington D.C.


In [6]:
# Names of the categorical feature columns (they are encoded in a later cell).
cat_cols = ['neighbourhood', 'room_type', 'city']

# Cast the price to float, then keep only listings priced at 501 or less.
df = df.assign(price=df['price'].astype(float))
df = df.loc[df['price'].le(501)]

df.shape

(212300, 6)

In [7]:
# Count missing values per column.
df.isnull().sum()

neighbourhood       0
room_type           0
price               0
minimum_nights      0
availability_365    0
city                0
dtype: int64

In [35]:
# Collect every neighbourhood value per city (duplicates kept) and export the table.
df1 = (
    df.groupby('city')['neighbourhood']
    .apply(list)
    .reset_index(name='neighborhood_list')
)
df1.to_csv('city_neighborhood.csv')

In [8]:
# Preprocessors: the scaler is fitted in a later cell, the encoder right here.
MMS = MinMaxScaler()
enc = OrdinalEncoder()

# Fit the ordinal encoder on the categorical columns and encode them in one step.
dfcat = enc.fit_transform(df[cat_cols])

In [9]:
# Inspect the ordinal-encoded categorical columns.
dfcat

array([[   5.,    2.,    0.],
       [   3.,    0.,    0.],
       [   3.,    0.,    0.],
       ...,
       [1158.,    0.,   27.],
       [ 675.,    0.,   27.],
       [ 448.,    0.,   27.]])

In [10]:
# Numeric features as a NumPy array, in the order (availability_365, minimum_nights).
numeric_cols = ['availability_365', 'minimum_nights']
dfnum = df[numeric_cols].to_numpy()

In [11]:
# Inspect the numeric feature columns.
dfnum

array([[  0,   1],
       [288,   1],
       [298,  30],
       ...,
       [173,   2],
       [362,  21],
       [ 62,   7]])

In [12]:
# Stack the encoded categorical columns and the numeric columns side by side.
df_joined = np.concatenate((dfcat, dfnum), axis=1)
df_joined

array([[5.000e+00, 2.000e+00, 0.000e+00, 0.000e+00, 1.000e+00],
       [3.000e+00, 0.000e+00, 0.000e+00, 2.880e+02, 1.000e+00],
       [3.000e+00, 0.000e+00, 0.000e+00, 2.980e+02, 3.000e+01],
       ...,
       [1.158e+03, 0.000e+00, 2.700e+01, 1.730e+02, 2.000e+00],
       [6.750e+02, 0.000e+00, 2.700e+01, 3.620e+02, 2.100e+01],
       [4.480e+02, 0.000e+00, 2.700e+01, 6.200e+01, 7.000e+00]])

In [13]:
# Fit the min-max scaler on the joined feature array and scale it.
# NOTE(review): the scaler is fitted before the train/test split, so rows that
# end up in the test set influence the fitted min/max.
# NOTE(review): in the displayed df_scaled output a 30-night minimum scales to
# about 2.9e-07, which suggests an extreme minimum_nights outlier is squashing
# that feature towards zero -- consider capping it before scaling.
df_scaled = MMS.fit_transform(df_joined)

In [14]:
# Inspect the min-max scaled feature array.
df_scaled

array([[3.45542502e-03, 6.66666667e-01, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00],
       [2.07325501e-03, 0.00000000e+00, 0.00000000e+00, 7.89041096e-01,
        0.00000000e+00],
       [2.07325501e-03, 0.00000000e+00, 0.00000000e+00, 8.16438356e-01,
        2.90000003e-07],
       ...,
       [8.00276434e-01, 0.00000000e+00, 1.00000000e+00, 4.73972603e-01,
        1.00000001e-08],
       [4.66482377e-01, 0.00000000e+00, 1.00000000e+00, 9.91780822e-01,
        2.00000002e-07],
       [3.09606082e-01, 0.00000000e+00, 1.00000000e+00, 1.69863014e-01,
        6.00000006e-08]])

In [15]:
# Split the data into model inputs (scaled features) and the price target.
target = ['price']
df_low = df.drop(columns=target)

X, Y = df_scaled, df[target].values

print(X.shape, Y.shape)

(212300, 5) (212300, 1)


In [16]:
# Inspect the feature matrix that is passed to the train/test split.
X

array([[3.45542502e-03, 6.66666667e-01, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00],
       [2.07325501e-03, 0.00000000e+00, 0.00000000e+00, 7.89041096e-01,
        0.00000000e+00],
       [2.07325501e-03, 0.00000000e+00, 0.00000000e+00, 8.16438356e-01,
        2.90000003e-07],
       ...,
       [8.00276434e-01, 0.00000000e+00, 1.00000000e+00, 4.73972603e-01,
        1.00000001e-08],
       [4.66482377e-01, 0.00000000e+00, 1.00000000e+00, 9.91780822e-01,
        2.00000002e-07],
       [3.09606082e-01, 0.00000000e+00, 1.00000000e+00, 1.69863014e-01,
        6.00000006e-08]])

In [17]:
# Hold out 20% of the rows as a test set (shuffled, fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# Shapes of the feature splits, then of the target splits.
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(169840, 5) (42460, 5)
(169840, 1) (42460, 1)


In [18]:
# Training callbacks and extra metric.

# ReduceLROnPlateau: multiply the learning rate by 0.2 after 2 epochs without
# val_loss improvement, then wait 3 epochs before monitoring again.
# min_lr has to sit below the optimizer's starting rate: the model is compiled
# with 'nadam', whose Keras default learning rate is 0.001, so a floor of 0.001
# would leave no room for any reduction and the callback would never fire.
learn_rate_reducer = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=2, factor=0.2, min_lr=1e-6, cooldown=3)

# EarlyStopping: stop after 6 epochs without val_loss improvement and restore
# the weights from the best epoch. monitor='val_loss' is the Keras default,
# spelled out here so both callbacks visibly watch the same quantity.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=6, restore_best_weights=True)

callbacks = [learn_rate_reducer, early_stop]

# Reported next to the MSE loss so the error is shown in the target's own units.
rmse = tf.keras.metrics.RootMeanSquaredError()

In [76]:
# Model: fully connected regression network with weight penalties and dropout.
# NOTE(review): the 512-unit layer uses a 10x larger L2 factor (0.05) than the
# other layers, and the 64-unit layer uses L1 instead of L2 -- confirm both are
# intentional.
nn = Sequential([
    Dense(128, input_shape=(X_train.shape[1],), kernel_regularizer=regularizers.l2(0.005), activation='relu'),
    Dropout(0.25),
    Dense(256, kernel_regularizer=regularizers.l2(0.005), activation='relu'),
    Dense(512, kernel_regularizer=regularizers.l2(0.05), activation='relu'),
    Dropout(0.25),
    Dense(128, kernel_regularizer=regularizers.l2(0.005), activation='relu'),
    Dropout(0.25),
    Dense(64, kernel_regularizer=regularizers.l1(0.005), activation='relu'),
    # Single linear unit: the network outputs one unbounded value per row.
    Dense(1, activation='linear'),
])

nn.compile(loss='mean_squared_error',
           optimizer='nadam',
           metrics=['mean_absolute_error', rmse])

# Model summary: summary() prints the table itself and returns None, so wrapping
# it in print() would only add a stray "None" line to the output.
nn.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_54 (Dense)             (None, 128)               768       
_________________________________________________________________
dropout_27 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_55 (Dense)             (None, 256)               33024     
_________________________________________________________________
dense_56 (Dense)             (None, 512)               131584    
_________________________________________________________________
dropout_28 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_57 (Dense)             (None, 128)               65664     
_________________________________________________________________
dropout_29 (Dropout)         (None, 128)              

In [77]:
# Train for at most 40 epochs in batches of 256, with the callbacks list attached.
# NOTE(review): the test split is passed as validation data, so any callback
# that monitors val_loss is driven by the test rows; they are then no longer an
# unbiased hold-out set. Consider carving a separate validation split out of
# the training data.
history = nn.fit(X_train,
                 y_train,
                 epochs=40,
                 batch_size=256,
                 validation_data=(X_test, y_test),
                 callbacks=callbacks)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40


In [78]:
# Persist the trained network and the fitted preprocessors to disk.
import joblib

np.random.seed(12)

# Architecture goes to JSON, weights to HDF5.
model_json = nn.to_json()
with open("nn.json", "w") as json_file:
    json_file.write(model_json)
nn.save_weights('nn.h5')

# The fitted scaler and encoder are saved too, so inputs can be transformed
# the same way at prediction time.
joblib.dump(MMS, 'MMS.gz')
joblib.dump(enc, 'encoder.gz')


['encoder.gz']

In [79]:
# Load Serialized Model to Reuse:

# Read the architecture inside a `with` block so the file handle is closed even
# if read() raises.
with open('nn.json', 'r') as json_file:
    loaded_model_json = json_file.read()

# Rebuild the network from its JSON description, then restore the saved weights.
loaded_model = model_from_json(loaded_model_json)
loaded_model.load_weights('nn.h5')

In [53]:
# # Test on prediction:

# pred = np.array([[0,2,2,300,20]])
# prediction = data_pipeline.transform(pred)
# # transformed_prediction = MMS.transform(encoded_prediction)

In [82]:
# # Original Model:

# nn.predict(transformed_prediction)[0][0]

In [54]:
# # Loaded Model w/ saved weights:

# loaded_model.predict(transformed_prediction)[0][0]


In [80]:
import joblib

def predict_opt_price(neighborhood, room_type, city, availability_365, minimum_nights,
                      model_json_path='nn.json', weights_path='nn.h5',
                      scaler_path='MMS.gz', encoder_path='encoder.gz'):
    """Predict a price for a single listing using the saved model artifacts.

    The model, scaler and encoder are re-loaded from disk on every call.

    Args:
        neighborhood: Neighbourhood label of the listing.
        room_type: Room type label of the listing.
        city: City label of the listing.
        availability_365: Value for the ``availability_365`` feature.
        minimum_nights: Value for the ``minimum_nights`` feature.
        model_json_path: Path to the JSON file holding the model architecture.
        weights_path: Path to the saved model weights.
        scaler_path: Path to the joblib dump of the fitted scaler.
        encoder_path: Path to the joblib dump of the fitted encoder.

    Returns:
        The first element of the model's prediction for the single input row.

    Raises:
        Whatever the loaded encoder raises for a category it has not seen
        (presumably ValueError with scikit-learn's default settings -- confirm).
    """
    # Rebuild the network; the `with` block closes the file even if read() raises.
    with open(model_json_path, 'r') as json_file:
        loaded_model_json = json_file.read()

    loaded_model = model_from_json(loaded_model_json)
    loaded_model.load_weights(weights_path)

    # Load the fitted transformations.
    # NOTE: joblib.load unpickles the file, so only point these paths at
    # artifacts you created yourself or otherwise trust.
    MMS = joblib.load(scaler_path)
    enc = joblib.load(encoder_path)

    # One row each; column order follows the call order used below:
    # (neighborhood, room_type, city) then (availability_365, minimum_nights).
    # np.array converts the three categorical values to a common dtype.
    catcols = np.array([[neighborhood, room_type, city]])
    numcols = np.array([[availability_365, minimum_nights]])

    encoded = enc.transform(catcols)
    array_joined = np.append(encoded, numcols, axis=1)
    transformed = MMS.transform(array_joined)

    # predict() returns a 2-D array; take the single value for the single row.
    prediction = loaded_model.predict(transformed)[0][0]

    return prediction

In [81]:
# Example call: price prediction for one listing, with every argument passed by keyword.
predict_opt_price(neighborhood='Glocester', room_type='Entire home/apt', minimum_nights=2, availability_365=300, city='Rhode Island')

174.25793

In [46]:
# enc.categories_

[array(['28704', '28715', '28732', ..., 'Wright', 'Yesler Terrace',
        'Yosemite Dr'], dtype=object),
 array(['Entire home/apt', 'Hotel room', 'Private room', 'Shared room'],
       dtype=object),
 array(['Asheville', 'Austin', 'Boston', 'Broward County', 'Cambridge',
        'Chicago', 'Clark County', 'Columbus', 'Denver', 'Hawaii',
        'Jersey City', 'Los Angeles', 'Nashville', 'New Orleans',
        'New York City', 'Oakland', 'Pacific Grove', 'Portland',
        'Rhode Island', 'Salem', 'San Clara Country', 'San Diego',
        'San Francisco', 'San Mateo County', 'Santa Cruz County',
        'Seattle', 'Twin Cities MSA', 'Washington D.C.'], dtype=object)]

In [48]:
# Show the per-city neighbourhood lists.
df1

Unnamed: 0,city,neighborhood_list
0,Asheville,"[28804, 28801, 28801, 28806, 28801, 28804, 288..."
1,Austin,"[78702, 78702, 78702, 78704, 78729, 78704, 787..."
2,Boston,"[East Boston, Roxbury, Roxbury, Downtown, Back..."
3,Broward County,"[Fort Lauderdale, Hollywood, Hallandale Beach,..."
4,Cambridge,"[West Cambridge, North Cambridge, North Cambri..."
5,Chicago,"[Hyde Park, South Lawndale, West Town, Lincoln..."
6,Clark County,"[Unincorporated Areas, Unincorporated Areas, U..."
7,Columbus,"[Near North/University, Near North/University,..."
8,Denver,"[Virginia Village, Highland, Five Points, Nort..."
9,Hawaii,"[Hamakua, South Kohala, South Kona, North Kona..."
