In [30]:
!pip install category_encoders



In [31]:
import pandas as pd
import numpy as np
import os
import datetime
import tensorflow as tf
import category_encoders as ce
import math
import pickle

from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [29]:
# Load the raw training data. The path sits under /content/drive, presumably a
# mounted Google Drive folder -- confirm the mount exists before running.
df = pd.read_csv(filepath_or_buffer="/content/drive/My Drive/train.csv")

In [32]:
def extract_zipcode(X):
  """Return the leading five characters of ``X``.

  Inputs shorter than five characters are returned unchanged.
  """
  leading_five = slice(0, 5)
  return X[leading_five]

In [33]:
def enumerate_amenities(X):
  """Count the comma-separated entries in ``X["amenities"]``.

  Args:
    X: A mapping-like row (e.g. a DataFrame row) with an "amenities" field.

  Returns:
    int: Number of comma-separated entries. An empty listing (``""`` or
    ``"{}"``) and non-string values such as NaN count as 0.
  """
  raw = X["amenities"]

  # Missing values (NaN/None) have no ``split``; treat them as "no amenities".
  if not isinstance(raw, str):
    return 0

  # Surrounding braces are dropped so an empty set like "{}" is not mistaken
  # for a single amenity. NOTE(review): assumes a "{a,b,c}" style layout --
  # confirm against the raw CSV.
  listed = raw.strip().strip("{}").strip()
  if not listed:
    return 0

  return len(listed.split(","))

In [6]:
def wrangle(X):
  """
  Wrangles and cleans dataframe, returning an all-float copy.

  Steps, in order:
    1. Replace the "amenities" text with a per-row amenity count.
    2. Keep only the target ("log_price") and the modelling columns.
    3. Collapse "property_type" to a few categories plus "Other".
    4. Coerce "zipcode" to a number built from at most its first five
       characters, filling unparseable values with the column median.
    5. Fill missing "bathrooms", "bedrooms" and "beds" with their medians.
    6. Ordinal-encode with ``ce.OrdinalEncoder`` and cast everything to float.

  Args:
    X: pandas DataFrame expected to hold the columns named in the filter
       below.

  Returns:
    A new pandas DataFrame of floats; the input frame is not modified.
  """

  # Work on a copy: keeps the caller's frame intact and prevents the
  # Setting With Copy warning.
  X = X.copy()

  # Convert X["amenities"] from a delimited string to the count of amenities.
  X["amenities"] = X.apply(enumerate_amenities, axis=1)

  # Keep only the target and the feature columns.
  X = X.filter(["log_price", "property_type", "amenities", "room_type",
                "accommodates", "bathrooms", "cancellation_policy",
                "cleaning_fee", "instant_bookable", "zipcode", "bedrooms",
                "beds"], axis=1)

  # Reduce property_type to a manageable number of options for ordinal
  # encoding: rows matching one of the kept keywords retain their label,
  # everything else (including missing values) becomes "Other".
  X["property_type"] = X["property_type"].fillna("Other")
  kept_types = ("Apartment", "House", "Loft", "Hostel", "Condominium",
                "Townhouse")
  is_kept = X["property_type"].str.contains("|".join(kept_types))
  # "Earth House" contains "House" but is deliberately sent to "Other".
  is_earth_house = X["property_type"].str.contains("Earth House")
  X.loc[is_earth_house | ~is_kept, "property_type"] = "Other"

  # Cleaning zipcode column: patch two known malformed entries, then coerce
  # to numeric. NOTE(review): errors="coerce" turns every other non-numeric
  # string into NaN, which is then filled with the median -- confirm that is
  # acceptable for values that still contain a usable zip code.
  X["zipcode"] = X["zipcode"].replace({"Near 91304": 91304, "1m": 10023})
  X["zipcode"] = pd.to_numeric(X["zipcode"], errors="coerce")
  X["zipcode"] = X["zipcode"].fillna(X["zipcode"].median())

  # Truncate to the first five characters of the float's text form. Values
  # with fewer than five digits keep part of the ".0" suffix (e.g. "2138."),
  # which float() parses correctly, so no extra "." stripping is needed.
  # (The earlier Series.replace(".", "") only matched cells equal to "."
  # and never changed anything.)
  X["zipcode"] = X["zipcode"].astype(str).apply(extract_zipcode).astype(float)

  # Replacing NaN values with median
  for column in ("bathrooms", "bedrooms", "beds"):
    X[column] = X[column].fillna(X[column].median())

  # Encoding categorical variables.
  # NOTE(review): the encoder is fitted here and then discarded, so the
  # category-to-number mapping cannot be re-applied to new data later.
  X = ce.OrdinalEncoder().fit_transform(X)

  # Converting data to floats for seamless entry into neural network
  return X.astype(float)

In [7]:
# Run the cleaning pipeline on the loaded data.
# NOTE: the result replaces the raw frame under the same name, so this cell
# should be executed only once per fresh load of df.
df = df.pipe(wrangle)

In [8]:
# Hold out 20% of the wrangled rows as a test set; random_state is fixed so
# the partition is the same on every run.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=7,
)

In [9]:
# Separate each split into its feature matrix (X) and target vector (y)

target = "log_price"

X_train, y_train = train.drop(columns=target), train[target]
X_test, y_test = test.drop(columns=target), test[target]

In [None]:
# Building neural network architecture

# Fix TensorFlow's random seed so weight initialisation starts from the same
# state on every run.
tf.random.set_seed(7)

# Size the input layer from the feature matrix instead of hard-coding the
# column count, so the network keeps matching whatever wrangle() emits.
n_features = X_train.shape[1]

# Fully connected regressor: progressively narrower hidden layers feeding a
# single linear output unit that predicts log_price.
model = Sequential([
    Dense(512, input_dim=n_features, activation='relu'),
    Dense(256, activation="relu"),
    Dense(128, activation="relu"),
    Dense(64, activation='swish'),
    Dense(32, activation='swish'),
    Dense(16, activation='swish'),
    Dense(1, activation='linear')
    ])

model.compile(loss='mean_squared_error', optimizer='adam')

In [13]:
# Train the network for 50 epochs. The test split doubles as the validation
# set here, so the per-epoch validation loss is computed on it.
model.fit(X_train,
          y_train,
          epochs=50,
          validation_data=(X_test, y_test))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7fb65c562320>

In [11]:
def predict_price(X):
  """
  Uses model to predict price based on inputted features

  The model is trained on log_price, so its output is exponentiated to get
  back to a price.

  Args:
    X: Feature rows in the layout the model was trained on.

  Returns:
    float when the model yields a single prediction (same as before);
    a 1-D numpy array of prices when it yields several.
  """
  log_prices = np.asarray(model.predict(X), dtype=np.float64)

  # Single prediction: extract the scalar explicitly rather than handing an
  # array to math.exp, which only works through implicit (deprecated)
  # array-to-scalar conversion.
  if log_prices.size == 1:
    return math.exp(log_prices.item())

  # Batch of predictions: math.exp cannot handle these, so exponentiate
  # element-wise.
  return np.exp(log_prices).ravel()


In [92]:
# TODO: Calculate MSE. Try different models, probably Random Forest

# all_output = model.predict(X_train)
# error_all = (y_train - all_output) / len(X_train)

In [14]:
# Persist the trained network (architecture and weights) under the relative
# path "airbnb_NN".
model.save(filepath="airbnb_NN")

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: airbnb_NN/assets


In [20]:
# Recreating model as a sanity check: reload the artefact written by
# model.save("airbnb_NN") so the check covers the model that was just saved,
# not a separately uploaded copy on Drive.
reconstructed_model = tf.keras.models.load_model("airbnb_NN")



In [35]:
# Checking if reconstructed_model makes predictions

# One dummy row with 11 feature values. It is passed as an explicit 2-D array
# because Keras can interpret a bare Python list as a list of separate inputs.
z = np.array([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]], dtype="float32")

# predict() returns an array; .item() extracts the single value explicitly
# instead of relying on math.exp's implicit array-to-scalar conversion.
math.exp(reconstructed_model.predict(z).item())

127.6150835751411