## Importing libraries and dataset

In [208]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [209]:
# Importing the 2019 Austen, TX airbnb lisings
df = pd.read_csv('./austen_listings.csv')

  interactivity=interactivity, compiler=compiler, result=result)


## Cleaning features

In [210]:
# These are the features I will be using to make predictions. It makes the most sense to ask users to input these values
features = ['bedrooms', 'bathrooms', 'zipcode', 'property_type', 'room_type']
df[features] = df[features].fillna(method='ffill')

In [211]:
# Price will be the target class. I am removing the $ and , so I can turn it into a float

def price_to_float(price):
    price = price.replace('$', '')
    price = price.replace(',', '')
    float(price)
    return price

df['price'] = df['price'].apply(price_to_float)
df['price'] = pd.to_numeric(df['price'])

In [212]:
# Zipcode has some weird characters, and I want it as an int so '78702' and 'TX 78702.0' are recognized as the same zipcode
def zipcode_to_int(zipcode):
    zipcode = str(zipcode)
    zipcode = zipcode.replace('TX', '')
    zipcode = zipcode.replace(' ', '')
    zipcode = zipcode.replace('.0', '')
    zipcode = int(zipcode)
    return zipcode

df['zipcode'] = df['zipcode'].apply(zipcode_to_int)

## Encoding categorical features

In [213]:
# Separating the numeric and categorical features
num_features = ['bedrooms', 'bathrooms'] 
cat_features = ['zipcode', 'property_type', 'room_type']

In [214]:
# Creating seperate dataframes for ease of encoding
df_num = df[num_features]
df_cat = df[cat_features]

In [215]:
# Changing all property types below the top 5 most common types to 'Other'

property_type_list = ['House', 'Apartment', 'Condominium', 'Guesthouse', 'Townhouse']

def clean_properties(item):
    if item not in property_type_list:
        return 'Other'
    else:
        return item

df_cat['property_type'] = df_cat['property_type'].apply(clean_properties)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [216]:
# OneHotEncoding the categorical features
encoder = OneHotEncoder(categories='auto',)

encoded_cat = encoder.fit_transform(df_cat)

In [217]:
# Changing it into a numpy array for concatenation
encoded_cat_array = encoded_cat.toarray()

In [218]:
# Changing the numeric dataframe into a numpy array and concatenating the above array into it
num_array = np.array(df_num)

final_array = np.concatenate((num_array, encoded_cat_array), axis=1)

In [219]:
# Finding out what my input dimensions will be
final_array.shape

(11250, 65)

# Final model

In [220]:
# Making a sequential neural network with one hidden layer
model = Sequential()

model.add(Dense(65, input_dim=65, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))

model.compile(loss='MSE', optimizer='adam', metrics=['mean_squared_error'])

model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_15 (Dense)             (None, 65)                4290      
_________________________________________________________________
dense_16 (Dense)             (None, 32)                2112      
_________________________________________________________________
dense_17 (Dense)             (None, 1)                 33        
Total params: 6,435
Trainable params: 6,435
Non-trainable params: 0
_________________________________________________________________


In [221]:
# Callbacks for early stopping while training
callbacks = [EarlyStopping(monitor='val_loss', patience=100),
            ModelCheckpoint(filepath='final_model.h5', monitor='val_loss', save_best_only=True)]

In [223]:
# Training the model
model.fit(final_array,
          target_array,
          epochs=5000, 
          verbose=0, 
          validation_split=.1,
          callbacks=callbacks)

<tensorflow.python.keras.callbacks.History at 0x1ebaa5c33c8>

In [190]:
# Testing the predictions of the model
test = np.array([final_array[0]])

model.predict(test)

array([[363.86542]], dtype=float32)

In [33]:
# Saving the model to be loaded for the flask app
model.save('baseline_model.h5')