In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.pipeline import Pipeline
from tensorflow.keras import Sequential, regularizers

from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Flatten, ReLU
from tensorflow.keras.activations import sigmoid, softmax, relu

from spacy.tokenizer import Tokenizer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import NearestNeighbors
from gensim.models import LdaMulticore
from gensim.corpora import Dictionary

from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
import joblib
from keras.models import load_model
import random

In [None]:
# import the two encoders needed for data preprocessing

In [None]:
# Tokenizer will be used to tokenize description text that is entered
# into the function

In [None]:
# Restore the tokenizer object that was saved to 'tokenizer.joblib'.
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
tokenizer = joblib.load('tokenizer.joblib')

In [None]:
# Encoder is used to label-encode the property types

In [None]:
# Restore the label encoder object that was saved to 'label_encoder.joblib'.
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
encoder = joblib.load('label_encoder.joblib')

In [None]:
# Neural network model used for predictions

In [None]:
# Load the saved network from 'best_model.h5' (path is relative to the working directory).
# NOTE(review): load_model is imported from standalone `keras`, while the rest of the
# notebook uses `tensorflow.keras` -- confirm both resolve to the same implementation.
combined = load_model('best_model.h5')

In [None]:
# Creating two examples to input into the function to produce a prediction

In [None]:
# Sample listing description; the leading and trailing newlines are part of the value.
test_text = "\nDaniels House has 2 bedrooms and accepts pets\n"

In [None]:
# All property-type labels from the original dataset that the neural network
# can handle. Because of label encoding, an input must match one of these
# labels exactly for the function to run.

In [None]:
# Property-type labels available for testing, one per line.
test_list = [
    'Apartment',
    'Condominium',
    'Loft',
    'House',
    'Serviced apartment',
    'Hostel',
    'Townhouse',
    'Guest suite',
    'Bed and breakfast',
    'Guesthouse',
    'Hotel',
    'Other',
    'Boutique hotel',
]

In [None]:
# using python Random to create a random selection for the purpose of testing

In [None]:
# Pick one property type at random from test_list. No seed is set in the visible
# cells, so the chosen label can differ from run to run.
test = random.choice(test_list)

In [None]:
def predict(property_type, description_text):
    """Estimate the nightly rent of a listing from its type and description.

    Parameters
    ----------
    property_type : str
        Property-type label. It is passed to ``encoder.transform``, so it must
        be a label the encoder already knows.
    description_text : str or sequence of str
        Listing description. A single string is treated as ONE document. A
        sequence of strings is treated as several documents, of which only the
        first one's prediction is reported.

    Returns
    -------
    str
        A sentence containing the first prediction formatted to two decimals.

    Raises
    ------
    ValueError
        If ``description_text`` is an empty sequence (nothing to predict on).
    """
    num_property_types = 13   # width of the one-hot property-type vector
    max_seq_length = 170      # padded length of the token-id sequence

    # The tokenizer methods iterate over their argument as a collection of
    # documents. A bare string would therefore be split into single characters,
    # each treated as its own document, so wrap it in a one-element list.
    if isinstance(description_text, str):
        descriptions = [description_text]
    else:
        descriptions = list(description_text)
    if not descriptions:
        raise ValueError("description_text must contain at least one description")

    # Label-encode the property type (the encoder expects a list-like input),
    # then one-hot encode it for the wide part of the model.
    building = encoder.transform([property_type])
    building = keras.utils.to_categorical(building, num_property_types)

    # Token-id sequences, padded at the end to a fixed length.
    embed = tokenizer.texts_to_sequences(descriptions)
    embed = keras.preprocessing.sequence.pad_sequences(
        embed, maxlen=max_seq_length, padding="post")

    # Text matrix with one row per description.
    description_bow = tokenizer.texts_to_matrix(descriptions)

    # Repeat the single one-hot row so its first dimension matches the number
    # of description rows fed to the model.
    building = np.repeat(building[:1], description_bow.shape[0], axis=0)

    # The model takes three inputs; only the first row of its output is reported.
    predictions = combined.predict([description_bow, building, embed])
    val = predictions[0]

    return f'The estimated rent is ${val[0]:.2f} per night.'

In [None]:
# Run the prediction for the randomly chosen property type and the sample
# description; the returned sentence is the cell's output.
predict(test, test_text)

'The estimated rent is $111.78 per night.'

## Methodology

Based on Britta Bettendorf's Kaggle dataset "Berlin Airbnb Data", which contains information about Airbnb listings in Berlin, Germany, in November 2018, we created a model that predicts the price of a hypothetical listing.
We chose to base our predictions on two elements of an Airbnb listing: the type of property and the listing description.


In order to analyze and make predictions from the data, we first split the data into training (80 percent) and testing (20 percent) subsets. We trained the model on the training data, and then evaluated the model's scores against the testing data.


Our model is composed of two neural networks: a wide model and a deep model. The wide model provides the breadth needed to do a proper evaluation of the data. The deep model, with its embedding layers, does the deep text analysis of the listing descriptions.
Insights:


The model should have a margin of error of $15. However, it should be noted that the model is not 100 percent accurate, since it does not take into account currency fluctuations or the coronavirus pandemic.