**Data Exploration**

In [39]:
# Utility Imports
import pandas as pd
import numpy as np

# Visualization Imports
import seaborn as sns
import matplotlib.pyplot as plt

# Model-building Imports
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from sklearn.preprocessing import OrdinalEncoder
from datetime import datetime
# Data preparation class
from model_tools_class import mt

# Load the raw training data; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/train.csv')

def wrangle(df):
    '''
    Prepare the raw listings frame for use in the model.

    The work is done on a copy, so the frame passed in is left untouched and
    the function can be called again on the same raw data.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw listings. Must contain `log_price`, `amenities`, `host_since`,
        `instant_bookable`, `host_identity_verified` and every column named
        in the drop list below.

    Returns
    -------
    pandas.DataFrame
        A new frame with `price` and `host_since_days` added, `amenities`
        replaced by the length of its cleaned value, the unused columns
        removed, and every row that still contains a null dropped.
    '''
    # Copy first: the steps below add and drop columns, and the caller's
    # frame should not change underneath it.
    df = df.copy()

    # The daily price of the AirBnb listing, recovered from its natural log.
    df['price'] = df['log_price'].apply(lambda x: round(np.exp(x)))

    # Clean the amenities column, then keep only the length of the cleaned value.
    df['amenities'] = df['amenities'].apply(mt.clean).apply(len)

    # Applying the get_days function to create a new feature = total number of hosted days.
    df['host_since_days'] = df['host_since'].apply(mt.get_days)

    # Dropping redundant columns.
    df = df.drop(columns=[
        'host_since', 'log_price', 'id', 'latitude', 'longitude', 'name',
        'description', 'thumbnail_url', 'review_scores_rating',
        'number_of_reviews', 'host_has_profile_pic', 'host_response_rate',
        'last_review', 'first_review',
    ])

    # Relabel the existing 'f'/'t' flags (is the room instantly bookable, is
    # the host's identity verified). Assigning the result back avoids the
    # chained `df[col].replace(..., inplace=True)` form, which pandas warns
    # about and which has no effect under Copy-on-Write.
    flag_labels = {'f': 'False', 't': 'True'}
    for column in ('instant_bookable', 'host_identity_verified'):
        df[column] = df[column].replace(flag_labels)

    # Dropping null values
    return df.dropna(axis=0)

# Rebind df to the prepared frame returned by wrangle.
df = wrangle(df)

In [40]:
# Separate the target (Y) from the features (X).
Y = df['price']
X = df.drop(columns=['price'])

# Rebuild df with the target as its last column, under the name 'target/price'.
df = X.assign(**{'target/price': Y})

**Visualizations**

In [41]:
# Plotting the targeted features.
#X.hist(figsize=(9,9),bins = 20)

In [42]:
# Heatmap visualization

# Correlate only the numeric/bool columns: df still holds text columns, and
# pandas >= 2.0 raises on them instead of silently skipping them.
x = df.select_dtypes(include=['number', 'bool']).corr()
figure = plt.figure(figsize=(10,10))
# The plotting calls below are currently disabled, so the figure stays empty.
#sns.heatmap(x,vmax=0.65,square=True)
#plt.show()

<Figure size 720x720 with 0 Axes>

**Model Creation**

In [43]:
# Partition the features by dtype: text columns get encoded, numeric ones are used as-is.
X_non_strings = X.select_dtypes(include=['int64','float'])
X_strings = X.select_dtypes(include='object')

# Fit the encoder on the text columns, then encode those same columns.
oe = OrdinalEncoder().fit(X_strings)
X_train = oe.transform(X_strings)

In [44]:
# Column labels in the same left-to-right order as the matrix built below:
# encoded text columns first, numeric columns after.
text_columns = X_strings.columns
numeric_columns = X_non_strings.columns
model_columns = text_columns.append(numeric_columns)

# Stack the encoded text features and the numeric features side by side.
X_train_whole = np.concatenate([X_train, X_non_strings.to_numpy()], axis=1)

In [45]:
# Show the combined column order (text columns first, then numeric columns).
model_columns

Index(['property_type', 'room_type', 'bed_type', 'cancellation_policy', 'city',
       'host_identity_verified', 'instant_bookable', 'neighbourhood',
       'zipcode', 'amenities', 'accommodates', 'bathrooms', 'bedrooms', 'beds',
       'host_since_days'],
      dtype='object')

In [46]:
# Number of features per row; also used as the hidden layer's unit count.
input_dims = len(X_train_whole[0])

# One hidden ReLU layer followed by a single linear output unit.
model = Sequential([
    Dense(input_dims, input_dim=input_dims, activation="relu", kernel_initializer='normal'),
    Dense(1, activation="linear", kernel_initializer='normal'),
])

# Optimise mean squared error with Adam; report mean absolute error alongside.
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error'])

# Train for 10 epochs, with 20% of the rows held out for validation.
model.fit(X_train_whole, Y, batch_size=32, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x2850157ba30>

In [47]:

def get_confirm_df(input_list_objects,input_list_numbers,string_value_list):
    '''
    Build a table pairing each model input with its raw and encoded value.

    Rows follow the order of the module-level `model_columns`: the text
    columns come first, the numeric columns after.

    Parameters
    ----------
    input_list_objects : 2-D array-like
        Encoded text values; only the first row is read.
    input_list_numbers : sequence
        Numeric values, in the order of the numeric part of `model_columns`.
    string_value_list : sequence
        Raw text values, in the order of the text part of `model_columns`.
        Its length sets where the text columns end and the numeric ones start.

    Returns
    -------
    pandas.DataFrame
        Columns 'Variable', 'Value' and 'Encoded'. Numeric inputs are not
        encoded, so they carry the same value in both value columns.
    '''
    # Derive the text/numeric boundary from the input instead of hard-coding it.
    n_text = len(string_value_list)
    encoded_row = input_list_objects[0]

    df_rows = [
        (model_columns[i], string_value_list[i], encoded_row[i])
        for i in range(n_text)
    ]
    df_rows.extend(
        (model_columns[i], input_list_numbers[i - n_text], input_list_numbers[i - n_text])
        for i in range(n_text, len(model_columns))
    )
    return pd.DataFrame(df_rows, columns=['Variable','Value','Encoded'])


In [48]:
from datetime import datetime
def get_prediction(df):
    '''
    Predict the price of a single listing with the fitted encoder and model.

    Parameters
    ----------
    df : mapping-like (e.g. dict or pandas.Series)
        One listing, indexable by every name in the two variable lists below.

    Returns
    -------
    tuple
        `(prediction, confirm_df)`: the scalar model output, and the table
        from `get_confirm_df` showing each input next to its encoded value.
    '''
    string_variable_list = ['property_type','room_type','bed_type',
                   'cancellation_policy','city','host_identity_verified',
                   'instant_bookable','neighbourhood','zipcode']
    # Must follow the numeric part of `model_columns`, which the model was
    # fitted on: 'bedrooms' comes before 'beds'. The previous order had the
    # two swapped, feeding each value into the other's input position.
    number_variable_list = ['amenities','accommodates','bathrooms','bedrooms','beds','host_since_days']

    # Values of these exact types are used as-is; anything else goes through
    # mt.get_days. `np.bool` was removed in NumPy 1.24, so the builtin `bool`
    # (what that alias used to mean) and `np.bool_` are both listed.
    passthrough_types = (str, bool, np.int64, np.float64, np.bool_)

    string_value_list = [df[name] for name in string_variable_list]

    number_value_list = []
    for name in number_variable_list:
        value = df[name]
        if type(value) in passthrough_types:
            number_value_list.append(value)
        else:
            number_value_list.append(mt.get_days(value))

    # The encoder expects a 2-D input: one row holding all text values.
    string_vectorized = oe.transform(np.array(string_value_list).reshape(1,-1))
    whole_input_vector = string_vectorized[0].tolist() + number_value_list
    confirm_df = get_confirm_df(string_vectorized,number_value_list,string_value_list)

    prediction = model.predict(np.array(whole_input_vector).reshape(1,-1))
    return prediction[0][0],confirm_df

In [49]:
# NOTE(review): `test_df` is not defined in any cell shown here — confirm it is
# created elsewhere, otherwise this cell fails on a fresh kernel.
predict, df_predict = get_prediction(test_df)

In [50]:
# Show each model input next to its encoded value as a sanity check.
df_predict

Unnamed: 0,Variable,Value,Encoded
0,property_type,Apartment,0.0
1,room_type,Entire home/apt,0.0
2,bed_type,Real Bed,4.0
3,cancellation_policy,strict,2.0
4,city,NYC,4.0
5,host_identity_verified,True,1.0
6,instant_bookable,False,0.0
7,neighbourhood,Brooklyn Heights,76.0
8,zipcode,11201,191.0
9,amenities,8,8.0


In [51]:
# Show the model's predicted price for the listing.
predict

143.05702