## Preparing Data for Training

In [1]:
import pandas as pd
import numpy as np

In [2]:
airbnb = pd.read_csv('cleaned_airbnb.csv')

In [3]:
airbnb.head(3)

Unnamed: 0,space,description,city,state,room_type,price,security_deposit,guests_included,extra_people,minimum_nights,maximum_nights,review_scores_rating
0,"I like to call my place ""the penthouse"". It's ...",I like to think of my place as a cozy escape. ...,BROOKLYN,NY,Entire home/apt,87.0,0.0,1,0.0,4,1125,100.0
1,This room is one of two bedrooms open to book...,"Gorgeous, spacious, clean & cozy bedroom in th...",NEW YORK,NY,Private room,100.0,0.0,1,25.0,2,25,96.0
2,Our craftsman style home was aesthetically rem...,This cozy two bedroom apt is well lit. Then ap...,SEATTLE,WA,Entire home/apt,175.0,250.0,2,10.0,3,1125,100.0


In [21]:
df = airbnb.drop(['space', 'description'], axis=1)

In [22]:
df.head(3)

Unnamed: 0,city,state,room_type,price,security_deposit,guests_included,extra_people,minimum_nights,maximum_nights,review_scores_rating
0,BROOKLYN,NY,Entire home/apt,87.0,0.0,1,0.0,4,1125,100.0
1,NEW YORK,NY,Private room,100.0,0.0,1,25.0,2,25,96.0
2,SEATTLE,WA,Entire home/apt,175.0,250.0,2,10.0,3,1125,100.0


In [8]:
target = 'price'

one_hot_encode_features = ['room_type', 'city', 'state', 'guests_included']

price_features = ['security_deposit', 'extra_people'] 

other_features = ['minimum_nights', 'maximum_nights', 'review_scores_rating']

features = one_hot_encode_features + price_features + other_features

In [23]:
df['room_type'].value_counts()

Entire home/apt    92795
Private room       37731
Shared room         2102
Hotel room           957
Name: room_type, dtype: int64

In [24]:
df['guests_included'].value_counts()

1     73567
2     30796
4     12752
6      5672
3      4135
8      2132
5      2057
10      989
7       551
12      347
16      194
9       148
14       84
11       55
15       45
13       31
20        8
24        5
18        4
19        2
21        2
28        2
35        1
17        1
22        1
29        1
30        1
32        1
36        1
Name: guests_included, dtype: int64

In [25]:
df['review_scores_rating'].value_counts()

100.0    36412
98.0     15098
99.0     13710
97.0     12053
96.0     10147
95.0      8375
93.0      6337
94.0      5668
90.0      4385
92.0      3617
80.0      3069
91.0      2872
89.0      1813
87.0      1711
88.0      1670
85.0       901
86.0       795
60.0       771
84.0       740
83.0       607
82.0       327
70.0       321
73.0       306
20.0       268
75.0       198
40.0       177
81.0       161
78.0       160
76.0       138
77.0       133
67.0       104
79.0        92
50.0        78
74.0        56
72.0        50
71.0        45
65.0        42
68.0        38
69.0        27
64.0        17
53.0        17
63.0        15
66.0        14
30.0        13
47.0         8
62.0         6
56.0         5
57.0         4
58.0         3
61.0         2
55.0         2
45.0         2
43.0         1
59.0         1
33.0         1
48.0         1
52.0         1
Name: review_scores_rating, dtype: int64

In [13]:
from sklearn.model_selection import train_test_split

from category_encoders.one_hot import OneHotEncoder
from category_encoders.ordinal import OrdinalEncoder

from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
X = df.drop(['price'], axis=1)
y = df.drop(features, axis=1)

X.shape, y.shape

((133585, 9), (133585, 1))

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
X_train.head(3)

Unnamed: 0,city,state,room_type,security_deposit,guests_included,extra_people,minimum_nights,maximum_nights,review_scores_rating
4128,BROOKLYN,NY,Entire home/apt,250.0,1,0.0,2,1125,100.0
122022,NEW YORK,NY,Entire home/apt,0.0,1,0.0,1,2,100.0
113213,NASHVILLE,TN,Entire home/apt,300.0,2,55.0,28,1125,99.0


In [29]:
# onehot encode
onehot = OneHotEncoder([])
ordinal = OrdinalEncoder()

# one hot encode train
transformed_train = onehot.fit_transform(X_train[['state', 'room_type']])
X_train.loc[:, 'city'] = ordinal.fit_transform(X_train['city'])

# one hot encode test
transformed_test = onehot.transform(X_test[['state', 'room_type']])
X_test.loc[:, 'city'] = ordinal.transform(X_test['city'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [30]:
X_train = pd.concat([X_train, transformed_train], axis=1)

In [31]:
X_test = pd.concat([X_test, transformed_test], axis=1)

In [33]:
X_train.head(3)

Unnamed: 0,city,state,room_type,security_deposit,guests_included,extra_people,minimum_nights,maximum_nights,review_scores_rating,state_1,...,state_11,state_12,state_13,state_14,state_15,state_16,room_type_1,room_type_2,room_type_3,room_type_4
4128,1,NY,Entire home/apt,250.0,1,0.0,2,1125,100.0,1,...,0,0,0,0,0,0,1,0,0,0
122022,2,NY,Entire home/apt,0.0,1,0.0,1,2,100.0,1,...,0,0,0,0,0,0,1,0,0,0
113213,3,TN,Entire home/apt,300.0,2,55.0,28,1125,99.0,0,...,0,0,0,0,0,0,1,0,0,0


In [34]:
X_train.drop(columns=['state', 'room_type'], axis=1, inplace=True)
X_test.drop(columns=['state', 'room_type'], axis=1, inplace=True)

In [35]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((106868, 27), (106868, 1), (26717, 27), (26717, 1))

In [36]:
X_train.columns

Index(['city', 'security_deposit', 'guests_included', 'extra_people',
       'minimum_nights', 'maximum_nights', 'review_scores_rating', 'state_1',
       'state_2', 'state_3', 'state_4', 'state_5', 'state_6', 'state_7',
       'state_8', 'state_9', 'state_10', 'state_11', 'state_12', 'state_13',
       'state_14', 'state_15', 'state_16', 'room_type_1', 'room_type_2',
       'room_type_3', 'room_type_4'],
      dtype='object')

## Train Model

In [37]:
import os
import datetime
import tensorflow as tf

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dropout
from tensorflow.keras import regularizers

In [46]:
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)

In [39]:
X_train.head()

Unnamed: 0,city,security_deposit,guests_included,extra_people,minimum_nights,maximum_nights,review_scores_rating,state_1,state_2,state_3,...,state_11,state_12,state_13,state_14,state_15,state_16,room_type_1,room_type_2,room_type_3,room_type_4
4128,1,250.0,1,0.0,2,1125,100.0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
122022,2,0.0,1,0.0,1,2,100.0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
113213,3,300.0,2,55.0,28,1125,99.0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
26550,1,0.0,1,0.0,2,4,93.0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
91062,4,0.0,1,0.0,5,365,90.0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [40]:
# reset index for conctenation with dtm matrices

X_train_new = X_train.reset_index()
X_train_new.drop(columns=['index'], inplace=True)
X_train_new.head()

Unnamed: 0,city,security_deposit,guests_included,extra_people,minimum_nights,maximum_nights,review_scores_rating,state_1,state_2,state_3,...,state_11,state_12,state_13,state_14,state_15,state_16,room_type_1,room_type_2,room_type_3,room_type_4
0,1,250.0,1,0.0,2,1125,100.0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
1,2,0.0,1,0.0,1,2,100.0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
2,3,300.0,2,55.0,28,1125,99.0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,1,0.0,1,0.0,2,4,93.0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
4,4,0.0,1,0.0,5,365,90.0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [41]:
# reset index for conctenation with dtm matrices

X_test_new = X_test.reset_index()
X_test_new.drop(columns=['index'], inplace=True)
X_train_new.head()

Unnamed: 0,city,security_deposit,guests_included,extra_people,minimum_nights,maximum_nights,review_scores_rating,state_1,state_2,state_3,...,state_11,state_12,state_13,state_14,state_15,state_16,room_type_1,room_type_2,room_type_3,room_type_4
0,1,250.0,1,0.0,2,1125,100.0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
1,2,0.0,1,0.0,1,2,100.0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
2,3,300.0,2,55.0,28,1125,99.0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,1,0.0,1,0.0,2,4,93.0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
4,4,0.0,1,0.0,5,365,90.0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [42]:
X_train_new.head()

Unnamed: 0,city,security_deposit,guests_included,extra_people,minimum_nights,maximum_nights,review_scores_rating,state_1,state_2,state_3,...,state_11,state_12,state_13,state_14,state_15,state_16,room_type_1,room_type_2,room_type_3,room_type_4
0,1,250.0,1,0.0,2,1125,100.0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
1,2,0.0,1,0.0,1,2,100.0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
2,3,300.0,2,55.0,28,1125,99.0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,1,0.0,1,0.0,2,4,93.0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
4,4,0.0,1,0.0,5,365,90.0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [43]:
# scale and normalize data

from sklearn.preprocessing import MinMaxScaler, Normalizer

scaler = MinMaxScaler()
normalize = Normalizer()

scaled_X_train = scaler.fit_transform(X_train_new)
scaled_X_test = scaler.fit_transform(X_test_new)

normalized_X_train = normalize.fit_transform(scaled_X_train)
normalized_X_test = normalize.transform(scaled_X_test)

In [44]:
normalized_X_train.shape, normalized_X_test.shape

((106868, 27), (26717, 27))

In [47]:
model = Sequential([
    Flatten(),
    Dense(256, input_dim=10027, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dropout(0.1),
    Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dropout(0.1),
    Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dropout(0.1),
    Dense(1)
])

optimizer = tf.keras.optimizers.RMSprop(0.001)

epochs = 1000

stop = EarlyStopping(monitor='val_mae', min_delta=0.01, patience=3)

model.compile(loss='mse', optimizer=optimizer, metrics=['mse','mae'])

model.fit(normalized_X_train, y_train.values, epochs=epochs, 
          validation_data=(normalized_X_test, y_test.values),
          verbose=1,
          callbacks=[tensorboard_callback, stop])

Epoch 1/1000
Instructions for updating:
use `tf.profiler.experimental.stop` instead.
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000


<tensorflow.python.keras.callbacks.History at 0x7f833f1a3550>

In [48]:
%tensorboard --logdir logs

UsageError: Line magic function `%tensorboard` not found.


In [50]:
model.save('bare_model.h5')