In [1]:
import pandas as pd
import numpy as np

In [332]:
airbnb = pd.read_csv('cleaned_airbnb.csv')

In [333]:
pd.set_option('display.max_columns', len(airbnb.columns)) # To view all columns
pd.set_option('display.max_rows', 300)

In [334]:
airbnb.shape

(133585, 12)

In [335]:
df = airbnb.sample(frac=0.5, axis=0, random_state=1)

In [336]:
df.shape

(66792, 12)

In [337]:
df.head(2)

Unnamed: 0,space,description,city,state,room_type,price,security_deposit,guests_included,extra_people,minimum_nights,maximum_nights,review_scores_rating
91878,"My house is a two bedroom apartment, with a sp...","You’ll love my place because of the light, the...",LOS ANGELES,CA,Private room,55.0,0.0,1,0.0,2,30,100.0
39188,This is a stylish apartment located in the upp...,Great location in upper east near Central Park...,NEW YORK,NY,Entire home/apt,199.0,1000.0,1,0.0,1,1125,91.0


In [338]:
df = df[df.duplicated(subset=["city"], keep=False)]

In [339]:
df = df[df['city'] != '沃尔纳特']
df = df[df['city'] != '阿罕布拉']
df = df[df['city'] != '纽约市']
df = df[df['city'] != '纽约']

In [340]:
df = df.drop(columns=['space', 'description', 'state', 'review_scores_rating'])

In [341]:
df.head(2)

Unnamed: 0,city,room_type,price,security_deposit,guests_included,extra_people,minimum_nights,maximum_nights
91878,LOS ANGELES,Private room,55.0,0.0,1,0.0,2,30
39188,NEW YORK,Entire home/apt,199.0,1000.0,1,0.0,1,1125


In [342]:
df['guests_included'].value_counts()

1     36478
2     15334
4      6393
6      2855
3      2088
8      1084
5      1000
10      504
7       281
12      166
16       98
9        76
14       45
11       32
15       22
13       13
18        2
20        2
28        2
29        1
19        1
21        1
22        1
32        1
Name: guests_included, dtype: int64

In [343]:
df['extra_people'].value_counts()

0.0      31634
25.0      6021
10.0      5725
20.0      5190
15.0      4041
50.0      3402
30.0      1982
5.0       1450
35.0      1170
40.0       936
100.0      814
75.0       461
45.0       349
12.0       298
60.0       211
8.0        185
7.0        181
300.0      158
19.0       155
18.0       135
29.0       133
150.0      116
200.0      112
9.0         97
55.0        88
65.0        86
17.0        85
11.0        77
80.0        74
49.0        74
16.0        70
6.0         69
39.0        62
14.0        60
70.0        53
28.0        51
85.0        42
250.0       37
22.0        36
99.0        35
33.0        33
13.0        33
24.0        32
23.0        28
125.0       26
32.0        24
38.0        24
95.0        20
90.0        20
34.0        18
27.0        18
37.0        16
36.0        14
26.0        12
69.0        12
31.0        11
120.0       10
21.0        10
59.0         9
46.0         8
58.0         8
47.0         8
87.0         7
175.0        7
79.0         7
44.0         6
41.0      

In [344]:
df.dtypes

city                 object
room_type            object
price               float64
security_deposit    float64
guests_included       int64
extra_people        float64
minimum_nights        int64
maximum_nights        int64
dtype: object

In [345]:
target = 'price'
features = ['city', 'room_type', 'security_deposit', 'guests_included', 'minimum_nights']

In [346]:
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer

In [347]:
X = df.drop(['price', 'extra_people', 'maximum_nights'], axis=1)
y = df.drop(features+['extra_people', 'maximum_nights'], axis=1)

X.shape, y.shape

((66480, 5), (66480, 1))

In [348]:
X

Unnamed: 0,city,room_type,security_deposit,guests_included,minimum_nights
91878,LOS ANGELES,Private room,0.0,1,2
39188,NEW YORK,Entire home/apt,1000.0,1,1
129595,NEW YORK,Entire home/apt,450.0,2,5
95079,KAILUA-KONA,Entire home/apt,500.0,1,2
79223,BROOKLYN,Entire home/apt,100.0,4,4
...,...,...,...,...,...
109168,LOS ANGELES,Entire home/apt,495.0,2,60
3584,OAKLAND,Private room,250.0,1,1
41434,DENVER,Entire home/apt,0.0,2,1
90716,SEATTLE,Entire home/apt,150.0,1,30


In [349]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [350]:
X_train.head(3)

Unnamed: 0,city,room_type,security_deposit,guests_included,minimum_nights
103943,ROWLAND HEIGHTS,Private room,150.0,1,1
18134,NEW ORLEANS,Entire home/apt,0.0,1,30
75745,AUSTIN,Private room,0.0,1,1


In [351]:
X_train["text_combined"] = X_train[['city', 'room_type']].apply(' '.join, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [352]:
X_test["text_combined"] = X_test[['city', 'room_type']].apply(' '.join, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [353]:
X_train['text_combined']

103943     ROWLAND HEIGHTS Private room
18134       NEW ORLEANS Entire home/apt
75745               AUSTIN Private room
131277          CHICAGO Entire home/apt
69170           HALEIWA Entire home/apt
                      ...              
37488            NEW ORLEANS Hotel room
78380     BEVERLY HILLS Entire home/apt
93044          NEW ORLEANS Private room
26344          BROOKLYN Entire home/apt
96395            DALY CITY Private room
Name: text_combined, Length: 53184, dtype: object

In [354]:
X_test["text_combined"]

34319          AUSTIN Entire home/apt
96309        LOS ANGELES Private room
89301     NEW ORLEANS Entire home/apt
40589           BROOKLYN Private room
17355        NEW YORK Entire home/apt
                     ...             
42861           NEW YORK Private room
24689        SAN JOSE Entire home/apt
16697           NEW YORK Private room
46056       SAN DIEGO Entire home/apt
116452        CHICAGO Entire home/apt
Name: text_combined, Length: 13296, dtype: object

In [355]:
tfidf = TfidfVectorizer(
    strip_accents="unicode",
#     max_features=100,
    norm='l2',
    stop_words='english'
    )

In [356]:
dtm1 = tfidf.fit_transform(X_train['text_combined'])

In [357]:
dtm1

<53184x482 sparse matrix of type '<class 'numpy.float64'>'
	with 220018 stored elements in Compressed Sparse Row format>

In [358]:
dtm1_df = pd.DataFrame(dtm1.todense(), columns=tfidf.get_feature_names())
print('Shape: ', dtm1_df.shape)
dtm1_df.head(2)

Shape:  (53184, 482)


Unnamed: 0,afton,agoura,agua,aiea,albany,alhambra,...,woodland,woodside,worthington,wrightwood,york,zimmerman
0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0


In [359]:
dtm2 = tfidf.transform(X_test['text_combined'])

In [360]:
dtm2.shape

(13296, 482)

In [361]:
dtm2_df = pd.DataFrame(dtm2.todense(), columns=tfidf.get_feature_names())
print('Shape: ', dtm2_df.shape)
dtm2_df.head(2)

Shape:  (13296, 482)


Unnamed: 0,afton,agoura,agua,aiea,albany,alhambra,...,woodland,woodside,worthington,wrightwood,york,zimmerman
0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0


In [362]:
X_train_new = X_train.reset_index()
X_train_new.drop(columns=['index'], inplace=True)

In [363]:
X_train_new = pd.concat([X_train_new, dtm1_df], axis=1)

In [364]:
X_test_new = X_test.reset_index()
X_test_new.drop(columns=['index'], inplace=True)
X_test_new = pd.concat([X_test_new, dtm2_df], axis=1)

In [365]:
X_train_new.drop(columns=['city', 'room_type', 'text_combined'], inplace=True)
X_test_new.drop(columns=['city', 'room_type', 'text_combined'], inplace=True)

In [366]:
def prep_data(data):
    dtm = tfidf.transform(data)
    dtm_df = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())
    return dtm_df

In [367]:
X_train.head()

Unnamed: 0,city,room_type,security_deposit,guests_included,minimum_nights,text_combined
103943,ROWLAND HEIGHTS,Private room,150.0,1,1,ROWLAND HEIGHTS Private room
18134,NEW ORLEANS,Entire home/apt,0.0,1,30,NEW ORLEANS Entire home/apt
75745,AUSTIN,Private room,0.0,1,1,AUSTIN Private room
131277,CHICAGO,Entire home/apt,500.0,8,3,CHICAGO Entire home/apt
69170,HALEIWA,Entire home/apt,0.0,15,1,HALEIWA Entire home/apt


In [368]:
X_test.head()

Unnamed: 0,city,room_type,security_deposit,guests_included,minimum_nights,text_combined
34319,AUSTIN,Entire home/apt,100.0,1,2,AUSTIN Entire home/apt
96309,LOS ANGELES,Private room,0.0,1,356,LOS ANGELES Private room
89301,NEW ORLEANS,Entire home/apt,200.0,2,3,NEW ORLEANS Entire home/apt
40589,BROOKLYN,Private room,0.0,1,2,BROOKLYN Private room
17355,NEW YORK,Entire home/apt,100.0,2,30,NEW YORK Entire home/apt


In [369]:
import numpy as np
import os
import datetime
import tensorflow as tf

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dropout
from tensorflow.keras import regularizers

In [370]:
# # scale and normalize data

# from sklearn.preprocessing import MinMaxScaler, Normalizer

# scaler = MinMaxScaler()
# normalize = Normalizer()

# scaled_X_train = scaler.fit_transform(X_train_new)
# scaled_X_test = scaler.fit_transform(X_test_new)

# normalized_X_train = normalize.fit_transform(scaled_X_train)
# normalized_X_test = normalize.transform(scaled_X_test)

In [373]:
X_train_new.shape

(53184, 484)

In [379]:
model = Sequential([
    Flatten(),
    Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dropout(0.1),
    Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dropout(0.1),
    Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dropout(0.1),
    Dense(1)
])

optimizer = tf.keras.optimizers.RMSprop(0.001)

epochs = 100

stop = EarlyStopping(monitor='val_mae', min_delta=0.01, patience=3)

model.compile(loss='mse', optimizer=optimizer, metrics=['mse','mae'])

history = model.fit(X_train_new.values, y_train.values, epochs=epochs, 
                    validation_data=(X_test_new.values, y_test.values),
                    verbose=1,
                    callbacks=[stop])

Train on 53184 samples, validate on 13296 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100


In [380]:
predictions = model.predict(X_test_new)
predictions

array([[163.08533 ],
       [ 89.020935],
       [193.17088 ],
       ...,
       [ 91.17809 ],
       [298.19177 ],
       [277.79987 ]], dtype=float32)

In [381]:
y_test

Unnamed: 0,price
34319,62.0
96309,155.0
89301,179.0
40589,90.0
17355,83.0
...,...
42861,40.0
24689,95.0
16697,68.0
46056,223.0


In [384]:
def vectorize_data(data):
    
    dtm = tfidf.transform(data)
    dtm_df = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())
    return dtm_df

In [391]:
def predict(city='United States', room_type='any', security_deposit=0.0, guests_included=1, min_nights=1):

    # Make dataframe from the inputs
    df = pd.DataFrame(
        data=[[city, room_type, security_deposit, guests_included, min_nights]], 
        columns=['city', 'room_type', 'security_deposit', 'guests_included', 'min_nights']
    )

    df["text_combined"] = df[['city', 'room_type']].apply(' '.join, axis=1)
    
    matrix = vectorize_data(df["text_combined"])

    df = pd.concat([df, matrix], axis=1)    
    df = df.drop(columns=['city', 'room_type', 'text_combined'])

    # Get the model's prediction
    pred = model.predict(df.values)[0][0]
    
    return f'city: {city}, room_type: {room_type}, security_deposit: {security_deposit}, guests_included: {guests_included}, min_nights: {min_nights}, price: ${pred:.2f}'

#     return jsonify({"city": city, 
#                     "room_type": room_type, 
#                     "security_deposit": security_deposit, 
#                     "guests_included": guests_included, 
#                     "min_nights": min_nights,
#                     "price": results})

In [392]:
predict('Boston', 'Private room', min_nights=5)

'city: Boston, room_type: Private room, security_deposit: 0.0, guests_included: 1, min_nights: 5, price: $89.34'

In [394]:
model.save('light_model.h5')