In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

#import spacy
#nlp = spacy.load("en_core_web_md")

In [2]:
# Load data into X and y
X = pd.read_csv('Airbnb.csv')
y = pd.read_csv('Airbnb_target.csv')
print(f"Does x.shape and y.shape match?\n{X.shape[0] == y.shape[0]}: X = {X.shape}, y = {y.shape}")

Does x.shape and y.shape match?
True: X = (299626, 8), y = (299626, 1)


In [3]:
# load the descrtiption into a text dataframe
text = X['description']
X.drop(columns = ['description','max_nights'],inplace = True)

In [4]:
# check to make sure all the data looks correct
X.head()

Unnamed: 0,room_type,neighborhood,min_nights,space_for,bathrooms,bedrooms
0,15,3,5,2,1.0,1.0
1,15,6,1,3,1.0,0.0
2,14,2,3,2,1.0,0.0
3,15,2,1,4,1.0,1.0
4,15,1,1,2,1.0,0.0


In [7]:
X.describe()

Unnamed: 0,room_type,neighborhood,min_nights,space_for,bathrooms,bedrooms
count,299626.0,299626.0,299626.0,299626.0,299626.0,299626.0
mean,14.78348,3.168016,5.34752,3.064804,1.095623,1.211921
std,2.423156,2.222517,23.226433,1.876599,0.321844,0.698368
min,14.0,1.0,1.0,1.0,0.0,0.0
25%,14.0,1.0,1.0,2.0,1.0,1.0
50%,15.0,3.0,2.0,2.0,1.0,1.0
75%,15.0,4.0,3.0,4.0,1.0,1.0
max,36.0,12.0,5000.0,16.0,8.5,12.0


In [8]:
text.head()

0    Apartment in charming Prenzlauer Berg, on the ...
1    Das Einzimmer-Studio-Appartement Kalckreuthstr...
2    The flat is located in the city center in the ...
3    What makes this listing extra special is there...
4    Modern furnished Loft in the heart of Berlin M...
Name: description, dtype: object

In [9]:
X = X.values
X[:2]

array([[15.,  3.,  5.,  2.,  1.,  1.],
       [15.,  6.,  1.,  3.,  1.,  0.]])

In [10]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)
print(X)

[[ 0.08935465 -0.07559737 -0.01496228 -0.56741255 -0.2971084  -0.30345187]
 [ 0.08935465  1.27422605 -0.18718015 -0.0345328  -0.2971084  -1.73536312]
 [-0.32333091 -0.5255385  -0.10107122 -0.56741255 -0.2971084  -1.73536312]
 ...
 [-0.32333091 -0.97547964 -0.14412568 -1.10029229 -0.2971084  -0.30345187]
 [-0.32333091 -0.07559737 -0.10107122 -0.56741255  2.80998743 -0.30345187]
 [-0.32333091  2.62404946 -0.18718015 -0.56741255 -0.2971084  -0.30345187]]


In [11]:
y = y.values
len(y)

299626

In [13]:
y = y / 100
y[8]

array([0.35])

In [30]:
# MLP architecture
def create_model():
    model = Sequential()
    model.add(Dense(8,input_shape = (X.shape[1],),activation = 'relu'))
    model.add(Dense(6,activation = 'sigmoid'))
    model.add(Dense(4,activation = 'sigmoid'))
    model.add(Dense(1,activation = 'softmax'))
    model.compile(loss = 'mse',optimizer = 'adam', metrics = ['mse','accuracy'])
    
    return model
model = create_model()
model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_32 (Dense)             (None, 8)                 56        
_________________________________________________________________
dense_33 (Dense)             (None, 6)                 54        
_________________________________________________________________
dense_34 (Dense)             (None, 4)                 28        
_________________________________________________________________
dense_35 (Dense)             (None, 1)                 5         
Total params: 143
Trainable params: 143
Non-trainable params: 0
_________________________________________________________________


In [31]:
model.fit(X,y,epochs = 10, batch_size = 32,verbose = True)

Train on 299626 samples
Epoch 1/10

KeyboardInterrupt: 

In [54]:
model.evaluate(X,y)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[0.3081026564618538, 0.30810276, 0.016337033]

In [None]:
# hypertuning parameters
param_grid = {'batch_size': [5,10, 20, 40, 60, 80, 100],
              'epochs': [20,40,80]}

model = KerasClassifier(build_fn=create_model, verbose=True)

# Create Grid Search
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=6,scoring = 'accuracy')
grid_result = grid.fit(X, y)

print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")