# Airbnb build week project

# Berlin

## Data cleaning and exploration

In [0]:
pip install category_encoders



In [0]:
# Read in the data

import pandas as pd

# `on_bad_lines='warn'` skips malformed rows while reporting them. It replaces
# `error_bad_lines=False`, which was deprecated in pandas 1.3 and removed in 2.0.
df = pd.read_csv('/content/listings.csv', engine='python', encoding='utf-8', on_bad_lines='warn')

In [0]:
# List of the feature names and shape of the data

# Only the last expression of a cell is echoed, so the shape is printed explicitly.
print(df.shape)
list(df)

['id',
 'listing_url',
 'scrape_id',
 'last_scraped',
 'name',
 'summary',
 'space',
 'description',
 'experiences_offered',
 'neighborhood_overview',
 'notes',
 'transit',
 'access',
 'interaction',
 'house_rules',
 'thumbnail_url',
 'medium_url',
 'picture_url',
 'xl_picture_url',
 'host_id',
 'host_url',
 'host_name',
 'host_since',
 'host_location',
 'host_about',
 'host_response_time',
 'host_response_rate',
 'host_acceptance_rate',
 'host_is_superhost',
 'host_thumbnail_url',
 'host_picture_url',
 'host_neighbourhood',
 'host_listings_count',
 'host_total_listings_count',
 'host_verifications',
 'host_has_profile_pic',
 'host_identity_verified',
 'street',
 'neighbourhood',
 'neighbourhood_cleansed',
 'neighbourhood_group_cleansed',
 'city',
 'state',
 'zipcode',
 'market',
 'smart_location',
 'country_code',
 'country',
 'latitude',
 'longitude',
 'is_location_exact',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'bed_type',
 'amenities',


In [0]:
# Features retained for modelling (everything else in the listing export is dropped)

columns = [
    # identifier and location
    'id', 'neighbourhood_group_cleansed',
    # what kind of place it is
    'property_type', 'room_type', 'description',
    # capacity and furnishing
    'accommodates', 'bathrooms', 'bedrooms', 'bed_type', 'amenities',
    # prices and fees
    'price', 'security_deposit', 'cleaning_fee', 'guests_included', 'extra_people',
    # booking rules
    'minimum_nights', 'instant_bookable', 'is_business_travel_ready', 'cancellation_policy',
]

In [0]:
# Dropping unwanted columns

# `.copy()` gives an independent frame, so the column assignments made in later
# cells do not trigger pandas' SettingWithCopyWarning.
df = df[columns].copy()
df.shape

(1442, 19)

In [0]:
# Checking for Null values: count the missing entries in each column
df.isna().sum()

id                                0
neighbourhood_group_cleansed      1
property_type                     1
room_type                         1
description                       1
accommodates                      1
bathrooms                        29
bedrooms                          4
bed_type                          1
amenities                         1
price                             1
security_deposit                361
cleaning_fee                    202
guests_included                   1
extra_people                      1
minimum_nights                    1
instant_bookable                  1
is_business_travel_ready          1
cancellation_policy               1
dtype: int64

In [0]:
# A missing deposit or cleaning fee is assumed to mean that nothing is charged,
# so both columns get the string '$0.00' in place of NaN.

df = df.fillna({'security_deposit': '$0.00', 'cleaning_fee': '$0.00'})
df.isna().sum()

id                               0
neighbourhood_group_cleansed     1
property_type                    1
room_type                        1
description                      1
accommodates                     1
bathrooms                       29
bedrooms                         4
bed_type                         1
amenities                        1
price                            1
security_deposit                 0
cleaning_fee                     0
guests_included                  1
extra_people                     1
minimum_nights                   1
instant_bookable                 1
is_business_travel_ready         1
cancellation_policy              1
dtype: int64

In [0]:
# Discard every row that still has at least one missing value, then re-count the nulls
df = df.dropna(axis=0, how='any')
df.isna().sum()

id                              0
neighbourhood_group_cleansed    0
property_type                   0
room_type                       0
description                     0
accommodates                    0
bathrooms                       0
bedrooms                        0
bed_type                        0
amenities                       0
price                           0
security_deposit                0
cleaning_fee                    0
guests_included                 0
extra_people                    0
minimum_nights                  0
instant_bookable                0
is_business_travel_ready        0
cancellation_policy             0
dtype: int64

In [0]:
# Inspect the dtype of every column
df.dtypes

id                                int64
neighbourhood_group_cleansed     object
property_type                    object
room_type                        object
description                      object
accommodates                    float64
bathrooms                       float64
bedrooms                        float64
bed_type                         object
amenities                        object
price                            object
security_deposit                 object
cleaning_fee                     object
guests_included                 float64
extra_people                     object
minimum_nights                  float64
instant_bookable                 object
is_business_travel_ready         object
cancellation_policy              object
dtype: object

In [0]:
# Changing features from objects to floats

# Strip the '$' sign and the thousands separators, then cast to float.
# `regex=False` makes both replacements literal: '$' is a regex anchor, and the
# default value of `regex` differs between pandas versions, so it is set explicitly.
for money_col in ['price', 'security_deposit', 'cleaning_fee', 'extra_people']:
    df[money_col] = (
        df[money_col]
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )

In [0]:
# Re-count missing values per column
df.isna().sum()

id                              0
neighbourhood_group_cleansed    0
property_type                   0
room_type                       0
description                     0
accommodates                    0
bathrooms                       0
bedrooms                        0
bed_type                        0
amenities                       0
price                           0
security_deposit                0
cleaning_fee                    0
guests_included                 0
extra_people                    0
minimum_nights                  0
instant_bookable                0
is_business_travel_ready        0
cancellation_policy             0
dtype: int64

In [0]:
# Inspect the dtype of every column again
df.dtypes

id                                int64
neighbourhood_group_cleansed     object
property_type                    object
room_type                        object
description                      object
accommodates                    float64
bathrooms                       float64
bedrooms                        float64
bed_type                         object
amenities                        object
price                           float64
security_deposit                float64
cleaning_fee                    float64
guests_included                 float64
extra_people                    float64
minimum_nights                  float64
instant_bookable                 object
is_business_travel_ready         object
cancellation_policy              object
dtype: object

In [0]:
# Map the 't'/'f' string flags in the two listed columns to booleans.
# Note: this rebinds `df` to the result of `replace`; no separate copy is kept.


df = df.replace({'instant_bookable': {'t': True, 'f': False}, 'is_business_travel_ready': {'t':True, 'f':False}})
df.head(4)

Unnamed: 0,id,neighbourhood_group_cleansed,property_type,room_type,description,accommodates,bathrooms,bedrooms,bed_type,amenities,price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,instant_bookable,is_business_travel_ready,cancellation_policy
0,1944,Mitte,Apartment,Private room,"Private, bright and friendly room. You'd be sh...",1.0,1.0,1.0,Real Bed,"{""Cable TV"",Internet,Wifi,""Free street parking...",21.0,0.0,0.0,1.0,10.0,120.0,False,False,moderate
1,2015,Mitte,Guesthouse,Entire home/apt,Great location! 30 of 75 sq meters. This wood...,3.0,1.0,1.0,Real Bed,"{TV,""Cable TV"",Wifi,Kitchen,Gym,Heating,""Famil...",60.0,250.0,30.0,1.0,28.0,4.0,False,False,moderate
2,3176,Pankow,Apartment,Entire home/apt,This beautiful first floor apartment is situa...,4.0,1.0,1.0,Real Bed,"{Internet,Wifi,Kitchen,""Buzzer/wireless interc...",90.0,200.0,50.0,2.0,20.0,62.0,False,False,strict_14_with_grace_period
3,3309,Tempelhof - Schöneberg,Apartment,Private room,First of all: I prefer short-notice bookings. ...,1.0,1.0,1.0,Pull-out Sofa,"{Internet,Wifi,""Pets live on this property"",Ca...",28.0,250.0,30.0,1.0,18.0,7.0,False,False,strict_14_with_grace_period


In [0]:
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

# Mutable copy of the imported stop-word collection, with 'and' guaranteed to be present
STOPWORDS = set(STOPWORDS) | {'and'}

def tokenize(text):
    """Return the tokens of `text` produced by `simple_preprocess`, minus stop words."""
    kept = []
    for word in simple_preprocess(text):
        if word not in STOPWORDS:
            kept.append(word)
    return kept

In [0]:
# Preview the first rows of the frame
df.head()

Unnamed: 0,id,neighbourhood_group_cleansed,property_type,room_type,description,accommodates,bathrooms,bedrooms,bed_type,amenities,price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,instant_bookable,is_business_travel_ready,cancellation_policy
0,1944,Mitte,Apartment,Private room,"Private, bright and friendly room. You'd be sh...",1.0,1.0,1.0,Real Bed,"{""Cable TV"",Internet,Wifi,""Free street parking...",21.0,0.0,0.0,1.0,10.0,120.0,False,False,moderate
1,2015,Mitte,Guesthouse,Entire home/apt,Great location! 30 of 75 sq meters. This wood...,3.0,1.0,1.0,Real Bed,"{TV,""Cable TV"",Wifi,Kitchen,Gym,Heating,""Famil...",60.0,250.0,30.0,1.0,28.0,4.0,False,False,moderate
2,3176,Pankow,Apartment,Entire home/apt,This beautiful first floor apartment is situa...,4.0,1.0,1.0,Real Bed,"{Internet,Wifi,Kitchen,""Buzzer/wireless interc...",90.0,200.0,50.0,2.0,20.0,62.0,False,False,strict_14_with_grace_period
3,3309,Tempelhof - Schöneberg,Apartment,Private room,First of all: I prefer short-notice bookings. ...,1.0,1.0,1.0,Pull-out Sofa,"{Internet,Wifi,""Pets live on this property"",Ca...",28.0,250.0,30.0,1.0,18.0,7.0,False,False,strict_14_with_grace_period
4,6883,Friedrichshain-Kreuzberg,Loft,Entire home/apt,Stay in a stylish loft on the second floor and...,2.0,1.0,1.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,""Air conditioning...",125.0,0.0,39.0,1.0,0.0,3.0,False,False,moderate


In [0]:
# Replace each description string with its list of tokens
df['description'] = df['description'].map(tokenize)

In [0]:
# Keep only the number of tokens per description; the token lists are discarded
df['description_length'] = df['description'].apply(len)
df = df.drop(columns='description')

In [0]:
# Remove the surrounding braces from the amenities string and upper-case it,
# so the keyword checks that follow are case-insensitive
df['amenities'] = df['amenities'].apply(
    lambda raw: raw.strip('{').strip('}').upper()
)

In [0]:
# Boolean flags derived from keyword hits in the amenities text.
# `str.contains` treats the pattern as a regex, so 'A|B' means "contains A or B".
# NOTE(review): these are plain substring matches, so a keyword embedded in a
# longer word also counts as a hit — confirm that is acceptable.
amenity_text = df['amenities'].str
df['tv'] = amenity_text.contains('TV|CABLE')
df['internet'] = amenity_text.contains('WIFI|INTERNET')
df['pets'] = amenity_text.contains('CAT|DOG|PET')

In [0]:
# The raw amenities text is no longer needed once the flags exist
df = df.drop(columns='amenities')

In [0]:
import category_encoders as ce

# Ordinal encoder over the listed columns.
# NOTE(review): 'bathrooms' is a float64 column; encoding it ordinally replaces
# the numeric values with category codes — confirm this is intended.
encoder = ce.OrdinalEncoder(
    cols=[
        'neighbourhood_group_cleansed',
        'property_type',
        'instant_bookable',
        'cancellation_policy',
        'tv',
        'pets',
        'bathrooms',
    ]
)

# Target is the nightly price; the feature matrix omits the target and five unused columns
y = df['price']
X = df.drop(
    columns=['price', 'id', 'room_type', 'is_business_travel_ready', 'internet', 'bed_type']
)

# NOTE(review): the encoder is fitted on the full frame, before the train/test split
X_encoded = encoder.fit_transform(X)

In [0]:
from sklearn.model_selection import train_test_split
# Fix the seed so the split (and every score computed from it) is reproducible.
# 47 matches the seed used for the random forest in this notebook.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, random_state=47)

In [0]:
# Display the training features
X_train

Unnamed: 0,neighbourhood_group_cleansed,property_type,accommodates,bathrooms,bedrooms,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,instant_bookable,cancellation_policy,description_length,tv,pets
447,3,6,2.0,1,1.0,0.0,25.0,1.0,10.0,3.0,1,1,82,1,1
609,2,1,2.0,5,1.0,0.0,15.0,1.0,0.0,1.0,1,3,95,1,2
1293,5,1,2.0,1,1.0,0.0,0.0,1.0,0.0,1.0,1,3,8,2,1
1175,6,1,2.0,1,1.0,0.0,0.0,1.0,5.0,3.0,2,3,50,2,1
986,1,1,4.0,1,2.0,300.0,59.0,2.0,15.0,2.0,1,2,100,2,1
1097,10,1,5.0,1,2.0,400.0,0.0,4.0,9.0,25.0,1,1,92,1,1
357,2,1,2.0,1,1.0,250.0,15.0,1.0,7.0,2.0,1,1,51,1,1
1154,4,1,1.0,1,1.0,100.0,0.0,1.0,0.0,1.0,1,1,34,2,1
78,2,1,4.0,1,1.0,0.0,40.0,2.0,15.0,6.0,1,2,88,2,1
1053,4,1,2.0,1,1.0,0.0,20.0,2.0,20.0,7.0,2,2,94,2,1


In [0]:
import pickle
# Persist the fitted encoder. The context manager guarantees the file handle is
# flushed and closed, which a bare `open(...)` passed to `pickle.dump` does not.
filename = 'encoder.pkl'
with open(filename, 'wb') as encoder_file:
    pickle.dump(encoder, encoder_file)

In [0]:
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
# Number of feature columns, used as the input width of the network
inputs = len(X_train.columns)

In [0]:
# Create Model: four 128-unit ReLU hidden layers followed by a single output unit
model = Sequential([
    Dense(128, activation='relu', input_shape=(inputs,)),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(1),
])

In [0]:
# Compile Model: mean absolute error is both the training loss and the reported metric
model.compile(loss='mae', optimizer='adagrad', metrics=['mae'])

# Fit Model: 20 epochs, with 33% of the training rows held out for validation
model.fit(X_train, y_train, epochs=20, validation_split=0.33)

In [0]:
from sklearn.metrics import mean_absolute_error

# Score the network on the held-out split.
# scikit-learn metrics take (y_true, y_pred) in that order.
preds = model.predict(X_test)
mean_absolute_error(y_test, preds)

In [0]:
from sklearn.linear_model import LinearRegression
# Linear-regression baseline trained on the same features
linear = LinearRegression()
linear.fit(X_train, y_train)

# scikit-learn metrics take (y_true, y_pred) in that order.
preds = linear.predict(X_test)
mean_absolute_error(y_test, preds)

In [0]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# `price` is a continuous target, so a regressor is fitted; a classifier would
# treat every distinct price as an unrelated class label.
# The variable keeps the name `rfc` because later cells reference it.
rfc = RandomForestRegressor(random_state=47)
rfc.fit(X_train, y_train)
preds = rfc.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order.
mean_absolute_error(y_test, preds)

In [0]:
# Importance score of each input feature in the fitted forest
rfc.feature_importances_

In [0]:
# Names of the feature columns the models were trained on
X_train.columns

In [0]:
import pickle
# Persist the fitted forest. The context manager guarantees the file handle is
# flushed and closed, which a bare `open(...)` passed to `pickle.dump` does not.
filename = 'finalized_model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(rfc, model_file)

In [0]:
# Preview the first rows of the processed frame
df.head()

In [0]:
# Preview the first rows of the encoded training features
X_train.head()

Unnamed: 0,neighbourhood_group_cleansed,property_type,accommodates,bathrooms,bedrooms,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,instant_bookable,cancellation_policy,description_length,tv,pets
447,3,6,2.0,1,1.0,0.0,25.0,1.0,10.0,3.0,1,1,82,1,1
609,2,1,2.0,5,1.0,0.0,15.0,1.0,0.0,1.0,1,3,95,1,2
1293,5,1,2.0,1,1.0,0.0,0.0,1.0,0.0,1.0,1,3,8,2,1
1175,6,1,2.0,1,1.0,0.0,0.0,1.0,5.0,3.0,2,3,50,2,1
986,1,1,4.0,1,2.0,300.0,59.0,2.0,15.0,2.0,1,2,100,2,1
