# Air BnB build week project

In [0]:
# mounting google drive to the notebook

from google.colab import drive
drive.mount('drive')

Drive already mounted at drive; to attempt to forcibly remount, call drive.mount("drive", force_remount=True).


# Berlin

## Data cleaning and exploration

In [0]:
# Read in the data

import pandas as pd
df = pd.read_csv('/content/drive/My Drive/AirBnB/Berlin_listings_summary.csv', engine='python', encoding='utf-8', error_bad_lines=False)

In [0]:
# List of the feature names and shape of the data

df.shape
list(df)

['id',
 'listing_url',
 'scrape_id',
 'last_scraped',
 'name',
 'summary',
 'space',
 'description',
 'experiences_offered',
 'neighborhood_overview',
 'notes',
 'transit',
 'access',
 'interaction',
 'house_rules',
 'thumbnail_url',
 'medium_url',
 'picture_url',
 'xl_picture_url',
 'host_id',
 'host_url',
 'host_name',
 'host_since',
 'host_location',
 'host_about',
 'host_response_time',
 'host_response_rate',
 'host_acceptance_rate',
 'host_is_superhost',
 'host_thumbnail_url',
 'host_picture_url',
 'host_neighbourhood',
 'host_listings_count',
 'host_total_listings_count',
 'host_verifications',
 'host_has_profile_pic',
 'host_identity_verified',
 'street',
 'neighbourhood',
 'neighbourhood_cleansed',
 'neighbourhood_group_cleansed',
 'city',
 'state',
 'zipcode',
 'market',
 'smart_location',
 'country_code',
 'country',
 'latitude',
 'longitude',
 'is_location_exact',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'bed_type',
 'amenities',


In [0]:
# List of columns to keep

columns = [
    'id',
    'neighbourhood_group_cleansed',
    'property_type',
    'room_type',
    'accommodates',
    'bathrooms',
    'bedrooms',
    'bed_type',
    'amenities',
    'price',
    'security_deposit',
    'cleaning_fee',
    'guests_included',
    'extra_people',
    'minimum_nights',
    'instant_bookable',
    'is_business_travel_ready',
    'cancellation_policy',    
]

In [0]:
# Dropping unwanted columns

df = df[columns]
df.shape

(22552, 18)

In [0]:
# Checking for Null values
df.isna().sum()

id                                 0
neighbourhood_group_cleansed       0
property_type                      0
room_type                          0
accommodates                       0
bathrooms                         32
bedrooms                          18
bed_type                           0
amenities                          0
price                              0
security_deposit                9361
cleaning_fee                    7146
guests_included                    0
extra_people                       0
minimum_nights                     0
instant_bookable                   0
is_business_travel_ready           0
cancellation_policy                0
dtype: int64

In [0]:
# NaN values in these features are most likely to be $0.00

df['security_deposit'] = df['security_deposit'].fillna('$0.00')
df['cleaning_fee'] = df['cleaning_fee'].fillna('$0.00')
df.isna().sum()

id                               0
neighbourhood_group_cleansed     0
property_type                    0
room_type                        0
accommodates                     0
bathrooms                       32
bedrooms                        18
bed_type                         0
amenities                        0
price                            0
security_deposit                 0
cleaning_fee                     0
guests_included                  0
extra_people                     0
minimum_nights                   0
instant_bookable                 0
is_business_travel_ready         0
cancellation_policy              0
dtype: int64

In [0]:
# Dropping the rows with the NaN values
df = df.dropna()
df.isna().sum()

id                              0
neighbourhood_group_cleansed    0
property_type                   0
room_type                       0
accommodates                    0
bathrooms                       0
bedrooms                        0
bed_type                        0
amenities                       0
price                           0
security_deposit                0
cleaning_fee                    0
guests_included                 0
extra_people                    0
minimum_nights                  0
instant_bookable                0
is_business_travel_ready        0
cancellation_policy             0
dtype: int64

In [0]:
df.dtypes

id                                int64
neighbourhood_group_cleansed     object
property_type                    object
room_type                        object
accommodates                      int64
bathrooms                       float64
bedrooms                        float64
bed_type                         object
amenities                        object
price                            object
security_deposit                 object
cleaning_fee                     object
guests_included                   int64
extra_people                     object
minimum_nights                    int64
instant_bookable                 object
is_business_travel_ready         object
cancellation_policy              object
dtype: object

In [0]:
df['security_deposit']

0          $200.00
1            $0.00
2          $200.00
3          $250.00
4            $0.00
5          $400.00
6          $500.00
7            $0.00
8          $500.00
9            $0.00
11         $500.00
12         $500.00
13         $300.00
14         $150.00
15         $500.00
16         $200.00
17         $200.00
18         $250.00
19       $1,660.00
20         $200.00
22       $1,500.00
23           $0.00
24         $100.00
25           $0.00
26       $1,000.00
27         $500.00
28           $0.00
29         $100.00
30           $0.00
31         $500.00
           ...    
22522        $0.00
22523      $150.00
22524      $200.00
22525        $0.00
22526        $0.00
22527        $0.00
22528        $0.00
22529        $0.00
22530        $0.00
22531        $0.00
22532    $2,000.00
22533        $0.00
22534        $0.00
22535        $0.00
22536        $0.00
22537        $0.00
22538        $0.00
22539        $0.00
22540        $0.00
22541        $0.00
22542        $0.00
22543       

In [0]:
# Changing features from objects to floats

df['price'] = df['price'].str.replace('$', '').str.replace(',', '').astype(float)
df['security_deposit'] = df['security_deposit'].str.replace('$', '').str.replace(',', '').astype(float)
df['cleaning_fee'] = df['cleaning_fee'].str.replace('$', '').str.replace(',', '').astype(float)
df['extra_people'] = df['extra_people'].str.replace('$', '').str.replace(',', '').astype(float)

In [0]:
df.dtypes

id                                int64
neighbourhood_group_cleansed     object
property_type                    object
room_type                        object
accommodates                      int64
bathrooms                       float64
bedrooms                        float64
bed_type                         object
amenities                        object
price                           float64
security_deposit                float64
cleaning_fee                    float64
guests_included                   int64
extra_people                    float64
minimum_nights                    int64
instant_bookable                 object
is_business_travel_ready         object
cancellation_policy              object
dtype: object

In [0]:
df.head()

Unnamed: 0,id,neighbourhood_group_cleansed,property_type,room_type,accommodates,bathrooms,bedrooms,bed_type,amenities,price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,instant_bookable,is_business_travel_ready,cancellation_policy
0,2015,Mitte,Guesthouse,Entire home/apt,3,1.0,1.0,Real Bed,"{TV,""Cable TV"",Wifi,Kitchen,Gym,Heating,""Famil...",60.0,200.0,30.0,1,28.0,4,f,f,strict_14_with_grace_period
1,2695,Pankow,Apartment,Private room,2,1.0,1.0,Real Bed,"{Wifi,Kitchen,Elevator,Heating,Washer,Essentia...",17.0,0.0,0.0,1,0.0,2,f,f,flexible
2,3176,Pankow,Apartment,Entire home/apt,4,1.0,1.0,Real Bed,"{Internet,Wifi,Kitchen,""Buzzer/wireless interc...",90.0,200.0,50.0,2,20.0,62,t,f,strict_14_with_grace_period
3,3309,Tempelhof - Schöneberg,Apartment,Private room,2,1.0,1.0,Pull-out Sofa,"{Internet,Wifi,""Pets allowed"",""Pets live on th...",26.0,250.0,30.0,1,18.0,5,f,f,strict_14_with_grace_period
4,7071,Pankow,Apartment,Private room,2,1.0,1.0,Real Bed,"{Wifi,Heating,""Family/kid friendly"",Essentials...",42.0,0.0,0.0,1,24.0,2,f,f,moderate


In [0]:
#  Making true/false values from strings

df = df.replace({'instant_bookable': {'t': True, 'f': False}, 'is_business_travel_ready': {'t':True, 'f':False}})
df.head(4)

Unnamed: 0,id,neighbourhood_group_cleansed,property_type,room_type,accommodates,bathrooms,bedrooms,bed_type,amenities,price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,instant_bookable,is_business_travel_ready,cancellation_policy
0,2015,Mitte,Guesthouse,Entire home/apt,3,1.0,1.0,Real Bed,"{TV,""Cable TV"",Wifi,Kitchen,Gym,Heating,""Famil...",60.0,200.0,30.0,1,28.0,4,False,False,strict_14_with_grace_period
1,2695,Pankow,Apartment,Private room,2,1.0,1.0,Real Bed,"{Wifi,Kitchen,Elevator,Heating,Washer,Essentia...",17.0,0.0,0.0,1,0.0,2,False,False,flexible
2,3176,Pankow,Apartment,Entire home/apt,4,1.0,1.0,Real Bed,"{Internet,Wifi,Kitchen,""Buzzer/wireless interc...",90.0,200.0,50.0,2,20.0,62,True,False,strict_14_with_grace_period
3,3309,Tempelhof - Schöneberg,Apartment,Private room,2,1.0,1.0,Pull-out Sofa,"{Internet,Wifi,""Pets allowed"",""Pets live on th...",26.0,250.0,30.0,1,18.0,5,False,False,strict_14_with_grace_period


In [0]:
df.dtypes

id                                int64
neighbourhood_group_cleansed     object
property_type                    object
room_type                        object
accommodates                      int64
bathrooms                       float64
bedrooms                        float64
bed_type                         object
amenities                        object
price                           float64
security_deposit                float64
cleaning_fee                    float64
guests_included                   int64
extra_people                    float64
minimum_nights                    int64
instant_bookable                   bool
is_business_travel_ready           bool
cancellation_policy              object
dtype: object

In [0]:
df.head()

Unnamed: 0,id,neighbourhood_group_cleansed,property_type,room_type,accommodates,bathrooms,bedrooms,bed_type,amenities,price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,instant_bookable,is_business_travel_ready,cancellation_policy
0,2015,Mitte,Guesthouse,Entire home/apt,3,1.0,1.0,Real Bed,"{TV,""Cable TV"",Wifi,Kitchen,Gym,Heating,""Famil...",60.0,200.0,30.0,1,28.0,4,False,False,strict_14_with_grace_period
1,2695,Pankow,Apartment,Private room,2,1.0,1.0,Real Bed,"{Wifi,Kitchen,Elevator,Heating,Washer,Essentia...",17.0,0.0,0.0,1,0.0,2,False,False,flexible
2,3176,Pankow,Apartment,Entire home/apt,4,1.0,1.0,Real Bed,"{Internet,Wifi,Kitchen,""Buzzer/wireless interc...",90.0,200.0,50.0,2,20.0,62,True,False,strict_14_with_grace_period
3,3309,Tempelhof - Schöneberg,Apartment,Private room,2,1.0,1.0,Pull-out Sofa,"{Internet,Wifi,""Pets allowed"",""Pets live on th...",26.0,250.0,30.0,1,18.0,5,False,False,strict_14_with_grace_period
4,7071,Pankow,Apartment,Private room,2,1.0,1.0,Real Bed,"{Wifi,Heating,""Family/kid friendly"",Essentials...",42.0,0.0,0.0,1,24.0,2,False,False,moderate


In [0]:
df['amenities'] = df['amenities'].apply(lambda x: x.strip('{'))
df['amenities'] = df['amenities'].apply(lambda x: x.strip('}'))
df['amenities'] = df['amenities'].apply(lambda x: x.upper())


In [0]:
df['tv'] = df['amenities'].str.contains('TV') | df['amenities'].str.contains('CABLE')
df['internet'] = df['amenities'].str.contains('WIFI') | df['amenities'].str.contains('INTERNET')
df['pets'] = df['amenities'].str.contains('CAT') | df['amenities'].str.contains('DOG') | df['amenities'].str.contains('PET')

In [0]:
df = df.drop('amenities', axis =1)

In [0]:
df.head()

Unnamed: 0,id,neighbourhood_group_cleansed,property_type,room_type,accommodates,bathrooms,bedrooms,bed_type,price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,instant_bookable,is_business_travel_ready,cancellation_policy,tv,internet,pets
0,2015,Mitte,Guesthouse,Entire home/apt,3,1.0,1.0,Real Bed,60.0,200.0,30.0,1,28.0,4,False,False,strict_14_with_grace_period,True,True,False
1,2695,Pankow,Apartment,Private room,2,1.0,1.0,Real Bed,17.0,0.0,0.0,1,0.0,2,False,False,flexible,False,True,False
2,3176,Pankow,Apartment,Entire home/apt,4,1.0,1.0,Real Bed,90.0,200.0,50.0,2,20.0,62,True,False,strict_14_with_grace_period,False,True,False
3,3309,Tempelhof - Schöneberg,Apartment,Private room,2,1.0,1.0,Pull-out Sofa,26.0,250.0,30.0,1,18.0,5,False,False,strict_14_with_grace_period,False,True,True
4,7071,Pankow,Apartment,Private room,2,1.0,1.0,Real Bed,42.0,0.0,0.0,1,24.0,2,False,False,moderate,False,True,False


In [0]:
#  Split into X and y

X = df.drop('price', axis=1)
y = df['price']

In [0]:
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder()
X_encoded = encoder.fit_transform(X)

In [0]:
X_encoded

array([[0.0000e+00, 4.0000e+00, 1.6000e+01, ..., 1.0000e+00, 1.0000e+00,
        0.0000e+00],
       [1.0000e+00, 6.0000e+00, 1.0000e+00, ..., 0.0000e+00, 1.0000e+00,
        0.0000e+00],
       [2.0000e+00, 6.0000e+00, 1.0000e+00, ..., 0.0000e+00, 1.0000e+00,
        0.0000e+00],
       ...,
       [2.2499e+04, 6.0000e+00, 2.3000e+01, ..., 0.0000e+00, 1.0000e+00,
        0.0000e+00],
       [2.2500e+04, 4.0000e+00, 5.0000e+00, ..., 0.0000e+00, 1.0000e+00,
        0.0000e+00],
       [2.2501e+04, 5.0000e+00, 1.0000e+00, ..., 0.0000e+00, 1.0000e+00,
        0.0000e+00]])

## Neural Network modeling