# Table of Contents


*   Imports & Data Upload
    - Imports
        * Libraries
        * Pandas Profiling
        * Spacy
        * Joblib
    - Data Upload
*   Functions
*   Data Wrangle & Exploration
    - Data Wrangle
    - Pandas Profiling
*   Data Visualizations
*   Tokenize and Vectorize 'House Rules'
*   Model Building
    - Manual Encoding
    - Train Test Split
    - Neural Network Models
    - Save Best Model



# Imports & Data Upload


## Imports

In [840]:
# General
import numpy as np
import pandas as pd
import re
from datetime import datetime
import math

# Plotting
import plotly.express as px

# Model Building
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
import sklearn.metrics

# Deep Learning
import tensorflow as tf
from keras import models, layers, optimizers
from tensorflow.keras.regularizers import l1, l2
from keras.callbacks import EarlyStopping

# Other
from sklearn.feature_extraction.text import TfidfVectorizer
from joblib import Memory

### Pandas Profiling

In [None]:
# Pandas Profiling
# #### Uncomment to Reinstall #### #
# !pip install pandas-profiling

In [None]:
from pandas_profiling import ProfileReport

### Spacy

In [None]:
# #### Uncomment to Reinstall #### #
# !python -m spacy download en_core_web_md

In [None]:
import spacy

### Joblib

In [877]:
!pip install joblib



In [878]:
import joblib

## Data Upload

In [None]:
# Air BnB London Upload
# Source: https://public.opendatasoft.com/explore/dataset/airbnb-listings/table/?disjunctive.host_verifications&disjunctive.amenities&disjunctive.features&refine.city=London
# #### Uncomment to Upload #### #
# from google.colab import files
# upload = files.upload()

# Functions

## Data Wrangling and Cleaning

In [None]:
def clean_string_list(text):
  '''
  Takes input text (string) which represents a comma-separated list and
  returns it in lowercase with exactly one space after each comma and no
  leading or trailing whitespace
  ----

  Requirements: String (any other value is coerced with str(), so a NaN
  becomes the text "nan")

  Returns: String
  '''

  # Ensure Input is a String
  text = str(text)

  # Strip each item on its own so that input which already has a space
  # after a comma does not end up with doubled spaces
  items = [item.strip() for item in text.split(',')]

  # The final strip removes the space left behind by a trailing comma
  return ', '.join(items).strip().lower()

In [None]:
def string_to_list(text, delimiter=" "):
  '''
  Breaks a delimited piece of text into a list of its parts
  ----

  Requirements:
  - text: value to split; it is coerced to a string with str() first
  - delimiter: separator to split along (default is a single space);
    examples: ";", ",", "-"

  Returns: List of substrings
  '''
  # str.split already returns a list, so no extra conversion is needed
  return str(text).split(delimiter)

In [None]:
def unique_list(my_list):
  '''
  Returns the distinct entries of a list as a new list in ascending
  sorted order (alphabetical for strings)
  ----

  requirements:
  - List Object whose entries are hashable and can be compared to each other
  '''
  # The set drops duplicates; sorted() produces the ordered list
  return sorted(set(my_list))

In [None]:
def list_to_columns(my_list, frame=None):
  '''
  Takes a list and adds one empty-string column per entry to a dataframe
  ----

  Requirements:
  - my_list: List Object; every entry is converted to a string and used
    as a column name
  - frame: DataFrame to modify in place (optional). When it is left out
    the notebook-level `df` is modified, which is what this function
    always did before the parameter existed.

  Returns: None (the dataframe is modified in place)
  '''

  # Only fall back to the global dataframe when no frame is supplied, so a
  # caller that holds its own (e.g. function-local) dataframe can pass it in
  if frame is None:
    frame = df

  for x in my_list:
    frame[str(x)] = ""

In [None]:
def string_to_datetime(date_time_string):
  '''
  Takes a 'YYYY-MM-DD' string, and converts it to a date object
  ----

  Requirements: value whose string form matches '%Y-%m-%d', or a missing
  value (None / NaN)

  Returns: datetime.date, or None when the input is None or NaN

  Raises: ValueError when the text does not match '%Y-%m-%d'
  '''

  # Missing values have no date; returning None avoids referencing an
  # unassigned result (None) or parsing the text "nan" (NaN)
  if date_time_string is None:
    return None
  if isinstance(date_time_string, float) and math.isnan(date_time_string):
    return None

  # Ensure entry is string, then convert and keep only the date part
  dt_str = str(date_time_string)
  return datetime.strptime(dt_str, '%Y-%m-%d').date()

In [None]:
def string_to_numeric(text, numeric_type):
  '''
  Converts text to a number of the requested kind
  ----

  Requirements:
  - text: value accepted by int() or float()
  - numeric_type: 'integer' for an int result, 'decimal' for a float result

  Returns: int or float; for any other numeric_type a hint is printed and
  None is returned
  '''

  if numeric_type == 'integer':
    return int(text)

  if numeric_type == 'decimal':
    return float(text)

  # Unknown kind requested: tell the user which values are accepted
  print("Please enter 'integer' for int, and 'decimal' for float.")
  return None

In [None]:
def string_to_int(text):
  '''
  Converts text to an int
  ----

  Requirements: value accepted by int(), e.g. "42"
  '''
  return int(text)

## Tokenizing

In [None]:
def clean_text(text):
  '''
  Returns Cleaned Text from Columns
  ----

  Every character that is not an ASCII letter, a digit or whitespace is
  replaced by a space, every run of whitespace (including a single tab or
  newline) is collapsed to one space, the ends are trimmed and the result
  is lowercased.

  Requirements: String (any other value is coerced with str())

  Returns: String
  '''

  # Ensure text is string
  text = str(text)

  # Regex
  non_alnum_regex = r'[^0-9a-zA-Z\s]'
  # \s+ (not \s+\s) so that a lone tab or newline is normalised as well
  whitespace_regex = r'\s+'

  # Apply Regex
  # The first pass already removes $ % & @ " : +, so no separate
  # special-character pass is required
  text = re.sub(non_alnum_regex, ' ', text)
  text = re.sub(whitespace_regex, ' ', text)

  return text.strip().lower()

In [None]:
def tokenize(doc):
  '''
  Returns a list of parsed tokens
  ----

  The text is run through the notebook-level `nlp` pipeline; tokens
  flagged as stop words or punctuation, and tokens whose text is only
  whitespace, are left out. The remaining token texts are lowercased.

  Requirements: String
  '''
  return [
      token.text.lower()
      for token in nlp(doc)
      if not (token.is_stop or token.is_punct or token.text.isspace())
  ]

# Data Wrangle & Exploration

## Wrangle

In [None]:
def wrangle(filepath):
  '''
  This function creates, and wrangles the data for use
  ----

  Requirements: Filepath to AirBnB Listings London; See above for download link
  (the file is read as a ';'-separated CSV)

  Steps:
  - keep and rename the columns of interest, normalise the column names
  - drop columns that have too few non-null values
  - simplify host response time, neighbourhood, property type, room type
    and cancellation policy
  - expand the amenities text into one 0/1 column per amenity
  - drop every row that still contains a NaN

  Returns: DataFrame
  '''

  # Create DataFrame
  df = pd.read_csv(filepath, sep=';')

  # Clean Up Columns
  # Drop Unnecessary/Redundant Columns
  cols_to_keep = [
        'ID', 'House Rules', 'Host Since', 'Host Response Time',
        'Host Response Rate', 'Host Acceptance Rate', 'Host Listings Count',
        'Neighbourhood Cleansed', 'Latitude', 'Longitude', 'Property Type',
        'Room Type', 'Accommodates', 'Bathrooms', 'Bedrooms', 'Beds',
        'Bed Type', 'Amenities', 'Square Feet', 'Price',
        'Security Deposit', 'Cleaning Fee', 'Guests Included', 'Extra People',
        'Minimum Nights', 'Maximum Nights', 'Availability 30',
        'Availability 60', 'Availability 90', 'Number of Reviews',
        'Review Scores Rating', 'Review Scores Accuracy',
        'Review Scores Cleanliness', 'Review Scores Checkin',
        'Review Scores Communication', 'Review Scores Location',
        'Review Scores Value', 'Cancellation Policy', 'Reviews per Month',
        ]

  # Explicit copy so the assignments below act on an independent frame
  df = df[cols_to_keep].copy()

  # Rename Columns for Clarity
  df = df.rename(columns={
      'Neighbourhood Cleansed': 'Neighbourhood',
      'Property Type': 'Prop Type',
      'Square Feet': 'Square Ft',
      'Guests Included': 'Guests Inc',
      'Extra People': 'XT People',
      'Minimum Nights': 'Min Nights',
      'Maximum Nights': 'Max Nights',
      'Availability 30': 'Avail 30',
      'Availability 60': 'Avail 60',
      'Availability 90': 'Avail 90',
      'Number of Reviews': 'Num Reviews',
      })

  # Clean Up Column Names
  df.columns = df.columns.str.lower().str.strip().str.replace(' ', '_')

  # Fill House Rules NaNs with 'none' to avoid drop
  df['house_rules'] = df['house_rules'].fillna('none')

  # Drop Columns whose non-null count is below 70% of the row count
  # (only columns are dropped here; rows with NaNs are dropped at the end)
  null_thresh = int(0.7*df.shape[0])
  df = df.dropna(axis=1, thresh=null_thresh)

  # Clean Up Column Data

  # Host Response Time
  # 1 = responds within a day or faster, 0 = takes a few days or more
  df['host_response_time'] = df['host_response_time'].replace({
      'within an hour': 1,
      'within a few hours': 1,
      'within a day': 1,
      'a few days': 0,
      'a few days or more': 0,
      })
  df = df.rename(columns={'host_response_time': 'host_same_day_response'})

  # Neighborhood Clean Up
  # Anything with less than 100 listings changed to other
  small_neighbourhoods = [
      'Redbridge', 'Bexley', 'Harrow', 'Barking and Dagenham',
      'Kingston upon Thames', 'Hillingdon', 'Sutton', 'Havering',
      ]
  df['neighbourhood'] = df['neighbourhood'].replace(
      dict.fromkeys(small_neighbourhoods, 'Other'))

  # Property Type Clean Up
  # Remove Non-Housing Options
  df = df[df['prop_type'] != 'Parking Space'].copy()
  # Reduce Options
  df['prop_type'] = df['prop_type'].replace({
      'Townhouse': 'House',
      'Loft': 'Apartment',
      'Dorm': 'Other',
      'Guesthouse': 'House',
      'Serviced apartment': 'Apartment',
      'Condominium': 'House',
      'Cabin': 'House',
      'Hostel': 'Hotel',
      'Bungalow': 'House',
      'Boutique hotel': 'Hotel',
      'Villa': 'House',
      'Camper/RV': 'Camping',
      'Castle': 'Other',
      'Yurt': 'Camping',
      'Chalet': 'House',
      'Hut': 'Camping',
      'Tent': 'Camping',
      'Ryokan (Japan)': 'Hotel',
      'Cave': 'Other',
      })

  # Room Type Clean Up
  # Change Options for Clarity
  df['room_type'] = df['room_type'].replace({
      'Entire home/apt': 'Entire Location',
      })

  # Cancellation Policy Clean Up
  df['cancellation_policy'] = df['cancellation_policy'].replace({
      'super_strict_30': 'strict'
      })

  # # Feature Engineering

  # Amenities Expanded
  # Clean Amenities Text
  df['amenities'] = df['amenities'].apply(clean_string_list)

  # New column name -> text searched for in the cleaned amenities string.
  # The columns are created in this order.
  amenity_patterns = {
      '24_hr_checkin': '24-hour',
      'air_conditioning': 'conditioning',
      'breakfast': 'breakfast',
      'cable': 'cable',
      'cats_allowed': 'cat',
      'dogs_allowed': 'dog',
      'doorman': 'doorman',
      'dryer': 'dryer',
      'elevator': 'elevator',
      'family/kid_friendly': 'family/kid',
      'free_parking': 'free parking',
      'gym': 'gym',
      'heating': 'heating',
      'hot_tub': 'hot tub',
      'fireplace': 'fireplace',
      'internet': 'internet',
      'keypad': 'keypad',
      'kitchen': 'kitchen',
      'laptop_friendly_workspace': 'laptop',
      'lock_on_bedroom_door': 'bedroom door',
      'lockbox': 'lockbox',
      'pets_allowed': 'pet',
      'pool': 'pool',
      'private_entrance': 'entrance',
      'private_living_room': 'living room',
      'safety_card': 'safety',
      'self_checkin': 'self',
      'smartlock': 'smartlock',
      'smoking_allowed': 'smoking',
      'events?': 'events',
      'tv': 'tv',
      'washer': 'washer',
      'wheelchair_access': 'wheelchair',
      }

  # Create one Yes (1) / No (0) integer column per amenity directly on the
  # local dataframe. Assigning the column creates it, so no separate
  # column-initialisation step on a global dataframe is needed.
  for column, pattern in amenity_patterns.items():
    df[column] = df['amenities'].str.contains(pattern).astype(int)

  # Drop Source Columns for Feature Engineering
  source_cols = ['amenities',]
  df = df.drop(columns=source_cols)

  # Drop Rows with NaNs
  df = df.dropna()

  return df

In [None]:
# Create DataFrame
# Reads the London listings CSV from the given path and runs the full
# wrangle() pipeline on it; the result is the notebook-level `df`
df = wrangle('/content/airbnb-listings-london.csv')

In [None]:
# Print the wrangled frame's shape and column index, then show the first rows
print(f''' 
DF Shape: {df.shape}
DF Columns: {df.columns}
''')
df.head()

 
DF Shape: (26963, 67)
DF Columns: Index(['id', 'house_rules', 'host_since', 'host_same_day_response',
       'host_response_rate', 'host_listings_count', 'neighbourhood',
       'latitude', 'longitude', 'prop_type', 'room_type', 'accommodates',
       'bathrooms', 'bedrooms', 'beds', 'bed_type', 'price', 'guests_inc',
       'xt_people', 'min_nights', 'max_nights', 'avail_30', 'avail_60',
       'avail_90', 'num_reviews', 'review_scores_rating',
       'review_scores_accuracy', 'review_scores_cleanliness',
       'review_scores_checkin', 'review_scores_communication',
       'review_scores_location', 'review_scores_value', 'cancellation_policy',
       'reviews_per_month', '24_hr_checkin', 'air_conditioning', 'breakfast',
       'cable', 'cats_allowed', 'dogs_allowed', 'doorman', 'dryer', 'elevator',
       'family/kid_friendly', 'free_parking', 'gym', 'heating', 'hot_tub',
       'fireplace', 'internet', 'keypad', 'kitchen',
       'laptop_friendly_workspace', 'lock_on_bedroom_door'

Unnamed: 0,id,house_rules,host_since,host_same_day_response,host_response_rate,host_listings_count,neighbourhood,latitude,longitude,prop_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,price,guests_inc,xt_people,min_nights,max_nights,avail_30,avail_60,avail_90,num_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,cancellation_policy,reviews_per_month,24_hr_checkin,air_conditioning,breakfast,cable,cats_allowed,dogs_allowed,doorman,dryer,elevator,family/kid_friendly,free_parking,gym,heating,hot_tub,fireplace,internet,keypad,kitchen,laptop_friendly_workspace,lock_on_bedroom_door,lockbox,pets_allowed,pool,private_entrance,private_living_room,safety_card,self_checkin,smartlock,smoking_allowed,events?,tv,washer,wheelchair_access
0,5570655,We love our flat and appreciate you leaving it...,2015-03-06,1.0,100.0,1,Hammersmith and Fulham,51.472087,-0.20638,Apartment,Entire Location,3,1.0,1.0,1.0,Real Bed,120.0,1,0,2,1125,17,39,69,14,96.0,10.0,9.0,10.0,10.0,10.0,9.0,flexible,0.61,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0
3,11632615,No smoking inside the house. No shoes policy...,2014-06-05,1.0,100.0,2,Hammersmith and Fulham,51.473387,-0.193706,House,Entire Location,6,4.0,3.0,3.0,Real Bed,350.0,7,0,3,1125,7,22,22,3,100.0,10.0,9.0,10.0,10.0,10.0,10.0,strict,1.36,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,1,1,0,0,0,0,1,0,0,0,0,0,0,1,1,0
4,7270274,This is a lovely private flat and we would ask...,2015-07-10,1.0,100.0,3,Hammersmith and Fulham,51.517149,-0.22553,Apartment,Entire Location,2,1.0,1.0,1.0,Real Bed,180.0,2,50,4,1125,29,59,89,4,100.0,10.0,10.0,10.0,10.0,10.0,10.0,strict,0.23,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0
5,6876434,none,2015-06-17,1.0,100.0,1,Hammersmith and Fulham,51.48937,-0.22,Bed & Breakfast,Private room,2,1.0,1.0,1.0,Real Bed,52.0,1,10,2,1125,27,57,87,39,94.0,9.0,10.0,10.0,10.0,9.0,9.0,flexible,1.92,0,0,1,1,0,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0
7,450847,none,2011-11-08,1.0,100.0,1,Hammersmith and Fulham,51.513121,-0.225662,Apartment,Private room,1,1.0,1.0,1.0,Real Bed,48.0,1,0,3,14,27,52,81,60,98.0,10.0,10.0,10.0,10.0,10.0,10.0,moderate,1.06,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0


## Pandas Profiling

The following will need to be uncommented to generate the profile and save the output to a file, because pandas profiling is not displaying the output as a widget properly.

In [None]:
# #### Uncomment to Save #### # 
# profile = ProfileReport(df)
# profile.to_file(output_file='output.html')

## Other Exploration

In [None]:
# Number of rows for each distinct host_listings_count value
df['host_listings_count'].value_counts()

1      21014
2       6785
3       2763
4       1426
5        827
       ...  
141        3
575        2
48         1
57         1
86         1
Name: host_listings_count, Length: 74, dtype: int64

In [None]:
# Current column names of the dataframe
df.columns
# Original
# Index(['ID', 'Listing Url', 'Scrape ID', 'Last Scraped', 'Name', 'Summary',
#        'Space', 'Description', 'Experiences Offered', 'Neighborhood Overview',
#        'Notes', 'Transit', 'Access', 'Interaction', 'House Rules',
#        'Thumbnail Url', 'Medium Url', 'Picture Url', 'XL Picture Url',
#        'Host ID', 'Host URL', 'Host Name', 'Host Since', 'Host Location',
#        'Host About', 'Host Response Time', 'Host Response Rate',
#        'Host Acceptance Rate', 'Host Thumbnail Url', 'Host Picture Url',
#        'Host Neighbourhood', 'Host Listings Count',
#        'Host Total Listings Count', 'Host Verifications', 'Street',
#        'Neighbourhood', 'Neighbourhood Cleansed',
#        'Neighbourhood Group Cleansed', 'City', 'State', 'Zipcode', 'Market',
#        'Smart Location', 'Country Code', 'Country', 'Latitude', 'Longitude',
#        'Property Type', 'Room Type', 'Accommodates', 'Bathrooms', 'Bedrooms',
#        'Beds', 'Bed Type', 'Amenities', 'Square Feet', 'Price', 'Weekly Price',
#        'Monthly Price', 'Security Deposit', 'Cleaning Fee', 'Guests Included',
#        'Extra People', 'Minimum Nights', 'Maximum Nights', 'Calendar Updated',
#        'Has Availability', 'Availability 30', 'Availability 60',
#        'Availability 90', 'Availability 365', 'Calendar last Scraped',
#        'Number of Reviews', 'First Review', 'Last Review',
#        'Review Scores Rating', 'Review Scores Accuracy',
#        'Review Scores Cleanliness', 'Review Scores Checkin',
#        'Review Scores Communication', 'Review Scores Location',
#        'Review Scores Value', 'License', 'Jurisdiction Names',
#        'Cancellation Policy', 'Calculated host listings count',
#        'Reviews per Month', 'Geolocation', 'Features'],
#       dtype='object')

Index(['id', 'house_rules', 'host_since', 'host_same_day_response',
       'host_response_rate', 'host_listings_count', 'neighbourhood',
       'latitude', 'longitude', 'prop_type', 'room_type', 'accommodates',
       'bathrooms', 'bedrooms', 'beds', 'bed_type', 'price', 'guests_inc',
       'xt_people', 'min_nights', 'max_nights', 'avail_30', 'avail_60',
       'avail_90', 'num_reviews', 'first_review', 'last_review',
       'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'cancellation_policy', 'reviews_per_month',
       '24_hr_checkin', 'air_conditioning', 'breakfast', 'cable',
       'cats_allowed', 'dogs_allowed', 'doorman', 'dryer', 'elevator',
       'family/kid_friendly', 'free_parking', 'gym', 'heating', 'hot_tub',
       'fireplace', 'internet', 'keypad', 'kitchen',
       'laptop_friendly_workspace', 'lock_on_bedroom_doo

In [None]:
# Inspect the house rules text column
df['house_rules']

0        We love our flat and appreciate you leaving it...
1        No parties or loud music , guest can smoke on ...
2                                                     none
3        No smoking inside the house.   No shoes policy...
4        This is a lovely private flat and we would ask...
                               ...                        
41633    - The price is intended per night - This is a ...
41634    NO GATHERINGS, PARTIES OR BUSINESS OF ANY KIND...
41635                 No Smoking No Parties  No Gatherings
41636                                                 none
41637    Rental Agreement Please read this Rental Agree...
Name: house_rules, Length: 41637, dtype: object

In [None]:
# Number of rows for each bed type
df['bed_type'].value_counts()

Real Bed         40821
Pull-out Sofa      495
Futon              199
Couch               85
Airbed              37
Name: bed_type, dtype: int64

In [None]:
# Number of rows for each room type
df['room_type'].value_counts()

Entire Location    22496
Private room       18671
Shared room          470
Name: room_type, dtype: int64

In [None]:
# Number of rows for each room type (same expression as the previous cell)
df['room_type'].value_counts()

Entire Location    22496
Private room       18671
Shared room          470
Name: room_type, dtype: int64

In [None]:
# Number of rows for each cancellation policy
df['cancellation_policy'].value_counts()

strict      18741
flexible    12636
moderate    10260
Name: cancellation_policy, dtype: int64

In [None]:
# Check the Python type of the entry at index label 0 in both review-date columns
print(type(df['last_review'][0]), type(df['first_review'][0]))

<class 'str'> <class 'str'>


In [None]:
# Count the missing values in last_review
df['last_review'].isnull().sum()

8953

In [None]:
# Number of rows for each distinct last_review value
df['last_review'].value_counts()

2017-02-26    1380
2017-01-01     962
2017-01-02     923
2017-02-19     843
2017-02-27     757
              ... 
2014-07-11       1
2014-01-04       1
2013-11-06       1
2015-01-15       1
2014-05-16       1
Name: last_review, Length: 980, dtype: int64

In [None]:
# (df['host_listings_count'] == df['host_total_listings_count']).value_counts()
# True    41637
# dtype: int64

# Data Visualizations

# Tokenize and Vectorize House Rules

## Tokenize

In [None]:
# Load the pretrained spaCy pipeline into `nlp` (used by tokenize())
# NOTE(review): the install cell above references en_core_web_md while this
# loads en_core_web_sm — confirm which model is intended
nlp = spacy.load('en_core_web_sm')

In [None]:
# Clean and Tokenize House Rules
# house_rules is overwritten with the output of clean_text(); the token
# lists produced from that cleaned text go into a new house_rules_tokens column
df['house_rules'] = df['house_rules'].apply(clean_text)
df['house_rules_tokens'] = df['house_rules'].apply(tokenize)

In [None]:
# Inspect the house rules column after cleaning
df['house_rules']

0        we love our flat and appreciate you leaving it...
1        no parties or loud music guest can smoke on th...
2                                                     none
3        no smoking inside the house no shoes policy wh...
4        this is a lovely private flat and we would ask...
                               ...                        
41633     the price is intended per night this is a str...
41634    no gatherings parties or business of any kind ...
41635                  no smoking no parties no gatherings
41636                                                 none
41637    rental agreement please read this rental agree...
Name: house_rules, Length: 41637, dtype: object

In [None]:
# Inspect the token lists built from the house rules
df['house_rules_tokens']

0        [love, flat, appreciate, leaving, find, respec...
1        [parties, loud, music, guest, smoke, small, ba...
2                                                       []
3        [smoking, inside, house, shoes, policy, leavin...
4                 [lovely, private, flat, ask, way, found]
                               ...                        
41633    [price, intended, night, strictly, non, smokin...
41634    [gatherings, parties, business, kind, allowed,...
41635                       [smoking, parties, gatherings]
41636                                                   []
41637    [rental, agreement, read, rental, agreement, c...
Name: house_rules_tokens, Length: 41637, dtype: object

In [None]:
# Column names after adding the token column
df.columns

Index(['id', 'house_rules', 'host_since', 'host_same_day_response',
       'host_response_rate', 'host_listings_count', 'neighbourhood',
       'latitude', 'longitude', 'prop_type', 'room_type', 'accommodates',
       'bathrooms', 'bedrooms', 'beds', 'bed_type', 'price', 'guests_inc',
       'xt_people', 'min_nights', 'max_nights', 'avail_30', 'avail_60',
       'avail_90', 'num_reviews', 'first_review', 'last_review',
       'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'cancellation_policy', 'reviews_per_month',
       '24_hr_checkin', 'air_conditioning', 'breakfast', 'cable',
       'cats_allowed', 'dogs_allowed', 'doorman', 'dryer', 'elevator',
       'family/kid_friendly', 'free_parking', 'gym', 'heating', 'hot_tub',
       'fireplace', 'internet', 'keypad', 'kitchen',
       'laptop_friendly_workspace', 'lock_on_bedroom_doo

## Vectorize

In [None]:
# Build a TF-IDF Document-Term Matrix from House Rules

# Vectorizer: English stop words removed, at most 5000 features kept
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')

# Fit the vocabulary and compute the TF-IDF weights per document, then
# expand the result into a dense DataFrame with one column per term
dtm = pd.DataFrame(
    tfidf.fit_transform(df['house_rules']).todense(),
    columns=tfidf.get_feature_names_out(),
)

In [None]:
# Print the document-term matrix dimensions, then display the matrix
print(f'dtm shape: {dtm.shape}')
dtm

dtm shape: (41637, 5000)


Unnamed: 0,00,000,00am,00h,00hr,00hrs,00p,00pm,01,06,07,08,0800,09,0sf,0sp,10,100,1000,101,1030pm,107,10am,10m,10pm,11,110,1100,111,1130pm,11a,11am,11pm,12,120,120cm,12am,12midnight,12noon,12pm,...,words,work,worked,worker,workers,working,works,workspace,worktop,worktops,world,worn,worries,worry,worse,worth,wotsapp,wouldn,write,writing,written,wrong,www,x1,xx,yard,year,years,yellow,yes,yo,yoga,yogurt,yorkshire,young,younger,yrs,yup,zero,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
41633,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.091594,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
41634,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
41635,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Visualize Vectorization

# Top 20 terms by column sum of the document-term matrix, with the index
# reset so the terms become a regular column that Plotly can use
top_20_words = (
    dtm.sum(axis=0)
    .sort_values(ascending=False)
    .head(20)
    .to_frame()
    .reset_index()
)
top_20_words.columns = ['Word', 'Count']

# Plot: horizontal bars, coloured by the same value that sets their length
top_20_fig = px.bar(top_20_words, x='Count', y='Word', color='Count')

# Styling: centred title near the top, dark template
top_20_fig.update_layout(
    title=dict(
        text='Top 20 Words',
        y=0.9,
        x=0.5,
        xanchor='center',
        yanchor='top',
    ),
    template='plotly_dark',
)

# Model Building

## Manual Encoding

In [None]:
# Preview the first rows of df before any encoding is applied
df.head()

Unnamed: 0,id,house_rules,host_since,host_same_day_response,host_response_rate,host_listings_count,neighbourhood,latitude,longitude,prop_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,price,guests_inc,xt_people,min_nights,max_nights,avail_30,avail_60,avail_90,num_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,cancellation_policy,reviews_per_month,24_hr_checkin,air_conditioning,breakfast,cable,cats_allowed,dogs_allowed,doorman,dryer,elevator,family/kid_friendly,free_parking,gym,heating,hot_tub,fireplace,internet,keypad,kitchen,laptop_friendly_workspace,lock_on_bedroom_door,lockbox,pets_allowed,pool,private_entrance,private_living_room,safety_card,self_checkin,smartlock,smoking_allowed,events?,tv,washer,wheelchair_access
0,5570655,We love our flat and appreciate you leaving it...,2015-03-06,1.0,100.0,1,Hammersmith and Fulham,51.472087,-0.20638,Apartment,Entire Location,3,1.0,1.0,1.0,Real Bed,120.0,1,0,2,1125,17,39,69,14,96.0,10.0,9.0,10.0,10.0,10.0,9.0,flexible,0.61,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0
3,11632615,No smoking inside the house. No shoes policy...,2014-06-05,1.0,100.0,2,Hammersmith and Fulham,51.473387,-0.193706,House,Entire Location,6,4.0,3.0,3.0,Real Bed,350.0,7,0,3,1125,7,22,22,3,100.0,10.0,9.0,10.0,10.0,10.0,10.0,strict,1.36,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,1,1,0,0,0,0,1,0,0,0,0,0,0,1,1,0
4,7270274,This is a lovely private flat and we would ask...,2015-07-10,1.0,100.0,3,Hammersmith and Fulham,51.517149,-0.22553,Apartment,Entire Location,2,1.0,1.0,1.0,Real Bed,180.0,2,50,4,1125,29,59,89,4,100.0,10.0,10.0,10.0,10.0,10.0,10.0,strict,0.23,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0
5,6876434,none,2015-06-17,1.0,100.0,1,Hammersmith and Fulham,51.48937,-0.22,Bed & Breakfast,Private room,2,1.0,1.0,1.0,Real Bed,52.0,1,10,2,1125,27,57,87,39,94.0,9.0,10.0,10.0,10.0,9.0,9.0,flexible,1.92,0,0,1,1,0,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0
7,450847,none,2011-11-08,1.0,100.0,1,Hammersmith and Fulham,51.513121,-0.225662,Apartment,Private room,1,1.0,1.0,1.0,Real Bed,48.0,1,0,3,14,27,52,81,60,98.0,10.0,10.0,10.0,10.0,10.0,10.0,moderate,1.06,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0


In [None]:
# List the column names available before encoding
df.columns

Index(['id', 'house_rules', 'host_since', 'host_same_day_response',
       'host_response_rate', 'host_listings_count', 'neighbourhood',
       'latitude', 'longitude', 'prop_type', 'room_type', 'accommodates',
       'bathrooms', 'bedrooms', 'beds', 'bed_type', 'price', 'guests_inc',
       'xt_people', 'min_nights', 'max_nights', 'avail_30', 'avail_60',
       'avail_90', 'num_reviews', 'review_scores_rating',
       'review_scores_accuracy', 'review_scores_cleanliness',
       'review_scores_checkin', 'review_scores_communication',
       'review_scores_location', 'review_scores_value', 'cancellation_policy',
       'reviews_per_month', '24_hr_checkin', 'air_conditioning', 'breakfast',
       'cable', 'cats_allowed', 'dogs_allowed', 'doorman', 'dryer', 'elevator',
       'family/kid_friendly', 'free_parking', 'gym', 'heating', 'hot_tub',
       'fireplace', 'internet', 'keypad', 'kitchen',
       'laptop_friendly_workspace', 'lock_on_bedroom_door', 'lockbox',
       'pets_allowed', 

In [None]:
# Count listings per neighbourhood label; encode_df maps each of these labels
# to an integer code
df['neighbourhood'].value_counts()

Westminster               3419
Tower Hamlets             2986
Hackney                   2284
Camden                    2255
Kensington and Chelsea    2136
Southwark                 1935
Islington                 1876
Lambeth                   1753
Hammersmith and Fulham    1560
Wandsworth                1353
Lewisham                   719
Haringey                   671
Brent                      654
Newham                     595
Ealing                     464
Greenwich                  458
Waltham Forest             447
Barnet                     349
Merton                     277
City of London             192
Hounslow                   154
Other                       98
Enfield                     89
Richmond upon Thames        84
Croydon                     82
Bromley                     73
Name: neighbourhood, dtype: int64

In [None]:
# Count listings per property type label; encode_df maps each of these labels
# to an integer code
df['prop_type'].value_counts()

Apartment          20555
House               5661
Bed & Breakfast      511
Other                176
Boat                  35
Hotel                 18
Camping                7
Name: prop_type, dtype: int64

In [None]:
# Count listings per room type label; note the exact spelling in the data
# ('Private room' / 'Shared room', lowercase 'room'), which the encoding in
# encode_df has to match
df['room_type'].value_counts()

Entire Location    14922
Private room       11725
Shared room          316
Name: room_type, dtype: int64

In [None]:
# Count listings per bed type label; encode_df maps each of these labels to an
# integer code
df['bed_type'].value_counts()

Real Bed         26414
Pull-out Sofa      354
Futon              121
Couch               60
Airbed              14
Name: bed_type, dtype: int64

In [None]:
def encode_df(df):
  '''
  Takes a listings DataFrame and returns a new DataFrame in which the
  host-history / review columns and the house rules text columns are
  dropped and each categorical string column is replaced by an integer
  code. The input DataFrame is left unmodified.

  Labels that have no entry in a mapping are left as they are.
  Raises KeyError if a host-history / review column or one of the
  encoded columns is missing from the input.
  '''

  # Drop Unused Columns
  # Existing Host ONLY Columns
  cols_to_drop = [
    'id', 'host_since', 'host_same_day_response',
    'host_response_rate', 'num_reviews', 'review_scores_rating',
    'review_scores_accuracy', 'review_scores_cleanliness',
    'review_scores_checkin', 'review_scores_communication',
    'review_scores_location', 'review_scores_value',
    'reviews_per_month',
  ]
  # drop() returns a new DataFrame, so the caller's df is never mutated
  df = df.drop(columns=cols_to_drop)

  # The text columns are optional, so drop them only when present
  df = df.drop(columns=['house_rules', 'house_rules_tokens'], errors='ignore')

  # Encode Each String Variable with a Corresponding Number
  # NOTE(review): the codes are plain identifiers (e.g. strict=1, flexible=2,
  # moderate=3), but scaling later treats them as magnitudes -- one-hot
  # encoding may be worth considering
  encodings = {
    'neighbourhood': {
      'Tower Hamlets': 1,
      'Westminster': 2,
      'Hackney': 3,
      'Camden': 4,
      'Islington': 5,
      'Kensington and Chelsea': 6,
      'Southwark': 7,
      'Lambeth': 8,
      'Hammersmith and Fulham': 9,
      'Wandsworth': 10,
      'Haringey': 11,
      'Lewisham': 12,
      'Brent': 13,
      'Newham': 14,
      'Waltham Forest': 15,
      'Ealing': 16,
      'Greenwich': 17,
      'Barnet': 18,
      'Merton': 19,
      'City of London': 20,
      'Hounslow': 21,
      'Richmond upon Thames': 22,
      'Croydon': 23,
      'Enfield': 24,
      'Bromley': 25,
      'Other': 26,
    },
    'prop_type': {
      'Apartment': 1,
      'Bed & Breakfast': 2,
      'Boat': 3,
      'Camping': 4,
      'Hotel': 5,
      'House': 6,
      'Other': 7,
    },
    # Keys use the data's own spelling (lowercase 'room'); a second mapping
    # with 'Private Room' / 'Shared Room' never matched anything and was
    # removed
    'room_type': {
      'Entire Location': 1,
      'Private room': 2,
      'Shared room': 3,
    },
    'bed_type': {
      'Real Bed': 1,
      'Airbed': 2,
      'Couch': 3,
      'Futon': 4,
      'Pull-out Sofa': 5,
    },
    'cancellation_policy': {
      'strict': 1,
      'flexible': 2,
      'moderate': 3,
    },
  }

  for column, codes in encodings.items():
    df[column] = df[column].replace(codes)

  return df

In [None]:
# Build the encoded frame; encode_df works on the result of df.drop(), so df
# itself keeps its original columns and string labels
df_encoded = encode_df(df)

In [None]:
# Preview the encoded frame to check the dropped columns and integer codes
df_encoded.head()

Unnamed: 0,host_listings_count,neighbourhood,latitude,longitude,prop_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,price,guests_inc,xt_people,min_nights,max_nights,avail_30,avail_60,avail_90,cancellation_policy,24_hr_checkin,air_conditioning,breakfast,cable,cats_allowed,dogs_allowed,doorman,dryer,elevator,family/kid_friendly,free_parking,gym,heating,hot_tub,fireplace,internet,keypad,kitchen,laptop_friendly_workspace,lock_on_bedroom_door,lockbox,pets_allowed,pool,private_entrance,private_living_room,safety_card,self_checkin,smartlock,smoking_allowed,events?,tv,washer,wheelchair_access
0,1,9,51.472087,-0.20638,1,1,3,1.0,1.0,1.0,1,120.0,1,0,2,1125,17,39,69,2,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0
3,2,9,51.473387,-0.193706,6,1,6,4.0,3.0,3.0,1,350.0,7,0,3,1125,7,22,22,1,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,1,1,0,0,0,0,1,0,0,0,0,0,0,1,1,0
4,3,9,51.517149,-0.22553,1,1,2,1.0,1.0,1.0,1,180.0,2,50,4,1125,29,59,89,1,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0
5,1,9,51.48937,-0.22,2,2,2,1.0,1.0,1.0,1,52.0,1,10,2,1125,27,57,87,2,0,0,1,1,0,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0
7,1,9,51.513121,-0.225662,1,2,1,1.0,1.0,1.0,1,48.0,1,0,3,14,27,52,81,3,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0


## Train Test Split

In [None]:
# Set Target and Features (New Host)
target = 'price'

# Features: every encoded column except the target
X = df_encoded.drop(columns=target)
# Target: the listing price
y = df_encoded[target]


In [None]:
# Scale Data
# Instantiate Scaler
scaler = StandardScaler()

# Scale X
# fit_transform hands back a bare array, so X's index is passed back in;
# without it X would be renumbered 0..n-1 and stop lining up with y, which
# still carries the row labels of df_encoded
# NOTE(review): the scaler is fit on every row before the train/test split
# below, so test rows influence the scaling statistics -- consider fitting on
# X_train only
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

In [None]:
# Train Test Split
# New Host
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable
X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.2, random_state=42
)

## Neural Network Models

In [865]:
# Define Early Stopping for Models
# Shared by every fit call below: stop once the monitored loss has gone
# 3 epochs without improving. The fit calls pass no validation data, so
# 'loss' here is the training loss.
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)

### Model 1

In [866]:
# Model 1: dense stack 256 -> 128 -> 64 -> 1, ReLU on the hidden layers,
# linear output for regression, L1 penalty of 0.1 on every layer's kernel
model1 = models.Sequential([
  # Input layer: one input per feature column
  layers.Dense(
    256,
    activation='relu',
    input_dim=X_train.shape[1],
    kernel_regularizer=l1(0.1),
  ),
  # Hidden layers
  layers.Dense(128, activation='relu', kernel_regularizer=l1(0.1)),
  layers.Dense(64, activation='relu', kernel_regularizer=l1(0.1)),
  # Output layer: single predicted value
  layers.Dense(1, activation='linear', kernel_regularizer=l1(0.1)),
])

# Compile: optimise MSE with Adam, and report MSE and MAE each epoch
model1.compile(
  optimizer='adam',
  loss='mean_squared_error',
  metrics=['mean_squared_error', 'mean_absolute_error'],
)

In [867]:
# Model Summary: print each layer's output shape and parameter count
model1.summary()

Model: "sequential_21"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_72 (Dense)            (None, 256)               13568     
                                                                 
 dense_73 (Dense)            (None, 128)               32896     
                                                                 
 dense_74 (Dense)            (None, 64)                8256      
                                                                 
 dense_75 (Dense)            (None, 1)                 65        
                                                                 
Total params: 54,785
Trainable params: 54,785
Non-trainable params: 0
_________________________________________________________________


In [870]:
# Train Model 1; epochs=500 is only an upper bound because the early-stopping
# callback ends training once the loss stops improving
model1_history = model1.fit(
  X_train,
  y_train,
  epochs=500,
  batch_size=32,
  # Model.fit documents `callbacks` as a list of callback instances
  callbacks=[callback],
)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500


### Model 2

In [871]:
# Model 2: dense stack 256 -> 128 -> 128 -> 64 -> 1, ReLU on the hidden
# layers, linear output, L1 penalty of 0.1 on every layer's kernel
model2 = models.Sequential([
  # Input layer: one input per feature column
  layers.Dense(
    256,
    activation='relu',
    input_dim=X_train.shape[1],
    kernel_regularizer=l1(0.1),
  ),
  # Hidden layers
  layers.Dense(128, activation='relu', kernel_regularizer=l1(0.1)),
  layers.Dense(128, activation='relu', kernel_regularizer=l1(0.1)),
  layers.Dense(64, activation='relu', kernel_regularizer=l1(0.1)),
  # Output layer: single predicted value
  layers.Dense(1, activation='linear', kernel_regularizer=l1(0.1)),
])

# Compile: optimise Huber loss with Adam, and report MSE and MAE each epoch
model2.compile(
  optimizer='adam',
  loss='huber_loss',
  metrics=['mean_squared_error', 'mean_absolute_error'],
)

In [872]:
# Model Summary: print each layer's output shape and parameter count
model2.summary()

Model: "sequential_22"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_76 (Dense)            (None, 256)               13568     
                                                                 
 dense_77 (Dense)            (None, 128)               32896     
                                                                 
 dense_78 (Dense)            (None, 128)               16512     
                                                                 
 dense_79 (Dense)            (None, 64)                8256      
                                                                 
 dense_80 (Dense)            (None, 1)                 65        
                                                                 
Total params: 71,297
Trainable params: 71,297
Non-trainable params: 0
_________________________________________________________________


In [873]:
# Train Model 2; epochs=500 is only an upper bound because the early-stopping
# callback ends training once the loss stops improving
model2_history = model2.fit(
  X_train,
  y_train,
  epochs=500,
  batch_size=32,
  # Model.fit documents `callbacks` as a list of callback instances
  callbacks=[callback],
)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500


### Model 3

In [874]:
# Model 3: dense stack 256 -> 128 -> 128 -> 64 -> 1, ReLU on the hidden
# layers, linear output, L2 penalty of 0.1 on every layer's kernel
model3 = models.Sequential([
  # Input layer: one input per feature column
  layers.Dense(
    256,
    activation='relu',
    input_dim=X_train.shape[1],
    kernel_regularizer=l2(0.1),
  ),
  # Hidden layers
  layers.Dense(128, activation='relu', kernel_regularizer=l2(0.1)),
  layers.Dense(128, activation='relu', kernel_regularizer=l2(0.1)),
  layers.Dense(64, activation='relu', kernel_regularizer=l2(0.1)),
  # Output layer: single predicted value
  layers.Dense(1, activation='linear', kernel_regularizer=l2(0.1)),
])

# Compile: optimise Huber loss with Adam, and report MSE and MAE each epoch
model3.compile(
  optimizer='adam',
  loss='huber_loss',
  metrics=['mean_squared_error', 'mean_absolute_error'],
)

In [875]:
# Model Summary: print each layer's output shape and parameter count
model3.summary()

Model: "sequential_23"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_81 (Dense)            (None, 256)               13568     
                                                                 
 dense_82 (Dense)            (None, 128)               32896     
                                                                 
 dense_83 (Dense)            (None, 128)               16512     
                                                                 
 dense_84 (Dense)            (None, 64)                8256      
                                                                 
 dense_85 (Dense)            (None, 1)                 65        
                                                                 
Total params: 71,297
Trainable params: 71,297
Non-trainable params: 0
_________________________________________________________________


In [876]:
# Train Model 3; epochs=500 is only an upper bound because the early-stopping
# callback ends training once the loss stops improving
model3_history = model3.fit(
  X_train,
  y_train,
  epochs=500,
  batch_size=32,
  # Model.fit documents `callbacks` as a list of callback instances
  callbacks=[callback],
)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500


### Model 4

In [879]:
# Model 4: dense stack 256 -> 128 -> 64 -> 1, ReLU on the hidden layers,
# linear output, L2 penalty of 0.1 on every layer's kernel
model4 = models.Sequential([
  # Input layer: one input per feature column
  layers.Dense(
    256,
    activation='relu',
    input_dim=X_train.shape[1],
    kernel_regularizer=l2(0.1),
  ),
  # Hidden layers
  layers.Dense(128, activation='relu', kernel_regularizer=l2(0.1)),
  layers.Dense(64, activation='relu', kernel_regularizer=l2(0.1)),
  # Output layer: single predicted value
  layers.Dense(1, activation='linear', kernel_regularizer=l2(0.1)),
])

# Compile: optimise MSE with Adam, and report MSE and MAE each epoch
model4.compile(
  optimizer='adam',
  loss='mean_squared_error',
  metrics=['mean_squared_error', 'mean_absolute_error'],
)

In [880]:
# Model Summary: print each layer's output shape and parameter count
model4.summary()

Model: "sequential_24"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_86 (Dense)            (None, 256)               13568     
                                                                 
 dense_87 (Dense)            (None, 128)               32896     
                                                                 
 dense_88 (Dense)            (None, 64)                8256      
                                                                 
 dense_89 (Dense)            (None, 1)                 65        
                                                                 
Total params: 54,785
Trainable params: 54,785
Non-trainable params: 0
_________________________________________________________________


In [881]:
# Train Model 4; epochs=500 is only an upper bound because the early-stopping
# callback ends training once the loss stops improving
model4_history = model4.fit(
  X_train,
  y_train,
  epochs=500,
  batch_size=32,
  # Model.fit documents `callbacks` as a list of callback instances
  callbacks=[callback],
)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500


### Save Best Model

In [886]:
# Save Model to Disk
# NOTE(review): the target is a hard-coded absolute path on one user's
# machine, and the log line below shows the model assets being written
# straight into that folder -- consider a dedicated, relative model directory
model4.save('/Users/xlusc/Downloads')

INFO:tensorflow:Assets written to: /Users/xlusc/Downloads/assets
