# Data Loading and Cleaning

This data is very large, and must be cleaned.

* NaN values must be replaced

* values must be converted into either floats, strings, or integers (check Dtype Warning)

## Data Cleaning

In [115]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [116]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
import joblib
from joblib import dump
import scipy
!pip install category_encoders



In [117]:
import category_encoders as ce
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split

### Loading Data

In [118]:
df = pd.read_csv('/content/drive/My Drive/airbnb_listings_usa.csv')

  interactivity=interactivity, compiler=compiler, result=result)


### Bathroom Cleaning(**DONE**)

In [119]:
df['bathrooms'].isna().sum()
# 206 nans in bathrooms. These will get replaced by 0's

206

In [120]:
df['bathrooms'] = df['bathrooms'].fillna(0)

In [121]:
df['bathrooms'].isna().sum()
# Nan's fixed for this column. Now make sure this can get casted to floats

0

In [122]:
df['bathrooms'].unique()
# all unique values of bathrooms reveals that there's "t" somewhere in there.
# going to clean this and then convert

array(['1.0', '2.0', '2.5', '1.5', '3.0', '12.5', '0.5', '3.5', '4.5',
       '4.0', '0.0', '7.5', '6.5', '5.0', '7.0', '6.0', 0, 't', '9.0',
       '5.5', '8.0', 1.0, 2.0, 2.5, 3.0, 4.5, 1.5, 5.5, 3.5, 4.0, 5.0,
       6.0, 6.5, 7.0, 11.0, 17.0, 0.5, 8.0, 7.5, 20.0, 9.0, 8.5, 11.5,
       21.0, 12.0, 10.0, 12.5, 14.0, 9.5, 10.5, 13.0, 13.5, '13.5',
       '11.0', 15.0, 16.0, '8.5', '15.0', '12.0', '9.5', '14.0', '10.5',
       '19.0', '24.0', '17.0', '16.0', '10.0', '37.0', 14.5, '15.5',
       '27.5', 33.5, 50.0], dtype=object)

In [123]:
df['bathrooms'] = df['bathrooms'].replace('t', 0)

In [124]:
df['bathrooms'].unique()
# all good! going to cast bathrooms as float now

array(['1.0', '2.0', '2.5', '1.5', '3.0', '12.5', '0.5', '3.5', '4.5',
       '4.0', '0.0', '7.5', '6.5', '5.0', '7.0', '6.0', 0, '9.0', '5.5',
       '8.0', 1.0, 2.0, 2.5, 3.0, 4.5, 1.5, 5.5, 3.5, 4.0, 5.0, 6.0, 6.5,
       7.0, 11.0, 17.0, 0.5, 8.0, 7.5, 20.0, 9.0, 8.5, 11.5, 21.0, 12.0,
       10.0, 12.5, 14.0, 9.5, 10.5, 13.0, 13.5, '13.5', '11.0', 15.0,
       16.0, '8.5', '15.0', '12.0', '9.5', '14.0', '10.5', '19.0', '24.0',
       '17.0', '16.0', '10.0', '37.0', 14.5, '15.5', '27.5', 33.5, 50.0],
      dtype=object)

In [125]:
df['bathrooms'] = df['bathrooms'].astype('float')

In [126]:
df['bathrooms'].dtypes
# SUCCESS ! One column cleaned. 

dtype('float64')

### Zip Cleaning(**DONE**)

In [127]:
df.columns[34]

'zipcode'

In [128]:
df['zipcode'].unique()

array(['28804.0', '28801.0', '28806.0', ..., 'DC 20002', 'DC 20006',
       '22312'], dtype=object)

In [129]:
extr = df['zipcode'].str.extract(r'^(\d{4})', expand=False)

In [130]:
df['zipcode'] = extr

In [131]:
df['zipcode'] = df['zipcode'].fillna(0)

In [132]:
df['zipcode'] = df['zipcode'].astype('int')

In [133]:
df['zipcode'].unique()

array([2880,    0, 2871, 2870, 2873, 2881, 2874, 7870, 7872, 7874, 7875,
       7873, 7871, 7865, 7862, 7866, 7868, 7876, 7861,  212,  211,  210,
        213,  221,  246,  216, 3313,  244, 2135, 2116,  219,  214, 3330,
       3301, 3300, 3333, 3302, 3344, 3306, 3307, 3332, 3331, 3335, 3314,
       3343, 3305, 3316,  247, 6063, 6060, 6062, 6061, 6064, 6065, 6066,
       6030, 6041, 6070, 6080, 6053, 6082, 5650, 6020, 6045, 8910, 8917,
       8913, 8912, 8914, 8911, 8901, 8916, 8903, 8918, 8915, 8900, 8908,
       8907, 8902, 8905, 8904, 8644, 8021, 8020, 8023, 8022, 8024, 8012,
       8001, 8011, 8003, 8142, 9672, 9674, 9670, 9675, 9677, 9671, 9679,
       9676, 9678, 9681, 9673, 9682, 9687, 9722, 9626, 1682, 3764, 9575,
       9765, 9764, 9209, 1114, 9861, 9567, 9683, 9081,  730,  703,  731,
        708,  704, 1000, 1003,  700, 1220, 9023, 9150, 9004, 9040, 9070,
       9006, 9136, 9003, 9029, 9080, 9027, 9001, 9024, 9050, 9160, 9002,
       9120, 9000, 9026, 9140, 9134, 9021, 9131, 90

In [134]:
df['zipcode'].dtypes

dtype('int64')

### Square Feet Cleaning(**DONE**)

In [135]:
df.columns[50]

'square_feet'

In [136]:
df['square_feet'] = df['square_feet'].fillna(0)

In [137]:
things_to_drop = ['2020-05-16', '2020-05-08', '2020-05-18', '2020-06-09', '2020-06-08', 
                  '2020-05-21']

In [138]:
df['square_feet'] = df['square_feet'].replace(to_replace =things_to_drop,  
                            value =100) 

In [139]:
df['square_feet'] = df['square_feet'].astype('float')

In [140]:
df['square_feet'].dtypes

dtype('float64')

### Number of reviews Cleaning(**DONE**)

In [141]:
df.columns[66]

'number_of_reviews'

In [142]:
df['number_of_reviews'].unique()

array(['138', '108', '89', ..., 517, 424, 562], dtype=object)

In [143]:
extrac = df['number_of_reviews'].str.extract(r'^(\d{4})', expand=False)

In [144]:
df['number_of_reviews'] = extrac

In [145]:
df['number_of_reviews'] = df['number_of_reviews'].fillna(0)

In [146]:
df['number_of_reviews'] = df['number_of_reviews'].astype('int')

In [147]:
df['number_of_reviews'].dtypes

dtype('int64')

### Number of reviews LTM cleaning(**DONE**)


In [148]:
 df.columns[67]

'number_of_reviews_ltm'

In [149]:
extractr = df['number_of_reviews_ltm'].str.extract(r'^(\d{4})', expand=False)
df['number_of_reviews_ltm'] = extractr
df['number_of_reviews_ltm'] = df['number_of_reviews_ltm'].fillna(0)
df['number_of_reviews_ltm'] = df['number_of_reviews_ltm'].astype('int')

In [150]:
df['number_of_reviews_ltm'].dtypes

dtype('int64')

### Converting the % columns to floats(**DONE**)

In [151]:
df['host_response_rate'] = df['host_response_rate'].fillna(0)

In [152]:
df['host_response_rate'].unique()

array([0, '100%', '80%', '89%', '83%', '90%', '99%', '0%', '50%', '75%',
       '93%', '94%', '11%', '67%', '91%', '95%', '86%', '70%', '20%',
       '88%', '97%', '60%', '98%', '96%', '57%', '92%', '82%', '33%',
       '40%', 'Austin', '71%', '53%', '13%', '85%', '44%', '78%', '87%',
       '39%', '58%', '77%', '73%', '56%', '84%', '81%', '30%', '10%',
       '43%', '29%', '64%', '55%', '62%', '38%', '25%', '63%', '79%',
       '61%', '66%', '74%', '54%', '72%', '59%', '18%', '76%', '41%',
       '42%', '4%', '52%', '37%', '68%', '27%', '16%', '17%', '14%',
       '19%', '49%', '22%', '9%', '69%', '48%', '47%', '3%', '35%', '8%',
       '26%', '65%', '15%', '36%', '46%', '51%', '31%', '21%', '45%',
       '5%', '7%', 'Los Angeles', '23%', '12%', 'Nashville', '32%',
       'Queens', 'New York', '2%', '6%', 'San Diego', '34%', 'Seattle'],
      dtype=object)

In [153]:
things_to_drop2 = ['Los Angeles', 'Austin', 'Nashville', 'Queens', 'New York', 'San Diego', 
                  'Seattle']

In [154]:
df['host_response_rate'] = df['host_response_rate'].replace(to_replace =things_to_drop2,  
                            value =0) 

In [155]:
df['host_response_rate'].unique()

array([0, '100%', '80%', '89%', '83%', '90%', '99%', '0%', '50%', '75%',
       '93%', '94%', '11%', '67%', '91%', '95%', '86%', '70%', '20%',
       '88%', '97%', '60%', '98%', '96%', '57%', '92%', '82%', '33%',
       '40%', '71%', '53%', '13%', '85%', '44%', '78%', '87%', '39%',
       '58%', '77%', '73%', '56%', '84%', '81%', '30%', '10%', '43%',
       '29%', '64%', '55%', '62%', '38%', '25%', '63%', '79%', '61%',
       '66%', '74%', '54%', '72%', '59%', '18%', '76%', '41%', '42%',
       '4%', '52%', '37%', '68%', '27%', '16%', '17%', '14%', '19%',
       '49%', '22%', '9%', '69%', '48%', '47%', '3%', '35%', '8%', '26%',
       '65%', '15%', '36%', '46%', '51%', '31%', '21%', '45%', '5%', '7%',
       '23%', '12%', '32%', '2%', '6%', '34%'], dtype=object)

In [156]:
df['host_response_rate'] = df['host_response_rate'].apply(lambda x: float(x.strip('%'))/100 if isinstance(x, str) else x )

In [157]:
extractr = df['host_acceptance_rate'].str.extract(r'^(\d{4})', expand=False)
df['host_acceptance_rate'] = extractr
df['host_acceptance_rate'] = df['host_acceptance_rate'].fillna(0)
df['host_acceptance_rate'] = df['host_acceptance_rate'].astype('int')

In [158]:
print(df['host_response_rate'].dtypes)

float64


In [159]:
print(df['host_acceptance_rate'].dtypes)

int64


### Convert the price to float(**DONE**)

In [160]:
df['price'] = df['price'].fillna(0)

In [161]:
def clean_currency(x):
    """ If the value is a string, then remove currency symbol and delimiters
    otherwise, the value is numeric and can be converted
    """
    if isinstance(x, str):
        return(x.replace('$', '').replace(',', ''))
    return(x)

In [162]:
df['price'] = df['price'].apply(clean_currency).astype('float')

In [163]:
df['price'].dtypes

dtype('float64')

### Converting text columns to word count columns(**DONE**)

In [164]:
df['name_len'] = df.name.astype(str).apply(lambda x: len(x))
df['summary_len'] = df.summary.astype(str).apply(lambda x: len(x))
df['space_len'] = df.space.astype(str).apply(lambda x: len(x))
df['description_len'] = df.description.astype(str).apply(lambda x: len(x))
df['neighborhood_overview_len'] = df.neighborhood_overview.astype(str).apply(lambda x: len(x))
df['notes_len'] = df.notes.astype(str).apply(lambda x: len(x))
df['transit_len'] = df.transit.astype(str).apply(lambda x: len(x))
df['access_len'] = df.access.astype(str).apply(lambda x: len(x))
df['interaction_len'] = df.interaction.astype(str).apply(lambda x: len(x))
df['house_rules_len'] = df.house_rules.astype(str).apply(lambda x: len(x))
df['host_about_len'] = df.host_about.astype(str).apply(lambda x: len(x))

In [165]:
long_text_columns = ['name','summary','space','description','neighborhood_overview','notes','transit','access','interaction','house_rules','host_about']

In [166]:
df = df.drop(long_text_columns, axis=1)

### Dropping Columns due to Unique values, leaky values, Dtype errors, etc.

In [167]:
var_drop = ['Unnamed: 0', 'host_listings_count', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
 'longitude', 'weekly_price', 'monthly_price', 'availability_60', 'availability_90', 'availability_365',
  'review_scores_communication', 'jurisdiction_names','calculated_host_listings_count' ,'calculated_host_listings_count_entire_homes','require_guest_profile_picture' ,'id',
   'picture_url', 'host_since', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'security_deposit', 'cleaning_fee',
    'extra_people', 'has_availability', 'availability_30', 'calendar_last_scraped', 'first_review', 
    'last_review', 'host_location', 'smart_location', 'country_code', 'country', 'is_location_exact']

In [168]:
def wrangle(X):
    """Wrangle everything in the same way"""
    
    # Prevent SettingWithCopyWarning
    X = X.copy()

    # Drop the columns with 0 feature importance
    more_variance = []

    X = X.drop(columns=var_drop)

    

    # Engineer features!
    
    # return the wrangled dataframe
    return X


In [169]:
df2 = wrangle(df)

In [170]:
for col in df2.columns:
  print(col)
print(df2.shape)

host_response_time
host_response_rate
host_acceptance_rate
host_is_superhost
host_total_listings_count
host_verifications
host_has_profile_pic
host_identity_verified
street
neighbourhood
city
state
zipcode
market
property_type
room_type
accommodates
bathrooms
bedrooms
beds
bed_type
amenities
square_feet
price
guests_included
minimum_nights
maximum_nights
number_of_reviews
number_of_reviews_ltm
review_scores_rating
review_scores_accuracy
review_scores_cleanliness
review_scores_checkin
review_scores_location
review_scores_value
requires_license
instant_bookable
is_business_travel_ready
cancellation_policy
require_guest_phone_verification
calculated_host_listings_count_private_rooms
calculated_host_listings_count_shared_rooms
reviews_per_month
filename
name_len
summary_len
space_len
description_len
neighborhood_overview_len
notes_len
transit_len
access_len
interaction_len
house_rules_len
host_about_len
(243686, 55)


In [171]:
df2.head()

Unnamed: 0,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,street,neighbourhood,city,state,zipcode,market,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,square_feet,price,guests_included,minimum_nights,maximum_nights,number_of_reviews,number_of_reviews_ltm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_location,review_scores_value,requires_license,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_phone_verification,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,filename,name_len,summary_len,space_len,description_len,neighborhood_overview_len,notes_len,transit_len,access_len,interaction_len,house_rules_len,host_about_len
0,,0.0,0,t,1.0,"['email', 'phone', 'facebook', 'reviews', 'off...",t,f,"Asheville, NC, United States",,Asheville,NC,2880,North Carolina Mountains,House,Private room,2.0,1.0,1.0,2.0,Real Bed,"{Internet,Wifi,""Air conditioning"",Kitchen,""Fre...",0.0,50.0,1.0,1.0,3.0,0,0,96.0,10.0,10.0,10.0,10.0,10.0,f,f,f,moderate,t,1.0,0.0,1.18,/Users/jasimrashid/Projects/Datasets/airbnb-1/...,47,472,1000,1000,810,425,53,524,354,1000,1377
1,within an hour,1.0,0,t,14.0,"['email', 'phone', 'reviews', 'jumio', 'offlin...",t,f,"Asheville, NC, United States",,Asheville,NC,2880,North Carolina Mountains,Loft,Entire home/apt,12.0,2.0,3.0,6.0,Real Bed,"{TV,Internet,Wifi,""Air conditioning"",Kitchen,""...",0.0,765.0,6.0,1.0,100.0,0,0,96.0,10.0,10.0,10.0,10.0,9.0,f,t,f,super_strict_60,f,0.0,0.0,1.01,/Users/jasimrashid/Projects/Datasets/airbnb-1/...,16,489,876,1000,364,87,107,3,98,1000,108
2,within an hour,1.0,0,f,2.0,"['email', 'phone', 'facebook', 'reviews', 'off...",t,t,"Asheville, NC, United States",,Asheville,NC,2880,North Carolina Mountains,Apartment,Entire home/apt,2.0,1.0,1.0,1.0,Real Bed,"{Wifi,""Air conditioning"",Kitchen,""Free parking...",0.0,75.0,2.0,30.0,365.0,0,0,90.0,9.0,9.0,10.0,10.0,9.0,f,f,f,strict_14_with_grace_period,f,0.0,0.0,0.84,/Users/jasimrashid/Projects/Datasets/airbnb-1/...,50,491,1000,1000,281,3,190,82,324,181,116
3,within an hour,1.0,0,t,7.0,"['email', 'phone', 'facebook', 'reviews', 'off...",t,f,"Asheville, NC, United States",,Asheville,NC,2880,North Carolina Mountains,Guesthouse,Entire home/apt,2.0,1.0,1.0,1.0,Real Bed,"{Internet,Wifi,""Air conditioning"",Kitchen,""Fre...",0.0,107.0,2.0,1.0,365.0,0,0,90.0,10.0,9.0,10.0,10.0,9.0,f,t,f,moderate,f,2.0,4.0,2.23,/Users/jasimrashid/Projects/Datasets/airbnb-1/...,34,3,467,967,192,72,77,83,71,166,221
4,within an hour,1.0,0,t,7.0,"['email', 'phone', 'facebook', 'reviews', 'off...",t,f,"Asheville, NC, United States",,Asheville,NC,2880,North Carolina Mountains,House,Private room,2.0,2.5,1.0,1.0,Real Bed,"{Internet,Wifi,Kitchen,""Free parking on premis...",0.0,71.0,2.0,1.0,365.0,0,0,90.0,10.0,9.0,10.0,10.0,9.0,f,t,f,moderate,f,2.0,4.0,0.63,/Users/jasimrashid/Projects/Datasets/airbnb-1/...,33,3,570,682,72,3,38,3,3,143,221


At this point, the Dtype warning has been taken care of. The dataframe (DF2) should now only consist of single dtype objects per column. This DF must now be passed through some transformations to do the following:

* Remove NaN values
* Encode Categorical Variables
* Make "t" = 1 and "f" = 0

## Transformers for the DF.


### Changing the Trues and Falses(**DONE**)

In [172]:
df2['host_is_superhost'] = (df2['host_is_superhost'] == 't')
df2['host_has_profile_pic'] = (df2['host_has_profile_pic'] == 't')
df2['host_identity_verified'] = (df2['host_identity_verified'] == 't')
df2['requires_license'] = (df2['requires_license'] == 't')
df2['instant_bookable'] = (df2['instant_bookable'] == 't')
df2['is_business_travel_ready'] = (df2['is_business_travel_ready'] == 't')
df2['require_guest_phone_verification'] = (df2['require_guest_phone_verification'] == 't')

In [173]:
df2['host_is_superhost'] = (df2['host_is_superhost']*1)
df2['host_has_profile_pic'] = (df2['host_has_profile_pic']*1)
df2['host_identity_verified'] = (df2['host_identity_verified']*1)
df2['requires_license'] = (df2['requires_license']*1)
df2['instant_bookable'] = (df2['instant_bookable']*1)
df2['is_business_travel_ready'] = (df2['is_business_travel_ready']*1)
df2['require_guest_phone_verification'] = (df2['require_guest_phone_verification']*1)

In [174]:
df2

Unnamed: 0,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,street,neighbourhood,city,state,zipcode,market,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,square_feet,price,guests_included,minimum_nights,maximum_nights,number_of_reviews,number_of_reviews_ltm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_location,review_scores_value,requires_license,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_phone_verification,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,filename,name_len,summary_len,space_len,description_len,neighborhood_overview_len,notes_len,transit_len,access_len,interaction_len,house_rules_len,host_about_len
0,,0.0,0,1,1.0,"['email', 'phone', 'facebook', 'reviews', 'off...",1,0,"Asheville, NC, United States",,Asheville,NC,2880,North Carolina Mountains,House,Private room,2.0,1.0,1.0,2.0,Real Bed,"{Internet,Wifi,""Air conditioning"",Kitchen,""Fre...",0.0,50.0,1.0,1.0,3.0,0,0,96.0,10.0,10.0,10.0,10.0,10.0,0,0,0,moderate,1,1.0,0.0,1.18,/Users/jasimrashid/Projects/Datasets/airbnb-1/...,47,472,1000,1000,810,425,53,524,354,1000,1377
1,within an hour,1.0,0,1,14.0,"['email', 'phone', 'reviews', 'jumio', 'offlin...",1,0,"Asheville, NC, United States",,Asheville,NC,2880,North Carolina Mountains,Loft,Entire home/apt,12.0,2.0,3.0,6.0,Real Bed,"{TV,Internet,Wifi,""Air conditioning"",Kitchen,""...",0.0,765.0,6.0,1.0,100.0,0,0,96.0,10.0,10.0,10.0,10.0,9.0,0,1,0,super_strict_60,0,0.0,0.0,1.01,/Users/jasimrashid/Projects/Datasets/airbnb-1/...,16,489,876,1000,364,87,107,3,98,1000,108
2,within an hour,1.0,0,0,2.0,"['email', 'phone', 'facebook', 'reviews', 'off...",1,1,"Asheville, NC, United States",,Asheville,NC,2880,North Carolina Mountains,Apartment,Entire home/apt,2.0,1.0,1.0,1.0,Real Bed,"{Wifi,""Air conditioning"",Kitchen,""Free parking...",0.0,75.0,2.0,30.0,365.0,0,0,90.0,9.0,9.0,10.0,10.0,9.0,0,0,0,strict_14_with_grace_period,0,0.0,0.0,0.84,/Users/jasimrashid/Projects/Datasets/airbnb-1/...,50,491,1000,1000,281,3,190,82,324,181,116
3,within an hour,1.0,0,1,7.0,"['email', 'phone', 'facebook', 'reviews', 'off...",1,0,"Asheville, NC, United States",,Asheville,NC,2880,North Carolina Mountains,Guesthouse,Entire home/apt,2.0,1.0,1.0,1.0,Real Bed,"{Internet,Wifi,""Air conditioning"",Kitchen,""Fre...",0.0,107.0,2.0,1.0,365.0,0,0,90.0,10.0,9.0,10.0,10.0,9.0,0,1,0,moderate,0,2.0,4.0,2.23,/Users/jasimrashid/Projects/Datasets/airbnb-1/...,34,3,467,967,192,72,77,83,71,166,221
4,within an hour,1.0,0,1,7.0,"['email', 'phone', 'facebook', 'reviews', 'off...",1,0,"Asheville, NC, United States",,Asheville,NC,2880,North Carolina Mountains,House,Private room,2.0,2.5,1.0,1.0,Real Bed,"{Internet,Wifi,Kitchen,""Free parking on premis...",0.0,71.0,2.0,1.0,365.0,0,0,90.0,10.0,9.0,10.0,10.0,9.0,0,1,0,moderate,0,2.0,4.0,0.63,/Users/jasimrashid/Projects/Datasets/airbnb-1/...,33,3,570,682,72,3,38,3,3,143,221
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243681,within an hour,1.0,0,0,16.0,"['phone', 'reviews', 'jumio', 'offline_governm...",1,0,"Washington, DC, United States",Woodridge,Washington,DC,2001,D.C.,House,Entire home/apt,6.0,2.5,2.0,2.0,Real Bed,"{TV,Wifi,""Air conditioning"",Kitchen,""Free park...",0.0,150.0,1.0,1.0,1125.0,0,0,,,,,,,0,1,0,flexible,0,2.0,0.0,,/Users/jasimrashid/Projects/Datasets/airbnb-1/...,43,458,557,1000,238,3,729,41,91,276,415
243682,,0.0,0,0,0.0,"['email', 'phone']",1,0,"Washington, DC, United States",Brentwood,Washington,DC,2001,D.C.,Guest suite,Entire home/apt,2.0,1.0,0.0,1.0,Real Bed,"{TV,Wifi,""Air conditioning"",Kitchen,""Free park...",0.0,55.0,1.0,1.0,1125.0,0,0,,,,,,,0,1,0,flexible,0,0.0,0.0,,/Users/jasimrashid/Projects/Datasets/airbnb-1/...,32,398,147,918,117,3,168,3,84,3,262
243683,within an hour,1.0,0,0,5.0,"['email', 'phone', 'facebook', 'reviews', 'wor...",1,0,"Washington, DC, United States",Trinidad,Washington,DC,2000,D.C.,Guest suite,Entire home/apt,1.0,1.5,1.0,1.0,Real Bed,"{TV,Wifi,""Air conditioning"",Kitchen,Heating,""F...",0.0,50.0,1.0,1.0,14.0,0,0,,,,,,,0,1,0,flexible,0,2.0,0.0,,/Users/jasimrashid/Projects/Datasets/airbnb-1/...,17,43,54,238,68,3,22,3,47,3,77
243684,,0.0,0,0,0.0,"['email', 'phone', 'offline_government_id', 's...",1,0,"Washington, DC, United States",U Street Corridor,Washington,DC,2000,D.C.,Condominium,Entire home/apt,2.0,2.0,1.0,1.0,Real Bed,"{Wifi,""Air conditioning"",Kitchen,""Pets allowed...",0.0,120.0,1.0,90.0,1125.0,0,0,,,,,,,0,0,0,flexible,0,0.0,0.0,,/Users/jasimrashid/Projects/Datasets/airbnb-1/...,47,470,466,1000,1000,3,400,3,313,72,1015


In [175]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243686 entries, 0 to 243685
Data columns (total 55 columns):
 #   Column                                        Non-Null Count   Dtype  
---  ------                                        --------------   -----  
 0   host_response_time                            172462 non-null  object 
 1   host_response_rate                            243686 non-null  float64
 2   host_acceptance_rate                          243686 non-null  int64  
 3   host_is_superhost                             243686 non-null  int64  
 4   host_total_listings_count                     243423 non-null  float64
 5   host_verifications                            243679 non-null  object 
 6   host_has_profile_pic                          243686 non-null  int64  
 7   host_identity_verified                        243686 non-null  int64  
 8   street                                        243679 non-null  object 
 9   neighbourhood                                 19

### Transform the Data


### Create the Feature matrix and Target vector(**DONE**)

In [176]:
x = df2.drop(columns='price')
y = df2['price']
x3 = x[['zipcode', 'city', 'bathrooms', 'bedrooms', 'beds', 'state', 'property_type', 'square_feet', 'neighbourhood',
        'number_of_reviews', 'review_scores_rating', 'instant_bookable', 'guests_included', 'is_business_travel_ready', 'cancellation_policy',
        'transit_len', 'description_len', 'host_about_len', 'accommodates']].copy()

### Column Transformer with mixed types

In [177]:
x3.head()

Unnamed: 0,zipcode,city,bathrooms,bedrooms,beds,state,property_type,square_feet,neighbourhood,number_of_reviews,review_scores_rating,instant_bookable,guests_included,is_business_travel_ready,cancellation_policy,transit_len,description_len,host_about_len,accommodates
0,2880,Asheville,1.0,1.0,2.0,NC,House,0.0,,0,96.0,0,1.0,0,moderate,53,1000,1377,2.0
1,2880,Asheville,2.0,3.0,6.0,NC,Loft,0.0,,0,96.0,1,6.0,0,super_strict_60,107,1000,108,12.0
2,2880,Asheville,1.0,1.0,1.0,NC,Apartment,0.0,,0,90.0,0,2.0,0,strict_14_with_grace_period,190,1000,116,2.0
3,2880,Asheville,1.0,1.0,1.0,NC,Guesthouse,0.0,,0,90.0,1,2.0,0,moderate,77,967,221,2.0
4,2880,Asheville,2.5,1.0,1.0,NC,House,0.0,,0,90.0,1,2.0,0,moderate,38,682,221,2.0


In [178]:
numeric_features = ['accommodates', 'bathrooms', 'bedrooms', 'beds', 'square_feet', 'number_of_reviews', 'review_scores_rating',
                     'guests_included', 'transit_len', 'description_len', 'host_about_len']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['zipcode', 'city', 'state', 'property_type', 'neighbourhood', 'instant_bookable',
                         'is_business_travel_ready', 'cancellation_policy']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ordinal', OrdinalEncoder())])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [179]:
X_train, X_test, y_train, y_test = train_test_split(x3, y, test_size=0.2)

In [180]:
X_tra = preprocessor.fit_transform(X_train)

In [181]:
X_tes = preprocessor.fit_transform(X_test)

In [182]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [187]:
model = keras.Sequential()
model.add(layers.Dense(64, activation="relu", input_shape=[19]))
model.add(layers.Dense(64, activation="relu"))
model.add(layers.Dense(1, activation='linear'))

optimizer = tf.keras.optimizers.RMSprop(0.001)

model.compile(loss='mse',
              optimizer=optimizer,
              metrics=['mae', 'mse'])

In [188]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 64)                1280      
_________________________________________________________________
dense_13 (Dense)             (None, 64)                4160      
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 65        
Total params: 5,505
Trainable params: 5,505
Non-trainable params: 0
_________________________________________________________________


In [189]:
print(type(X_tra))
print(type(X_tes))
print(type(y_train))
print(type(y_test))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


In [190]:
model.fit(X_tra, y_train, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f9d96b73470>

In [191]:
preds = model.predict(X_tes)

In [192]:
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import r2_score as r2

In [197]:
type(y_test)

pandas.core.series.Series

In [198]:
type(preds)

numpy.ndarray

In [193]:
mae(y_test, preds)

158.76588275718368

In [194]:
r2(y_test, preds)

0.06961478821716482

In [195]:
X_tes[0].shape

(19,)

In [196]:
model.predict(X_tes[[0]])

array([[176.37329]], dtype=float32)

## Save and load