# Scenario
**Chicago Airbnb**

You and a group of friends are
considering purchasing a property in
Chicago that you can use as an
investment. You have heard from other
people that they have made a lot of
money by renting out either a room or
an entire unit (apartment or house). Your
friends ask you to analyze data so that
they can understand how much you
would charge per night based on the
type of dwelling you were to purchase.

Time to test the model and optimization of the price on the 100 samples set aside at the beginning of the project.

# Imports

In [57]:
import numpy as np
import pandas as pd
import joblib
import numpy as np
import pandas as pd
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.svm import SVR, LinearSVR
from sklearn.tree import DecisionTreeRegressor
from sqlalchemy import create_engine
from tqdm import tqdm
tqdm.pandas()

import utils

# Load the data

In [58]:
raw_test = pd.read_csv('live_test_data.csv')
raw_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              100 non-null    int64  
 1   name                            100 non-null    object 
 2   host_id                         100 non-null    int64  
 3   host_name                       100 non-null    object 
 4   neighbourhood_group             0 non-null      float64
 5   neighbourhood                   100 non-null    object 
 6   latitude                        100 non-null    float64
 7   longitude                       100 non-null    float64
 8   room_type                       100 non-null    object 
 9   price                           100 non-null    int64  
 10  minimum_nights                  100 non-null    int64  
 11  number_of_reviews               100 non-null    int64  
 12  last_review                     82 no

# Data Preparation

In [59]:
df = utils.prepare_data(raw_test)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 97 entries, 0 to 99
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   neighbourhood                 97 non-null     category
 1   latitude                      97 non-null     float64 
 2   longitude                     97 non-null     float64 
 3   room_type                     97 non-null     category
 4   price                         97 non-null     int64   
 5   log_days_since_last_review    97 non-null     float64 
 6   log_reviews_per_month         97 non-null     float64 
 7   log_number_of_reviews         97 non-null     float64 
 8   log_price                     97 non-null     float64 
 9   log_minimum_nights            97 non-null     float64 
 10  log_host_listings_count       97 non-null     float64 
 11  log_nights_booked             97 non-null     float64 
 12  minimum_booking_price         97 non-null     float64 
 1

# Target and Features

In [60]:
X, y = df.drop(columns='log_nights_booked'), df['log_nights_booked']

# Test the model

In [61]:
mse = mean_squared_error(y, utils.pipe.predict(X))
mae = mean_absolute_error(y, utils.pipe.predict(X))
r2 = r2_score(y, utils.pipe.predict(X))
print(f'MSE: {mse}, MAE: {mae}, R2: {r2}')

MSE: 0.5484448301026602, MAE: 0.504924145044974, R2: 0.43924989442447004


# Optimize the price

In [62]:
dropped = raw_test[~raw_test.index.isin(df.index)]
dropped

Unnamed: 0,id,name,host_id,host_name,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,log_days_since_last_review,log_reviews_per_month,log_number_of_reviews
43,39551455,Luxurious Private Suite with Lake View,303686027,Viceroy,Near North Side,41.90148,-87.62849,Private room,595,1,0,NaT,0.0,9,72,-1.0,-1.0,-1.0
55,36814026,"The Blue Room: Queer Host, Dog, Biker friendly!",25901089,Jaq,Albany Park,41.97198,-87.7188,Private room,23,2,7,2020-08-29,0.5,2,122,2.100715,-0.221849,0.851258
77,1755737,"Lake View 2BR/2Bath w/ Pool, Gym. Annual lease.",9233414,Ravi,Lake View,41.94036,-87.64068,Entire home/apt,155,365,0,NaT,0.0,1,363,-1.0,-1.0,-1.0


In [56]:
raw_test['best_revenue'], raw_test['best_price'], raw_test['best_bookings'] = zip(*X.progress_apply(utils.optimize_income, axis=1))
raw_test

 45%|████▌     | 44/97 [00:11<00:13,  3.85it/s]


KeyboardInterrupt: 