In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopy
from geopy.distance import vincenty
import sklearn
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import explained_variance_score, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
import xgboost as xgb
from xgboost import plot_importance

%matplotlib inline

In [2]:
# Load the listings table from a local pickle (presumably the output of an
# earlier preprocessing step, going by the filename — confirm).
# NOTE(review): read_pickle unpickles arbitrary Python objects, so only load
# pickle files that come from a trusted source.
df = pd.read_pickle('processed_data.pkl')

In [3]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,listing_url,id,host_response_time,host_response_rate,host_is_superhost,host_neighbourhood,neighbourhood_cleansed,latitude,longitude,property_type,...,Free parking on premises,Wide hallways,Other pet(s),Host greets you,lat_long,palace_dist,ginza_dist,temple_dist,park_dist,skytree_dist
0,https://www.airbnb.com/rooms/197677,197677,within a day,100.0,1,Sumida District,Sumida Ku,35.71721,139.82596,Apartment,...,0,0,0,0,"(35.717209999999994, 139.82596)",7513.579496,7417.810752,2661.08476,4696.409605,1590.312028
1,https://www.airbnb.com/rooms/776070,776070,within an hour,100.0,1,Kita District,Arakawa Ku,35.73818,139.77009,House,...,0,0,0,0,"(35.73818, 139.77008999999998)",6082.978847,7438.766049,3539.136814,2542.453677,4817.368015
2,https://www.airbnb.com/rooms/905944,905944,within an hour,95.0,1,Shibuya District,Shibuya Ku,35.67968,139.67949,Apartment,...,0,0,0,0,"(35.67968, 139.67949)",6664.499962,7933.153958,11301.364119,9440.211966,12346.096121
3,https://www.airbnb.com/rooms/1016831,1016831,within a few hours,100.0,1,Shimokitazawa,Setagaya Ku,35.65833,139.67153,House,...,0,0,0,0,"(35.65833, 139.67153000000002)",7938.89733,8716.697715,12947.277243,11244.471333,13845.629774
4,https://www.airbnb.com/rooms/1096292,1096292,within a day,100.0,1,Shinjuku District,Shinjuku Ku,35.69098,139.70618,House,...,0,0,0,1,"(35.690979999999996, 139.70618000000002)",4268.373312,5885.015482,8607.893109,6721.937428,9694.372231


## Model Preparation

In [4]:
# Modelling sample: keep only rows whose price is below 150 and whose
# `accommodates` value is at most 10.
model1 = df.loc[df['price'].lt(150) & df['accommodates'].le(10)]

In [5]:
# Columns excluded from the feature matrix (this includes the target, 'price').
drop_cols = [
    'listing_url', 'id', 'host_response_time', 'neighbourhood_cleansed',
    'amenities', 'availability_30', 'availability_365', 'has_availability',
    'last_scraped', 'last_review', 'price', 'booked_days', 'monthly_revenue',
    'lat_long',
]

# Target vector: the price column of the filtered sample.
y = model1['price'].copy()

# Feature matrix: everything except the excluded columns. The pre-encoding
# column names are kept in `final_cols` before one-hot encoding.
X = model1.drop(columns=drop_cols)
final_cols = X.columns.tolist()

# One-hot encode the remaining categorical columns, dropping the first level
# of each.
X = pd.get_dummies(X, drop_first=True)

In [6]:
# Second-pass feature removal list: column labels (post one-hot encoding) that
# are dropped from the feature matrix before modelling.
# Every label appears exactly once; the earlier version repeated
# 'review_scores_accuracy', 'review_scores_communication', 'temple_dist',
# 'park_dist' and 'skytree_dist'.
drop_feats2 = [
    # --- listing-level numeric features ---
    'beds',
    'review_scores_accuracy',
    'review_scores_communication',
    'num_amenities',
    # --- amenity indicator columns ---
    'Air purifier',
    'Toilet paper',
    'Fireplace guards',
    'Soaking tub',
    'Cable TV',
    'Lockbox',
    'Table corner guards',
    'Baby monitor',
    'No stairs or steps to enter',
    'Luggage dropoff allowed',
    'Dog(s)',
    'Wide hallways',
    'Fixed grab bars for shower',
    'First aid kit',
    'Flat path to guest entrance',
    'Wide entrance',
    'Bedroom comforts',
    'Cat(s)',
    'Air conditioning',
    'Microwave',
    'Walk-in shower',
    'Hot water kettle',
    'Free street parking',
    'Suitable for events',
    'Bidet',
    'Netflix',
    'Pets live on this property',
    'Terrace',
    'Other pet(s)',
    'Accessible-height toilet',
    'Lake access',
    'Bathroom essentials',
    'Balcony',
    'Carbon monoxide detector',
    'Ground floor access',
    'Beachfront',
    'Hair dryer',
    'Crib',
    'EV charger',
    'Wheelchair accessible',
    'Stair gates',
    'Memory foam mattress',
    'Kitchenette',
    'Family/kid friendly',
    'Private living room',
    'Extra space around bed',
    'Wide doorway to guest bathroom',
    'Breakfast table',
    'Dishwasher',
    'Bathtub with bath chair',
    'Roll-in shower',
    'Smart TV',
    'Cooking basics',
    'Children’s dinnerware',
    'Mobile hoist',
    'Essentials',
    'Well-lit path to entrance',
    'Changing table',
    # --- landmark distance columns ---
    'temple_dist',
    'park_dist',
    'skytree_dist',
    # --- one-hot encoded host neighbourhood levels ---
    'host_neighbourhood_Akasaka',
    'host_neighbourhood_Chiyoda District',
    'host_neighbourhood_Chūō District',
    'host_neighbourhood_Edogawa District',
    'host_neighbourhood_Ikuno-ku',
    'host_neighbourhood_Kita District',
    'host_neighbourhood_Kita-ku',
    'host_neighbourhood_Minami-ku',
    'host_neighbourhood_Nakameguro',
    'host_neighbourhood_Nakano District',
    'host_neighbourhood_Nanfang Shangcheng',
    'host_neighbourhood_Nishi-ku',
    'host_neighbourhood_Setagaya District',
    'host_neighbourhood_Shinagawa District',
    'host_neighbourhood_Soho',
    'host_neighbourhood_Toshima District',
    # --- one-hot encoded property / bed type levels ---
    'property_type_Dome house',
    'property_type_Hotel',
    'property_type_Other',
    'bed_type_Futon',
    'bed_type_Pull-out Sofa',
    'bed_type_Real Bed',
    # --- host, booking-rule and review features ---
    'host_response_rate',
    'extra_people',
    'minimum_nights',
    'maximum_nights',
    'number_of_reviews',
    'number_of_reviews_ltm',
    'review_scores_rating',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_location',
    'review_scores_value',
    'reviews_per_month',
    'neg_score',
    'neu_score',
    'since_last_review',
    # --- remaining location and score columns ---
    'palace_dist',
    'ginza_dist',
    'latitude',
    'longitude',
    'pos_score',
]

In [7]:
# Remove the hand-picked features first, then drop whichever column sits at
# position 6 of the frame that remains (the order of these two steps matters).
# NOTE(review): the positional index depends on the current column order —
# confirm which feature it is meant to remove.
X = X.drop(columns=drop_feats2)
X = X.drop(columns=X.columns[[6]])

In [8]:
# Normalise every feature name to lower case.
cols = list(map(str.lower, X.columns))
X.columns = cols

In [9]:
# First split: hold out 20% of the rows as the final test set. X and y are
# rebound to the remaining 80%, so re-running this cell on its own would split
# the already-reduced data again.
X, X_test, y, y_test = train_test_split(X, y, test_size=.2, random_state=10)

# Second split: 25% of the remainder becomes the validation set, which gives an
# overall 60/20/20 train/validation/test partition.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.25, random_state=3)

In [10]:
# Report the (rows, columns) shape of each feature split.
for split_name, split_frame in (('X_train', X_train), ('X_val', X_val), ('X_test', X_test)):
    print(f' {split_name} shape : {split_frame.shape}')

 X_train shape : (2544, 123)
 X_val shape : (848, 123)
 X_test shape : (849, 123)


## OLS