In [1]:
# Dataframe
import numpy as np
import pandas as pd
from math import sqrt

# Preprocessing
import category_encoders as ce

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
from plotly import tools
init_notebook_mode(connected=True)  
import plotly.figure_factory as ff
sns.set_style("whitegrid")

# Outlier Detection
from scipy import stats

# Sklearn utilities
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.inspection import permutation_importance

# Regression model
from catboost import Pool, CatBoostRegressor, cv
from catboost.utils import eval_metric



In [2]:
df = pd.read_csv('listings.csv')

In [3]:
null_value_stats = df.isnull().sum()
null_value_stats[null_value_stats != 0]

name                    2
last_review          2758
reviews_per_month    2758
dtype: int64

In [4]:


# Unecessary features to predict price
df.drop(['name','id','host_name', 'host_id', 'last_review'], inplace=True, axis=1)

# Impute missing values to 0
df["reviews_per_month"] = df["reviews_per_month"].fillna(0)



In [5]:


# Specify all numeric features for outlier detection
numeric_features = ['latitude','longitude','price',
                    'minimum_nights','number_of_reviews',
                    'calculated_host_listings_count','availability_365']



In [6]:


# Remove outlier from target value (price)
outlier = (np.abs(stats.zscore(df["price"]))<0.7)
outlier_ix = np.where(outlier==False)
df.drop(index=outlier_ix[0], inplace=True)



In [7]:


outliers = [] 

# Check outliers using zscore
for lat in df['latitude']:
    zscore = (lat - np.mean(df['latitude'])) / np.std(df['latitude'])
    if zscore > 3:
        outliers.append(lat)
        
print(len(outliers))



171


In [8]:
df_cleaned = df.replace(outliers, np.median(df['latitude']))

In [9]:
df_cleaned = df[df['minimum_nights'] <= 365]

In [10]:
room_dict ={
    'Entire home/apt': 1,
    'Private room': 2,
    'Shared room': 3
}

df_cleaned['room_type'] = df_cleaned['room_type'].map(room_dict)

In [11]:


# Binary encoding for neighbourhood
binary = ce.BinaryEncoder(cols=['neighbourhood'])
df_cleaned = binary.fit_transform(df_cleaned)




is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead



In [40]:


# Specify features and target values
X = df_cleaned.iloc[:,[0,1,2,3,4,5,6,7,8,9,10,12,13,14,15]].values
y = df_cleaned.iloc[:,11:12].values



In [41]:
label = LabelEncoder()
X[:,0] = label.fit_transform(X[:,0])

In [42]:


# Split model into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

# Utilize pooling from Catboost
train_pool = Pool(X_train, y_train) 
test_pool = Pool(X_test, y_test)



In [43]:


# Specify hyperparameters for the model
params = {
    'iterations': 6000,
    'learning_rate': 0.002,
    'random_seed': 42,
    'logging_level': 'Silent',
    'early_stopping_rounds': 500
}



In [44]:


# Define regression model using the specified hyperparameters
model = CatBoostRegressor(**params)



In [45]:


# Train the model and check plot its training data
model.fit(
    train_pool, 
    eval_set=test_pool, 
    verbose=False, 
    plot=True
)



Custom logger is already specified. Specify more than one logger at same time is not thread safe.

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRegressor at 0x1ea405a1a60>

In [47]:
preds = model.predict(X_test)
mae = mean_absolute_error(y_test, preds)
mse = mean_squared_error(y_test, preds)
rmse = sqrt(mean_squared_error(y_test, preds))
r2 = r2_score(y_test, preds)

print("MAE: " + str(mae))
print("RMSE: " + str(rmse))
print("MSE: " + str(mse))
print("R2: " + str(r2))

MAE: 38.32246931980263
RMSE: 53.30700611276851
MSE: 2841.6369007067397
R2: 0.6102152219767978
