# Capstone Workbook 4: Initial Modelling

In [1]:
# Import libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Import data 
airbnb_ldn = pd.read_csv('airbnb_ldn_pp.csv')

In [3]:
# drop 'Unnamed: 0'
airbnb_ldn = airbnb_ldn.drop(columns = 'Unnamed: 0')

In [4]:
airbnb_ldn.isnull().sum()

Listing Title                                            4
Property Type                                            0
City                                                     0
Zipcode                                                  0
Number of Reviews                                        0
Bedrooms                                                 0
Bathrooms                                                0
Max Guests                                               0
Airbnb Superhost                                         0
Cleaning Fee (Native)                                    0
Extra People Fee(Native)                                 0
Check-in Time                                         1757
Minimum Stay                                             0
Latitude                                                 0
Longitude                                                0
Overall Rating                                           0
Airbnb Communication Rating                             

In [5]:
# View data:
airbnb_ldn.head().T

Unnamed: 0,0,1,2,3,4
Listing Title,Cozy 2BR house with a garden view,GuestReady - Amazing home with a private garden,Cosy cottage on Richmond Park,"Entire Flat. Free parking, Garden , Richmond park",Maisonette inbetween Richmond Park and Wimbledon
Property Type,Entire home,Entire home,Entire home,Entire rental unit,Private room in rental unit
City,Greater London,Greater London,Greater London,Greater London,Greater London
Zipcode,SW15 3,SW15 3,SW15 3,SW15 3,SW15 3
Number of Reviews,9,11,1,20,0
Bedrooms,2.0,2.0,1.0,2.0,1.0
Bathrooms,2,1,2,1,1
Max Guests,6,4,3,4,2
Airbnb Superhost,0,1,0,0,0
Cleaning Fee (Native),154.8,0.0,0.0,34.8,0.0


The data has now been cleaned, had some initial EDA completed and been preprocessed. 

Some initial models will now be built, starting with a regression model with a L1 penalty. This will help identify which columns are influencial in predicting the target column.

First, the dataframe will be split into the independent and target variables, using just numerical variables for now:

In [6]:
X = airbnb_ldn.select_dtypes(exclude='object').drop(columns = ['Annual Revenue LTM (Native)', 'Host Listing Count'])
y = airbnb_ldn['Annual Revenue LTM (Native)']

In [7]:
# check for additional nulls that will impact the model:
X.isnull().any().any()

False

In [8]:
X.shape

(32678, 39)

The dependent and target variables will now be split into into train and test sets:

In [9]:
# download required sklearn packages:
from sklearn.model_selection import train_test_split

# splitting data into train and test set:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=42)

In [10]:
print(X_train.shape)
print(y_train.shape)

(21894, 39)
(21894,)


As random samples have been taken, the indexes for the dataframes will be reset:

In [11]:
# reset index for X training set:
X_train.reset_index(inplace=True)

# drop created 'index' column:
X_train.drop(columns='index', inplace=True)

# reset index for X_test:
X_test.reset_index(drop=True, inplace=True)

The index for the target column must also be reset:

In [12]:
# complete the same tranformation for the y_test data:
# convert the series to a dataframe:
y_train = y_train.to_frame()

# reset index
y_train.reset_index(drop=True, inplace=True)

# return column to series
y_train = y_train.squeeze()

Complete the same tranformation for the y_test data:

In [13]:
# convert the series to a dataframe:
y_test = y_test.to_frame()

# reset index
y_test.reset_index(drop=True, inplace=True)

# return column to series
y_test = y_test.squeeze()

Now the train and test datasets have been transferred, the first model can be made:

In [14]:
# import required libraries
from scipy import stats
import statsmodels.api as sm

In [15]:
# initiallly manually add the y-intercept:
X_train_withconstant = sm.add_constant(X_train)
X_test_withconstant = sm.add_constant(X_test)

In [16]:
# 1. instantiate model
myregression = sm.OLS(y_train, X_train_withconstant)

# fit model
myregression_results = myregression.fit()

# Looking at the summary
myregression_results.summary()

0,1,2,3
Dep. Variable:,Annual Revenue LTM (Native),R-squared:,0.695
Model:,OLS,Adj. R-squared:,0.695
Method:,Least Squares,F-statistic:,1426.0
Date:,"Tue, 26 Mar 2024",Prob (F-statistic):,0.0
Time:,16:26:17,Log-Likelihood:,-235780.0
No. Observations:,21894,AIC:,471600.0
Df Residuals:,21858,BIC:,471900.0
Df Model:,35,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.8233,1.508,-0.546,0.585,-3.779,2.133
Number of Reviews,-21.2346,1.619,-13.118,0.000,-24.408,-18.062
Bedrooms,-378.4004,154.932,-2.442,0.015,-682.078,-74.723
Bathrooms,207.9883,141.423,1.471,0.141,-69.211,485.188
Max Guests,727.1200,70.256,10.350,0.000,589.414,864.826
Airbnb Superhost,467.5518,194.784,2.400,0.016,85.760,849.343
Cleaning Fee (Native),39.7579,2.474,16.068,0.000,34.908,44.608
Extra People Fee(Native),-22.0280,10.424,-2.113,0.035,-42.459,-1.597
Minimum Stay,-8.3228,4.305,-1.933,0.053,-16.761,0.115

0,1,2,3
Omnibus:,12890.177,Durbin-Watson:,2.014
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2422215.993
Skew:,1.765,Prob(JB):,0.0
Kurtosis:,54.408,Cond. No.,1.42e+19


The first model has produced fairly positive results. An R^2 value of 0.692 indicates that the approximately 70% of the variance within the Annual Revenue can be explained by the various numerical features.

Let see how this model works with the test data:

A second more advanced linear model will now be produced. This second model will use an L1 penalty.

In [17]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [18]:
# scale the train and test variables:
X_train_scaled = StandardScaler().fit_transform(X_train_withconstant)
X_test_scaled = StandardScaler().fit_transform(X_test_withconstant)

In [19]:
# Instantiate the model:
linreg = LinearRegression()

# Fit the model:
linreg.fit(X_train_scaled, y_train)

# Evaluate model
y_pred = linreg.predict(X_test_scaled)

# Determine the mean squared error:
mse = mean_squared_error(y_test, y_pred)

# Determine the r2 value:
r2 = r2_score(y_test, y_pred)

# Viewing the various independent variables coefficients:
coef = linreg.coef_
feature_names = X_test_withconstant.columns

coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coef})

print(f'The R^2 value for the initial model is: {round(r2, 3)}')

The R^2 value for the initial model is: 0.704


In [20]:
X_test_withconstant

Unnamed: 0,const,Number of Reviews,Bedrooms,Bathrooms,Max Guests,Airbnb Superhost,Cleaning Fee (Native),Extra People Fee(Native),Minimum Stay,Latitude,...,Checkout Time_afternoon,Checkout Time_evening,Checkout Time_late,Checkout Time_morning,Checkout Time_none,Checkout Time_very_early,Listing Type_entire_home,Listing Type_hotel_room,Listing Type_private_room,Listing Type_shared_room
0,1.0,21,3.0,2,5,0,0.0,0.0,4,51.476405,...,0,0,0,1,0,0,1,0,0,0
1,1.0,11,1.0,1,2,0,0.0,0.0,3,51.512630,...,0,0,0,1,0,0,1,0,0,0
2,1.0,82,2.0,2,4,0,51.4,0.0,3,51.513000,...,0,0,0,1,0,0,1,0,0,0
3,1.0,21,1.0,1,2,1,22.9,0.0,1,51.584150,...,0,0,0,1,0,0,0,0,1,0
4,1.0,10,1.0,1,3,0,0.0,0.0,3,51.524870,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10779,1.0,21,3.0,3,6,0,160.4,21.4,5,51.519270,...,0,0,0,1,0,0,1,0,0,0
10780,1.0,12,5.0,4,14,0,0.0,0.0,2,51.526010,...,0,0,0,1,0,0,1,0,0,0
10781,1.0,1,1.0,1,2,0,0.0,0.0,1,51.542240,...,0,0,0,1,0,0,0,0,1,0
10782,1.0,2,2.0,2,3,0,0.0,0.0,3,51.488220,...,0,0,0,0,1,0,1,0,0,0


In [21]:
r2

0.7035428986452991

In [22]:
# Viewing the various independent variables coefficients:
coef = linreg.coef_
feature_names = X_test_withconstant.columns

coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coef})

print(coef_df)

                                              Feature   Coefficient
0                                               const -4.763324e-15
1                                   Number of Reviews -1.289356e+03
2                                            Bedrooms -3.444098e+02
3                                           Bathrooms  1.463764e+02
4                                          Max Guests  1.480008e+03
5                                    Airbnb Superhost  2.014410e+02
6                               Cleaning Fee (Native)  1.661042e+03
7                            Extra People Fee(Native) -1.745921e+02
8                                        Minimum Stay -1.522199e+02
9                                            Latitude  3.621114e+01
10                                          Longitude -7.167789e+01
11                                     Overall Rating -6.401305e+01
12                        Airbnb Communication Rating  6.078184e+01
13                             Airbnb Accuracy R

The above coefficient output shows the range of coefficients of the different indepedent variables in predicting the target variable. This initial model gives a good first insight into which variables are particularly influencial in determining and higher annual revenue, and which are detrimental.

The results will be filtered to show the most influence, both positvely and negatively.

In [23]:
# Looking at the variables with positive coefficients:
coef_df[coef_df['Coefficient'] > 0]

Unnamed: 0,Feature,Coefficient
3,Bathrooms,146.376404
4,Max Guests,1480.00838
5,Airbnb Superhost,201.440968
6,Cleaning Fee (Native),1661.042028
9,Latitude,36.211142
12,Airbnb Communication Rating,60.781836
13,Airbnb Accuracy Rating,25.692461
15,Airbnb Location Rating,590.073195
18,Pets Allowed,147.84966
19,Count Available Days LTM,2775.312987


The above output shows the different numerical independent variables that have a positive correlation with the annual revenue. There are several indepedent varibles, such as 'Occupancy Rate LTM' and 'Average Daily Rate' whose high coefficient is expected; a it would be strange for unoccupied properties or properties that charge little nightly rates to make significant amounts money.

Some useful insights from this first model, of the different rating sub-categories the 'Location Rating' has the highest coefficient. This is suggests that the location is the biggest determining factor amoung the ratin categories for predicting a high revenue. Confirming that location is a very important component when customers are choosing which properties to rent and what price they are willing to pay. Also, of the listing types 'Entire home' has a very high positive correlation, with all the other listing types having a negative correlation. It can be assumed that on the whole, entire places are going to be far more expensive than shared/individual rooms. However, the fact the coefficioent for the entire places is positive and very large, whereas the other listing types have negative coefficients, suggests that customers are happy to pay a significantly larger sum for entire places than the general trend of listing types would predict.

In [24]:
# Looking at the variables with negative coefficients:
coef_df[coef_df['Coefficient']<0]

Unnamed: 0,Feature,Coefficient
0,const,-4.763324e-15
1,Number of Reviews,-1289.356
2,Bedrooms,-344.4098
7,Extra People Fee(Native),-174.5921
8,Minimum Stay,-152.2199
10,Longitude,-71.67789
11,Overall Rating,-64.01305
14,Airbnb Checkin Rating,-146.0486
16,Airbnb Value Rating,-416.5841
17,Airbnb Host ID,-11.0816


Looking at the negative coefficient variables, there are sum suprising inclusions, such as 'Overall Rating' and 'Bedrooms'. Some variables are more immediately apparently, such as 'Minimum Stay', as any customers looking to rent less than the 'Minimum Stay' amount will automatically look else where, with the nature of Airbnb being mainly vacation/short term customers, having a higher minimum stay amount will instantly reduce the amount of applicable customers by a substantial amount, thus reducing annual revenue potential. 

A second Linear Regression model will now be complete. This model will use a L1 penatly, which will alter the coefficients and enable more influence numeric indepedent factors to be identified with ease.

In [25]:
# Import Lasso model:
from sklearn.linear_model import Lasso

In [26]:
# Using the same scaled data that was using the previous model, with the same train test split:

# Assign alpha for regulization strength
alpha = 1
max_iter = 10000
lasso_linreg = Lasso(alpha=alpha, max_iter=max_iter)

# Fit model:
lasso_linreg.fit(X_train_withconstant, y_train)

In [27]:
# Predict model
y_pred = lasso_linreg.predict(X_test_scaled)

r2_2 = r2_score(y_test, y_pred)



In [28]:
# Viewing the various independent variables coefficients:
coef_L1 = lasso_linreg.coef_
feature_names = X_test_withconstant.columns

coef_L1_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coef_L1})

display(coef_L1_df)

Unnamed: 0,Feature,Coefficient
0,const,0.0
1,Number of Reviews,-21.2469
2,Bedrooms,-373.4599
3,Bathrooms,199.0358
4,Max Guests,724.7708
5,Airbnb Superhost,459.0836
6,Cleaning Fee (Native),39.77597
7,Extra People Fee(Native),-22.04653
8,Minimum Stay,-8.309004
9,Latitude,269.1339


In [29]:
# Check which columns have a Coefficient of 0, when using L1 penalty:
coef_L1_df[coef_L1_df['Coefficient'] == 0]

Unnamed: 0,Feature,Coefficient
0,const,0.0
20,Count Blocked Days LTM,-0.0
31,Checkout Time_evening,0.0
32,Checkout Time_late,0.0
34,Checkout Time_none,0.0


These columns can be dropped for future modelling. A 'Linear Regression' function will be made, to improve modelling efficiency:

In [30]:
X

Unnamed: 0,Number of Reviews,Bedrooms,Bathrooms,Max Guests,Airbnb Superhost,Cleaning Fee (Native),Extra People Fee(Native),Minimum Stay,Latitude,Longitude,...,Checkout Time_afternoon,Checkout Time_evening,Checkout Time_late,Checkout Time_morning,Checkout Time_none,Checkout Time_very_early,Listing Type_entire_home,Listing Type_hotel_room,Listing Type_private_room,Listing Type_shared_room
0,9,2.0,2,6,0,154.8,0.0,3,51.43105,-0.26074,...,0,0,0,1,0,0,1,0,0,0
1,11,2.0,1,4,1,0.0,0.0,1,51.43399,-0.25656,...,0,0,0,1,0,0,1,0,0,0
2,1,1.0,2,3,0,0.0,0.0,7,51.43500,-0.25700,...,0,0,0,1,0,0,1,0,0,0
3,20,2.0,1,4,0,34.8,2.5,5,51.43531,-0.25648,...,0,0,0,1,0,0,1,0,0,0
4,0,1.0,1,2,0,0.0,0.0,5,51.43532,-0.25413,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32673,2,1.0,2,2,0,26.9,0.0,3,51.51206,0.12887,...,1,0,0,0,0,0,1,0,0,0
32674,12,1.0,1,2,0,15.8,0.0,1,51.51173,0.13129,...,1,0,0,0,0,0,0,0,1,0
32675,11,1.0,1,2,0,0.0,0.0,2,51.63000,0.01700,...,0,0,0,1,0,0,0,0,1,0
32676,6,1.0,1,2,0,10.3,10.2,3,51.62981,0.01805,...,0,0,0,1,0,0,0,0,1,0


In [31]:
# Log scale all columns, add 1 to avoid taking the log of 0
num_cols_log = np.log(X.drop(columns = ['Longitude', 'Latitude']) + 1)

  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)


In [32]:
# drop zero coefficient columns:
X = X.drop(columns = ['Count Blocked Days LTM', 'Checkout Time_evening', 'Checkout Time_late', 'Checkout Time_none'])

In [33]:
# create linear regression function:
def Linear_Regression(X, y):

    # splitting data into train and test set:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=42)

    # reset index for X training set:
    X_train.reset_index(drop=True, inplace=True)
    X_test.reset_index(drop=True, inplace=True)

    # complete the same tranformation for the y_test data:
    # convert the series to a dataframe:
    y_train = y_train.to_frame()

    # reset index
    y_train.reset_index(drop='index', inplace=True)

    # return column to series
    y_train = y_train.squeeze()

    # convert the series to a dataframe:
    y_test = y_test.to_frame()

    # reset index
    y_test.reset_index(drop=True, inplace=True)

    # return column to series
    y_test = y_test.squeeze()

    # Add constants to the train and test X dataframes:
    X_train = sm.add_constant(X_train)
    X_test = sm.add_constant(X_test)

    # Scale the train and test X dataframes:
    X_train_ss = StandardScaler().fit_transform(X_train)
    X_test_ss = StandardScaler().fit_transform(X_test)

    # Creating the Linear Regression Model

    # Instantiate the model
    linreg = LinearRegression()

    # Fit the model:
    linreg.fit(X_train_ss, y_train)

    # Evaluate the model:
    y_pred = linreg.predict(X_test_ss)

    # Determine the mean squared error
    mse = mean_squared_error(y_test, y_pred)
    print(f"The MSE value is:{mse}")

    # Find the R2 value:
    r2 = r2_score(y_test, y_pred)
    print(f"The r2 values are: {r2}")

    # Viewing the various independent variable coefficients:
    coef = linreg.coef_
    feature_names = X_test.columns

    coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coef})
    return(coef_df)


Additional columns from the variable components will be dropped. Several columns that can be deemed semantically less important to predicting the annual revenue will be dropped. Some columns that are too influencial will also be dropped, such as some of the columns to do with the properties occupancy rate and nightly rate.

In [34]:
Linear_Regression(X, y)

The MSE value is:124965633.15320216
The r2 values are: 0.7035380339008895


Unnamed: 0,Feature,Coefficient
0,const,-3.21547e-15
1,Number of Reviews,-1289.536
2,Bedrooms,-344.9514
3,Bathrooms,146.5313
4,Max Guests,1480.172
5,Airbnb Superhost,201.6556
6,Cleaning Fee (Native),1660.624
7,Extra People Fee(Native),-174.7819
8,Minimum Stay,-152.2785
9,Latitude,36.07614


In [35]:
X_opti = X[['Number of Reviews', 'Bedrooms', 'Bathrooms', 'Airbnb Superhost', 'Minimum Stay', 'Listing Type_entire_home', 'Listing Type_hotel_room', 'Airbnb Location Rating', 'Minimum Stay',
            'Airbnb Superhost', 'Extra People Fee(Native)', 'Cancellation Policy_no_policy', 'Cancellation Policy_medium', 'Pets Allowed', 'Overall Rating', 'Occupancy Rate LTM']]

In [36]:
X_2 = X.drop(columns = ['Longitude', 'Latitude'])

In [37]:
Linear_Regression(X_opti, y)

The MSE value is:309819134.02160186
The r2 values are: 0.2650012064151711


Unnamed: 0,Feature,Coefficient
0,const,0.0
1,Number of Reviews,2767.651899
2,Bedrooms,4481.807614
3,Bathrooms,2885.378475
4,Airbnb Superhost,890.52543
5,Minimum Stay,-566.467422
6,Listing Type_entire_home,4048.609471
7,Listing Type_hotel_room,1136.854193
8,Airbnb Location Rating,1755.352191
9,Minimum Stay,-566.467422


### Question: Which Rating sub-category is most influencial in determining the annual revenue of a property

During the EDA, it was found that many of the numerical columns had a significantly 'right skewed' distribution. Hence, a log transformation was used to make provide a more normally distribution pattern for these columns. 

Several selected columns that are semantically important to determining the annual revenue will be used to with their values log transformed. The selected columns are:
- 'Bathrooms'
- 'Bedrooms'
- 'Airbnb Location Rating'
- 'Airbnb Overall Rating'
- 'Minimum Stay'
- 'Count Reservation Days LTM'

In [38]:
# Dataframe for specified columns
semantic_cols = X[['Bathrooms', 'Bedrooms', 'Airbnb Location Rating', 'Overall Rating', 'Minimum Stay', 'Cleaning Fee (Native)']]

# Create the log columns with +1, so the log of zero isn't taken:
cols_log = np.log(semantic_cols + 1)

In [39]:
# Add some additional binary columns that are semantically relevant
X_v3 = pd.concat([cols_log, X[['Listing Type_entire_home', 'Cancellation Policy_super_strict', 'Listing Type_private_room', 'Pets Allowed',
                                'Airbnb Superhost', 'Cancellation Policy_no_policy']]], axis=1)

In [40]:
Linear_Regression(X_v3, y)

The MSE value is:325832610.2643902
The r2 values are: 0.22701166856200516


Unnamed: 0,Feature,Coefficient
0,const,0.0
1,Bathrooms,2380.337087
2,Bedrooms,4609.237514
3,Airbnb Location Rating,1502.011305
4,Overall Rating,-464.127329
5,Minimum Stay,-3716.577524
6,Cleaning Fee (Native),1746.414633
7,Listing Type_entire_home,1643.769652
8,Cancellation Policy_super_strict,1156.722918
9,Listing Type_private_room,-2213.669007


### Further Specification of Independent Columns

Currently, the independent variables 'X' consist of all numerical columns, except for the target column and a column which has null values present.

More concise selection of the independent variables will occur, with 'log' transformations if necessary.

In [41]:
X.head().T

Unnamed: 0,0,1,2,3,4
Number of Reviews,9.0,11.0,1.0,20.0,0.0
Bedrooms,2.0,2.0,1.0,2.0,1.0
Bathrooms,2.0,1.0,2.0,1.0,1.0
Max Guests,6.0,4.0,3.0,4.0,2.0
Airbnb Superhost,0.0,1.0,0.0,0.0,0.0
Cleaning Fee (Native),154.8,0.0,0.0,34.8,0.0
Extra People Fee(Native),0.0,0.0,0.0,2.5,0.0
Minimum Stay,3.0,1.0,7.0,5.0,5.0
Latitude,51.43105,51.43399,51.435,51.43531,51.43532
Longitude,-0.26074,-0.25656,-0.257,-0.25648,-0.25413


Columns to be dropped that, for semantic reasons, can be dropped for the next model:
- Latitude
- Longitude
- Airbnb Host ID

There are many columns which all describe similar information regarding reservation days/occupancy rates. Strong multicollinearity is present between these columns, therefore only one column regarding this type of information will be included. 'Occupancy Rate LTM' will be left in, the columns to be dropped are:
- Count Available Days LTM	
- Count Blocked Days LTM	
- Count Reservation Days LTM	
- Number of Bookings LTM
- Number of Bookings LTM - Number of observed month	

An updated X dataframe will be created ommitting these columns:

In [42]:
X_v4 = X.drop(columns =['Latitude', 'Longitude', 'Airbnb Host ID', 'Count Available Days LTM', 'Count Reservation Days LTM', 'Number of Bookings LTM', 'Number of Bookings LTM - Number of observed month'])

In [43]:
X_v4

Unnamed: 0,Number of Reviews,Bedrooms,Bathrooms,Max Guests,Airbnb Superhost,Cleaning Fee (Native),Extra People Fee(Native),Minimum Stay,Overall Rating,Airbnb Communication Rating,...,Cancellation Policy_no_policy,Cancellation Policy_strict,Cancellation Policy_super_strict,Checkout Time_afternoon,Checkout Time_morning,Checkout Time_very_early,Listing Type_entire_home,Listing Type_hotel_room,Listing Type_private_room,Listing Type_shared_room
0,9,2.0,2,6,0,154.8,0.0,3,4.1,9.0,...,0,1,0,0,1,0,1,0,0,0
1,11,2.0,1,4,1,0.0,0.0,1,4.8,10.0,...,1,0,0,0,1,0,1,0,0,0
2,1,1.0,2,3,0,0.0,0.0,7,5.0,10.0,...,1,0,0,0,1,0,1,0,0,0
3,20,2.0,1,4,0,34.8,2.5,5,4.8,10.0,...,0,1,0,0,1,0,1,0,0,0
4,0,1.0,1,2,0,0.0,0.0,5,4.0,10.0,...,0,1,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32673,2,1.0,2,2,0,26.9,0.0,3,4.0,10.0,...,0,0,0,1,0,0,1,0,0,0
32674,12,1.0,1,2,0,15.8,0.0,1,4.8,10.0,...,0,0,0,1,0,0,0,0,1,0
32675,11,1.0,1,2,0,0.0,0.0,2,4.8,10.0,...,1,0,0,0,1,0,0,0,1,0
32676,6,1.0,1,2,0,10.3,10.2,3,5.0,10.0,...,0,1,0,0,1,0,0,0,1,0


Of these values, there are numerous continuous numeric values. As discovered during the EDA, many of these continuous variables have significantly skewed distributions. A 'log' transformation for these variables will be completed, to improve their functionality in the subsequent modelling.

Initially, the continuous variables need to separated:

In [44]:
num_cols_log = np.log(num_cols.drop(columns = ['Longitude', 'Latitude', 'Number of Bookings LTM - Number of observed month']) + 1)

NameError: name 'num_cols' is not defined

In [None]:
num_cols_log.head().T

In [None]:
X_v4.head().T

Number of Reviews, Bedrooms bathrooms max guests cleaning fee, extra people, minimum stay, overall rating, all ratings, occupancy rate ltm, average daily rate (native).

In [None]:
X_con = X_v4[['Number of Reviews', 'Bedrooms', 'Bathrooms', 'Max Guests', 'Cleaning Fee (Native)', 
              'Extra People Fee(Native)', 'Minimum Stay', 'Overall Rating', 'Airbnb Communication Rating', 'Airbnb Accuracy Rating',
             'Airbnb Checkin Rating', 'Airbnb Location Rating', 'Airbnb Value Rating', 'Occupancy Rate LTM', 'Average Daily Rate (Native)']]

In [None]:
X_con

In [None]:
# Log scale all columns, add 1 to prevent the log of zero occuring
X_con_log = np.log(X_con + 1)

In [None]:
X_con_log.info()

These 'log' transformed continuous variables need to be be combined into a single dataframe with the binary columns.

In [None]:
X_v4.drop(columns= ['Number of Reviews', 'Bedrooms', 'Bathrooms', 'Max Guests', 'Cleaning Fee (Native)', 
              'Extra People Fee(Native)', 'Minimum Stay', 'Overall Rating', 'Airbnb Communication Rating', 'Airbnb Accuracy Rating',
             'Airbnb Checkin Rating', 'Airbnb Location Rating', 'Airbnb Value Rating', 'Occupancy Rate LTM', 'Average Daily Rate (Native)'], inplace=True)

In [None]:
X_v4 = pd.concat([X_con_log, X_v4], axis=1)

## Model Again Using OLS with Multiple Variables - Sanity Check

In [None]:
X_withconstant = sm.add_constant(X_v4)

In [None]:
y_log

In [None]:
# Follow usual procedure
lm_airbnb = sm.OLS(y_log,X_withconstant)
lm_airbnb_results = lm_airbnb.fit()
lm_airbnb_results.summary()

## Different versions:

In [None]:
# standard X, not scaled or log
X_v4

In [None]:
# standard y, not scaled or log
y

In [None]:
# X scaled
X_ss = StandardScaler().fit_transform(X_v4)
X_ss

In [None]:
# X logg

In [None]:
# y logged
y_log = np.log(y)
y_log

## Various Model Types

Try comparing different versions of the dataframe x and y to see which produces the best R^2 result:

In [None]:
# add constant
X = sm.add_constant(X)

In [None]:
# Standard X and y
lm_airbnb = sm.OLS(y, X)
lm_airbnb_results = lm_airbnb.fit()
lm_airbnb_results.summary()

In [None]:
# Scaled x and standard y 
X_v5 = X.drop(columns = ['Longitude', 'Latitude', 'Airbnb Accuracy Rating', 'Airbnb Checkin Rating', 'Count Available Days LTM',
                        'Count Blocked Days LTM', 'Count Reservation Days LTM', 'Number of Bookings LTM', 'Cleaning Fee (Native)',
                        'Extra People Fee(Native)', 'Average Daily Rate (Native)', 'Cancellation Policy_no_policy'])

In [None]:
# Standard X and y
X_v5_const = sm.add_constant(X_v5)

lm_airbnb = sm.OLS(y, X_v5_const)
lm_airbnb_results = lm_airbnb.fit()
lm_airbnb_results.summary()