In [56]:
# Import libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# import required libraries
from scipy import stats
import statsmodels.api as sm

In [109]:
# Import data 
airbnb_ldn = pd.read_csv('airbnb_ldn_pp.csv')

In [110]:
# drop 'Unnamed: 0'
airbnb_ldn = airbnb_ldn.drop(columns = 'Unnamed: 0')

In [111]:
X = airbnb_ldn.select_dtypes(exclude='object').drop(columns = ['Annual Revenue LTM (Native)', 'Host Listing Count'])
y = airbnb_ldn['Annual Revenue LTM (Native)']

In [52]:
# download required sklearn packages:
from sklearn.model_selection import train_test_split

In [122]:
def Linear_Regression(X, y):

    # splitting data into train and test set:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=42)

    # reset index for X training set:
    X_train.reset_index(inplace=True)

    # drop created 'index' column:
    X_train = X_train.drop(columns='index').copy()

    X_test.reset_index(inplace=True)

    X_test = X_test.drop(columns='index').copy()

    # complete the same tranformation for the y_test data:
    # convert the series to a dataframe:
    y_train = y_train.to_frame()

    # reset index
    y_train.reset_index(inplace=True)

    # drop 'index' column
    y_train.drop(columns = 'index', inplace=True)

    # return column to series
    y_train = y_train.squeeze()

    # convert the series to a dataframe:
    y_test = y_test.to_frame()

    # reset index
    y_test.reset_index(inplace=True)

    # drop 'index' column
    y_test.drop(columns = 'index', inplace=True)

    # return column to series
    y_test = y_test.squeeze()

    # Add constants to the train and test X dataframes:
    X_train = sm.add_constant(X_train)
    X_test = sm.add_constant(X_test)

    # Scale the train and test X dataframes:
    X_train_ss = StandardScaler().fit_transform(X_train)
    X_test_ss = StandardScaler().fit_transform(X_test)

    # Creating the Linear Regression Model

    # Instantiate the model
    linreg = LinearRegression()

    # Fit the model:
    linreg.fit(X_train_ss, y_train)

    # Evaluate the model:
    y_pred = linreg.predict(X_test_ss)

    # Determine the mean squared error
    mse = mean_squared_error(y_test, y_pred)
    print(f"The MSE value is:{mse}")

    # Find the R2 value:
    r2 = r2_score(y_test, y_pred)
    print(f"The r2 values are: {r2}")

    # Viewing the various independent variable coefficients:
    coef = linreg.coef_
    feature_names = X_test.columns

    coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coef})
    return(coef_df)

In [123]:
Linear_Regression(X,y)

The MSE value is:124963582.54997213
The r2 values are: 0.7035428986452991


Unnamed: 0,Feature,Coefficient
0,const,-4.763324e-15
1,Number of Reviews,-1289.356
2,Bedrooms,-344.4098
3,Bathrooms,146.3764
4,Max Guests,1480.008
5,Airbnb Superhost,201.441
6,Cleaning Fee (Native),1661.042
7,Extra People Fee(Native),-174.5921
8,Minimum Stay,-152.2199
9,Latitude,36.21114


In [116]:
X_opti = X.drop(columns = ['Count Blocked Days LTM', 'Checkout Time_evening', 'Checkout Time_late', 'Checkout Time_none',
                            'Latitude', 'Longitude', 'Airbnb Host ID', 'Checkout Time_none', 'Listing Type_shared_room',
                            'Count Reservation Days LTM', 'Count Available Days LTM', 'Number of Bookings LTM - Number of observed month', 
                            'Checkout Time_very_early', 'Airbnb Checkin Rating'])

In [117]:
Linear_Regression(X_opti,y)

The MSE value is:182349874.1074073
The r2 values are: 0.5674026463777253
                             Feature   Coefficient
0                              const  1.644202e-16
1                  Number of Reviews -1.088645e+03
2                           Bedrooms -4.572137e+02
3                          Bathrooms  2.644209e+02
4                         Max Guests  1.199332e+03
5                   Airbnb Superhost  1.306476e+03
6              Cleaning Fee (Native)  1.986833e+03
7           Extra People Fee(Native)  2.057161e+02
8                       Minimum Stay -3.780394e+02
9                     Overall Rating  4.321471e+02
10       Airbnb Communication Rating  2.520887e+02
11            Airbnb Accuracy Rating  4.230349e+01
12            Airbnb Location Rating  2.190976e+02
13               Airbnb Value Rating -8.982326e+02
14                      Pets Allowed  8.808670e+01
15                Occupancy Rate LTM  3.586176e+03
16            Number of Bookings LTM  8.711460e+03
17       