Aim: To predict the actual scale of guest satisfactory based on a few factors such as type of airbnb or cleanliness of airbnb 

4 Machine Learning Models 
- Random Forest Regressor 
    - before normalization, after normalization, with gradient weight 
- K Nearest Neighbour 
    - before normalization, after normalization, with inverse distance
- Stochastic Gradient Descent 
    - before hyperparameter tuning, after hyperparameter tuning
- Linear Regression 

In [50]:
import numpy as np
import pandas as pd
import seaborn as sb
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn import neighbors
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import classification_report
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import RandomizedSearchCV

**Data preprocess**

In [26]:
# load data into pandas dataframe
df=pd.read_csv('Airbnb.csv')

In [27]:
#replace bracket and spacing to ease calculation 
df.columns = df.columns.str.lower()
df.columns = df.columns.str.replace(' ', '_')
df.columns = df.columns.str.replace('(', '')
df.columns = df.columns.str.replace(')', '')

In [28]:
#convert boolean to integer
df.shared_room = df.shared_room.replace({True: '1', False: '0'}) 
df.private_room = df.private_room.replace({True: '1', False: '0'}) 
df.superhost = df.superhost.replace({True: '1', False: '0'}) 
df.head()

Unnamed: 0,city,price,day,room_type,shared_room,private_room,person_capacity,superhost,multiple_rooms,business,cleanliness_rating,guest_satisfaction,bedrooms,city_center_km,metro_distance_km,attraction_index,normalised_attraction_index,restraunt_index,normalised_restraunt_index
0,Amsterdam,194.033698,Weekday,Private room,0,1,2.0,0,1,0,10.0,93.0,1,5.022964,2.53938,78.690379,4.166708,98.253896,6.846473
1,Amsterdam,344.245776,Weekday,Private room,0,1,4.0,0,0,0,8.0,85.0,1,0.488389,0.239404,631.176378,33.421209,837.280757,58.342928
2,Amsterdam,264.101422,Weekday,Private room,0,1,2.0,0,0,1,9.0,87.0,1,5.748312,3.651621,75.275877,3.985908,95.386955,6.6467
3,Amsterdam,433.529398,Weekday,Private room,0,1,4.0,0,0,1,9.0,90.0,2,0.384862,0.439876,493.272534,26.119108,875.033098,60.973565
4,Amsterdam,485.552926,Weekday,Private room,0,1,2.0,1,0,0,10.0,98.0,1,0.544738,0.318693,552.830324,29.272733,815.30574,56.811677


In [29]:
#convert nominal to numerical 
label = LabelEncoder()

label.fit(df.city) 
df.city = label.transform(df.city)

label.fit(df.day) 
df.day = label.transform(df.day)

label.fit(df.room_type) 
df.room_type = label.transform(df.room_type)

df.head()

Unnamed: 0,city,price,day,room_type,shared_room,private_room,person_capacity,superhost,multiple_rooms,business,cleanliness_rating,guest_satisfaction,bedrooms,city_center_km,metro_distance_km,attraction_index,normalised_attraction_index,restraunt_index,normalised_restraunt_index
0,0,194.033698,0,1,0,1,2.0,0,1,0,10.0,93.0,1,5.022964,2.53938,78.690379,4.166708,98.253896,6.846473
1,0,344.245776,0,1,0,1,4.0,0,0,0,8.0,85.0,1,0.488389,0.239404,631.176378,33.421209,837.280757,58.342928
2,0,264.101422,0,1,0,1,2.0,0,0,1,9.0,87.0,1,5.748312,3.651621,75.275877,3.985908,95.386955,6.6467
3,0,433.529398,0,1,0,1,4.0,0,0,1,9.0,90.0,2,0.384862,0.439876,493.272534,26.119108,875.033098,60.973565
4,0,485.552926,0,1,0,1,2.0,1,0,0,10.0,98.0,1,0.544738,0.318693,552.830324,29.272733,815.30574,56.811677


**Random Tree Regressor before normalization**

In [30]:
#split data
x=df.drop(['guest_satisfaction'], axis=1).values
y=df['guest_satisfaction'].values

In [31]:
#split to train test 
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [32]:
randomforest=RandomForestRegressor(n_estimators=1000)
randomforest.fit(x_train, y_train.ravel())

RandomForestRegressor(n_estimators=1000)

In [33]:
#define mae, mse, rmse
y_pred=randomforest.predict(x_test)

mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

In [34]:
#print result
print("Mean Absolute Error: ", mae)
print("Mean Squared Error: ", mse)
print("Root Mean Squared Error: ", rmse)

Mean Absolute Error:  2.7678977227327213
Mean Squared Error:  19.45842494063124
Root Mean Squared Error:  4.411170472859923


**Random Tree Regressor after normalization**

In [35]:
#split data
x=df.drop(['guest_satisfaction'], axis=1).values
y=df['guest_satisfaction'].values

In [36]:
#normalize
scaler=StandardScaler()
x_scaled=scaler.fit_transform(x)
y_df=pd.DataFrame(y, columns=['guest_satisfactory'])

In [37]:
#split to train test 
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.3, random_state=42, shuffle=True)

In [38]:
randomforest=RandomForestRegressor()
randomforest.fit(x_train, y_train.ravel())

RandomForestRegressor()

In [39]:
#define mae, mse, rmse
y_pred=randomforest.predict(x_test)

mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

In [40]:
#print result
print("Mean Absolute Error: ", mae)
print("Mean Squared Error: ", mse)
print("Root Mean Squared Error: ", rmse)

Mean Absolute Error:  2.790313224131043
Mean Squared Error:  19.81028133439872
Root Mean Squared Error:  4.450874221363565


**Random Tree Regressor model that implemented gradient weight in it**

In [41]:
class enhance_random_tree_1:
    def __init__(self, n_estimators=10000, max_depth=10, learning_rate=0.01):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.estimators = []
        self.weights = []
        
    
    def fit(self, x_train, y_train):
        num_samples = x_train.shape[0]
        prediction = np.full(num_samples, np.mean(y_train))

        for _ in range(self.n_estimators):
            residuals = y_train - prediction
            estimator = DecisionTreeRegressor(max_depth=self.max_depth)
            estimator.fit(x_train, residuals)
            self.estimators.append(estimator)

            gradients = np.sign(residuals)

            prediction += self.learning_rate * estimator.predict(x_train)
            self.weights.append(self.learning_rate)

    def predict(self, x_test):
        num_samples = x_test.shape[0]
        predictions = np.zeros(num_samples)

        for estimator, weight in zip(self.estimators, self.weights):
            predictions += weight * estimator.predict(x_test)

        return predictions


In [42]:
enhance1 = enhance_random_tree_1(n_estimators=10000, max_depth=10, learning_rate=0.01)
enhance1.fit(x_train, y_train.ravel())

In [43]:
y_pred = enhance1.predict(x_test)

mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

In [44]:
#print result
print("Mean Absolute Error: ", mae)
print("Mean Squared Error: ", mse)
print("Root Mean Squared Error: ", rmse)

Mean Absolute Error:  93.12579011043591
Mean Squared Error:  8690.617201417008
Root Mean Squared Error:  93.22347988257576


**KNN without normalization**

In [45]:
#split data
x=df.drop(['guest_satisfaction'], axis=1).values
y=df['guest_satisfaction'].values

In [46]:
#split to train test 
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [52]:
rmse_val=[]
best_rmse=1.0

for k in range(20):
    k = k+1
    knn = neighbors.KNeighborsRegressor(n_neighbors = k)

    knn.fit(x_train, y_train) 
    y_pred = knn.predict(x_test)
    rmse = np.sqrt(mean_squared_log_error(y_test,y_pred))
    if (rmse < best_rmse):
        best_rmse = rmse
        best_k = k
    rmse_val.append(rmse)
    print('RMSE value for k= ' , k , 'is:', rmse)

print(f"Best RMSE: {best_rmse}, Best k: {best_k}")

RMSE value for k=  1 is: 0.1127620289159227
RMSE value for k=  2 is: 0.11044237880260253
RMSE value for k=  3 is: 0.1113021194961765
RMSE value for k=  4 is: 0.11133354921220939
RMSE value for k=  5 is: 0.11131527582940978
RMSE value for k=  6 is: 0.11147942517341508
RMSE value for k=  7 is: 0.11164047233148039
RMSE value for k=  8 is: 0.11165731812476443
RMSE value for k=  9 is: 0.11171451648271927
RMSE value for k=  10 is: 0.11174176371082513
RMSE value for k=  11 is: 0.11189960321272573
RMSE value for k=  12 is: 0.11191053130590094
RMSE value for k=  13 is: 0.11197871620516961
RMSE value for k=  14 is: 0.11200551190172199
RMSE value for k=  15 is: 0.11213012049121356
RMSE value for k=  16 is: 0.11219583686605951
RMSE value for k=  17 is: 0.11218459969937578
RMSE value for k=  18 is: 0.11227194761725158
RMSE value for k=  19 is: 0.1123507399128917
RMSE value for k=  20 is: 0.11228188912326571
Best RMSE: 0.11044237880260253, Best k: 2


In [53]:
params = {'n_neighbors':[2,3,4,5,6,7,8,10]}

knn = neighbors.KNeighborsRegressor()

model = GridSearchCV(knn, params, cv=5)
model.fit(x_train,y_train)
model.best_params_

{'n_neighbors': 10}

In [54]:
knn = neighbors.KNeighborsRegressor(n_neighbors = best_k)

knn.fit(x_train,y_train)
y_pred = knn.predict(x_test)
knn.score(x_test, y_test)

0.0006874499908993137

In [55]:
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

#print result
print("Mean Absolute Error: ", mae)
print("Mean Squared Error: ", mse)
print("Root Mean Squared Error: ", rmse)

Mean Absolute Error:  5.210906911705953
Mean Squared Error:  66.76913703555734
Root Mean Squared Error:  8.171238402810026


**KNN with normalization**

In [56]:
#split to train test 
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.3, random_state=42, shuffle=True)

In [57]:
rmse_val=[]
best_rmse=1.0

for k in range(20):
    k = k+1
    knn = neighbors.KNeighborsRegressor(n_neighbors = k)

    knn.fit(x_train, y_train) 
    y_pred = knn.predict(x_test)
    rmse = np.sqrt(mean_squared_log_error(y_test,y_pred))
    if (rmse < best_rmse):
        best_rmse = rmse
        best_k = k
    rmse_val.append(rmse)
    print('RMSE value for k= ' , k , 'is:', rmse)

print(f"Best RMSE: {best_rmse}, Best k: {best_k}")

RMSE value for k=  1 is: 0.10074852197114793
RMSE value for k=  2 is: 0.08488709656164858
RMSE value for k=  3 is: 0.08074867260029865
RMSE value for k=  4 is: 0.07965725396324481
RMSE value for k=  5 is: 0.07886735305579098
RMSE value for k=  6 is: 0.07861246305284834
RMSE value for k=  7 is: 0.07796484528554296
RMSE value for k=  8 is: 0.07781571633898067
RMSE value for k=  9 is: 0.07817736300048127
RMSE value for k=  10 is: 0.07797374849228245
RMSE value for k=  11 is: 0.07827342987901233
RMSE value for k=  12 is: 0.0784016468944371
RMSE value for k=  13 is: 0.07825543831176916
RMSE value for k=  14 is: 0.07837750725601465
RMSE value for k=  15 is: 0.07844054894930633
RMSE value for k=  16 is: 0.07850927574631439
RMSE value for k=  17 is: 0.07849538508926159
RMSE value for k=  18 is: 0.0785122042495735
RMSE value for k=  19 is: 0.07866874788802601
RMSE value for k=  20 is: 0.07862415978052391
Best RMSE: 0.07781571633898067, Best k: 8


In [58]:
#Hyperparameter tuning using Grid Search
params = {'n_neighbors':[2,3,4,5,6,7,8,9,10]}

knn = neighbors.KNeighborsRegressor()

model = GridSearchCV(knn, params, cv=5)
model.fit(x_train,y_train)
model.best_params_

{'n_neighbors': 10}

In [59]:
knn = neighbors.KNeighborsRegressor(n_neighbors = best_k)

knn.fit(x_train,y_train)
y_pred = knn.predict(x_test)
knn.score(x_test, y_test)

0.5024112485628067

In [60]:
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

#print result
print("Mean Absolute Error: ", mae)
print("Mean Squared Error: ", mse)
print("Root Mean Squared Error: ", rmse)


Mean Absolute Error:  3.80331602077507
Mean Squared Error:  33.246426787854574
Root Mean Squared Error:  5.765971452223344


**KNN using inverse distance**

In [61]:
rmse_val=[]
best_rmse=1.0

for k in range(20):
    k = k+1
    knn = neighbors.KNeighborsRegressor(n_neighbors = k)

    knn.fit(x_train, y_train) 
    y_pred = knn.predict(x_test)
    rmse = np.sqrt(mean_squared_log_error(y_test,y_pred))
    if (rmse < best_rmse):
        best_rmse = rmse
        best_k = k
    rmse_val.append(rmse)
    print('RMSE value for k= ' , k , 'is:', rmse)

print(f"Best RMSE: {best_rmse}, Best k: {best_k}")


RMSE value for k=  1 is: 0.10074852197114793
RMSE value for k=  2 is: 0.08488709656164858
RMSE value for k=  3 is: 0.08074867260029865
RMSE value for k=  4 is: 0.07965725396324481
RMSE value for k=  5 is: 0.07886735305579098
RMSE value for k=  6 is: 0.07861246305284834
RMSE value for k=  7 is: 0.07796484528554296
RMSE value for k=  8 is: 0.07781571633898067
RMSE value for k=  9 is: 0.07817736300048127
RMSE value for k=  10 is: 0.07797374849228245
RMSE value for k=  11 is: 0.07827342987901233
RMSE value for k=  12 is: 0.0784016468944371
RMSE value for k=  13 is: 0.07825543831176916
RMSE value for k=  14 is: 0.07837750725601465
RMSE value for k=  15 is: 0.07844054894930633
RMSE value for k=  16 is: 0.07850927574631439
RMSE value for k=  17 is: 0.07849538508926159
RMSE value for k=  18 is: 0.0785122042495735
RMSE value for k=  19 is: 0.07866874788802601
RMSE value for k=  20 is: 0.07862415978052391
Best RMSE: 0.07781571633898067, Best k: 8


In [62]:
knn = neighbors.KNeighborsRegressor(n_neighbors = best_k,weights='distance')

knn.fit(x_train,y_train)
y_pred = knn.predict(x_test)

In [63]:
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

#print result
print("Mean Absolute Error: ", mae)
print("Mean Squared Error: ", mse)
print("Root Mean Squared Error: ", rmse)

Mean Absolute Error:  3.7815146672926705
Mean Squared Error:  33.26178556345874
Root Mean Squared Error:  5.767303144751343


**Stochastic Gradient Descent before hyperparameter tuning**

In [64]:
SGD=SGDRegressor()
SGD.fit(x_train, y_train.ravel())
y_pred=SGD.predict(x_test)

In [65]:
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

#print result
print("Mean Absolute Error: ", mae)
print("Mean Squared Error: ", mse)
print("Root Mean Squared Error: ", rmse)

Mean Absolute Error:  3.695491362537715
Mean Squared Error:  31.636482766117503
Root Mean Squared Error:  5.624631789381195


**Stochastic Gradient Descent after hyperparameter tuning**

In [66]:
# report function to tell results of the hyperparamter tuning
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results["rank_test_score"] == i)
        for candidate in candidates:
            print("Rank: {0}".format(i))
            print(
                "Mean validation score: {0:.3f} ".format(results["mean_test_score"][candidate])
            )
            print("Parameters: {0}".format(results["params"][candidate]))
            print("")

In [69]:
param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet',None],
    'max_iter' :[20000, 30000, 40000],
    'eta0' :[0.001, 0.0001, 0.00001, 0.000001],
    'n_iter_no_change':[100, 200, 250, 300]
}

SGD2=SGDRegressor(random_state=42)
random_SGD=RandomizedSearchCV(SGD2,param_grid, n_iter=10, cv=5, random_state=42)
random_SGD.fit(x_train, y_train.ravel())

report(random_SGD.cv_results_)

Rank: 1
Mean validation score: 0.500 
Parameters: {'penalty': 'l1', 'n_iter_no_change': 300, 'max_iter': 30000, 'eta0': 0.0001}

Rank: 2
Mean validation score: 0.500 
Parameters: {'penalty': 'l2', 'n_iter_no_change': 250, 'max_iter': 20000, 'eta0': 0.001}

Rank: 3
Mean validation score: 0.500 
Parameters: {'penalty': 'l1', 'n_iter_no_change': 100, 'max_iter': 30000, 'eta0': 0.001}



In [73]:
SGD_best=SGDRegressor(penalty='L1', n_iter_no_change=300, eta0=0.0001, max_iter=30000)
SGD_best.fit(x_train, y_train)
SGD_best_pred=SGD_best.predict(x_test)

In [74]:
mae = metrics.mean_absolute_error(y_test, SGD_best_pred)
mse = metrics.mean_squared_error(y_test, SGD_best_pred)
rmse = np.sqrt(metrics.mean_squared_error(y_test, SGD_best_pred))

#print result
print("Mean Absolute Error: ", mae)
print("Mean Squared Error: ", mse)
print("Root Mean Squared Error: ", rmse)

Mean Absolute Error:  3.6660355275499583
Mean Squared Error:  31.606209317098102
Root Mean Squared Error:  5.621939995864248


**Linear Regression**

In [75]:
linreg=LinearRegression()

linreg.fit(x_train, y_train) 
linreg_pred = linreg.predict(x_test)

In [76]:
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

#print result
print("Mean Absolute Error: ", mae)
print("Mean Squared Error: ", mse)
print("Root Mean Squared Error: ", rmse)

Mean Absolute Error:  3.695491362537715
Mean Squared Error:  31.636482766117503
Root Mean Squared Error:  5.624631789381195


In [None]:
下