In [344]:
import numpy as np 
import pandas as pd
import math
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler

In [345]:
saudi_review = pd.read_csv("output.csv")
saudi_review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1442 entries, 0 to 1441
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         1442 non-null   int64  
 1   name               1442 non-null   object 
 2   location           1442 non-null   object 
 3   price              1442 non-null   object 
 4   price_for          1442 non-null   object 
 5   room_type          1442 non-null   object 
 6   beds               1434 non-null   object 
 7   rating             1394 non-null   float64
 8   rating_title       1394 non-null   object 
 9   number_of_ratings  1394 non-null   object 
 10  url                1442 non-null   object 
 11  cm                 257 non-null    object 
dtypes: float64(1), int64(1), object(10)
memory usage: 135.3+ KB


In [346]:
# Drop values where no review is available as this is the metric being predicted 
saudi_review = saudi_review[saudi_review ['rating'].notna() == True].reset_index()

In [347]:
#Basic expoloration of data 
#Drop url as all unique and no usefull features drawn from it as not very discernible.
for i in range(0,len(saudi_review.select_dtypes(include = 'object').columns)):
    print(saudi_review.select_dtypes(include = 'object').columns[i],':',len(saudi_review.select_dtypes(include = 'object').iloc[:,i].drop_duplicates()))

name : 864
location : 143
price : 351
price_for : 2
room_type : 250
beds : 74
rating_title : 6
number_of_ratings : 756
url : 1344
cm : 111


In [348]:
#change the data to useable 
#Drop name as will cause data leakage if used 
saudi_review = saudi_review.drop(columns = ["url","name","Unnamed: 0","rating_title"])

In [349]:
##############Look into splitting up location
location = np.array([])
for i in range(0,saudi_review["location"].nunique()):
    location = np.append(location,saudi_review["location"].unique()[i].replace(" Show on map","").lower())
location
#Not usefull as can cause bias for certain locations as there is 143 unique locations for 1400 data points 
#could use Kmeans but still far to many and would skew data 
#so leakage will be most likely 

array(['dammam', 'al muraysīyah', 'al olaya, riyadh', 'al qurayyat',
       'al hamra, jeddah', 'riyadh', 'al wadeen', 'al olayya, al khobar',
       'buraydah', 'central madinah, al madinah', 'ţurayf', 'al khobar',
       'makkah', 'unayzah', 'al worood, riyadh', 'yanbu', 'jeddah',
       'as suwayfilah', 'al ḩawīyah', 'al malaz, riyadh', 'sharurah',
       'al khars', 'tanomah', 'khamis mushayt', 'az zulfi', 'al namas',
       'al ‘awālī', 'rafha', 'rabigh', 'king abdullah economic city',
       'riyadh al khabra', 'al rass', 'hajlah', 'al ula', 'taymāʼ',
       'abha', 'fayfāʼ', 'waḩţān', 'al lith', 'sīdī ḩamzah', 'qabāʼ',
       'al barkah', 'raghdān', 'ma‘shī', 'ad darb', 'bīshat mushayţ',
       'al jāmi‘ah', 'ad dawādimī', 'as sayl aş şaghīr',
       'al salamah, jeddah', 'tabuk', 'taif', 'al sulimania, riyadh',
       'al madinah', 'turghush', 'hera street, jeddah', '‘urwah',
       'madain saleh', 'sari street, jeddah', "qal'at bishah",
       'al wuhayţ', 'as sudayrah', 'al j

In [350]:
####################look into the price distribution 
saudi_review.price.apply(lambda x: x.replace("SAR",""))

0        140
1        180
2        248
3        250
4        224
        ... 
1389     285
1390     209
1391     150
1392     189
1393      81
Name: price, Length: 1394, dtype: object

In [351]:
# Only two types of strings without the Numbers showing it is all the same currency 
saudi_review['price'].str.replace('\d+', '').unique()

#change the price to a numeric useable value 
saudi_price = pd.DataFrame(pd.to_numeric(saudi_review.price.str.extract("(\d*\.?\d+)", expand=False)))

In [352]:
####################look into price for 
saudi_review.price_for.unique()
#Only 2 outputs with 1 or 2 adults so will be split into two outputs of 1 or 2 per night ordinal encoding used to determine
#the number of individuals staying in a room

#change the number of people in a room to a usable float parameter
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder(categories=[['1 night, 1 adult', '1 night, 2 adults']])
people_a_night = pd.DataFrame(enc.fit_transform(saudi_review["price_for"].values.reshape(-1,1)) + 1,columns = ['PPN'])


In [353]:
#find the total number of beds from beds
total_beds = pd.DataFrame(pd.to_numeric(saudi_review['beds'].str.split().str.get(0), errors='coerce').fillna(0))

In [354]:
#####################All 3 descriptive columns 
#saudi_review.room_type.unique()
#find usefull repetetive words that can be used to encode each type of room 

In [355]:
# Combine the 3 most descriptive columns so dummy encoding can be used 
cols = ['room_type','cm','beds']
saudi_review['combined'] = saudi_review[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

In [356]:
#Find the most used words overall to put into the encoder.
room_str = ' '.join([str(x).lower() for x in saudi_review['combined']])
print(pd.DataFrame(data=np.array(room_str.split(" ")), columns=["col1"])["col1"].value_counts()[0:40].to_string())
#Try top 20 to see how many values it takes to describe most locations 

double         1326
1              1240
nan            1146
bed            1013
room            780
extra-large     630
2               450
single          368
large           365
beds            353
•­              347
apartment       298
deluxe          232
studio          210
one-bedroom     210
king            199
standard        179
sofa            166
twin            159
bathroom        152
3               143
double)         140
singles, 1      138
with            133
suite           120
beds\r\n(2      111
private          89
beds\r\n(1       87
superior         83
bed)             79
bedroom          75
or               72
entire           72
two-bedroom      69
view             66
double, 1        62
living           61
apartment­       59
4                48
budget           48


In [357]:
#Create list of most used values and prune non usefull ones like bed that the model may falsely pick up on 
#Use values that cover 5% or more of hotels
#small nuber of square meter room sizes used so not usefull 
cv_list = ['double', 'extra-large', 'large','single','apartment', 'deluxe', 'one-bedroom', 'studio',
       'king', 'sofa', 'standard', 'twin', 'bathroom', 'double)',
       'singles,', 'suite','private', 'superior', 'two-bedroom', 'entire',
       'double,', 'view', 'living', 'apartment­','budget', 'economy', 'multiple', 'singles', 'city',
       'junior', 'chalet', 'studio­', 'queen', 'suite­','family', 'triple', 'guest', 'executive', 'classic',
       'double,', 'villa', 'single,', 'quadruple', 'singles,', 'non-smoking',
       'double,', 'three-bedroom', 'haram', 'sea',
       'king,','small', 'doubles)', 'garden','pool']

In [358]:
#Allows the values to all be tunred to lower case before dummies work on the values 
zzz = saudi_review['combined'].str.lower()

final_encode = (pd.get_dummies(zzz.str.split(expand=True))
         .groupby(lambda x: x.split('_')[-1],axis=1).sum())


In [359]:
#reduce down the table to binary values and combine the same values
#sorted(final_encode[cv_list].columns.values)

In [360]:
#Reduce down double values 
double = ['double','double)','double,','double,','double,','doubles)']
final_encode['double_tot'] = final_encode[double].sum(axis = 1)

In [361]:
#reduce single
single = ['single','single,','singles','singles,','singles,']
final_encode['single_tot'] = final_encode[single].sum(axis = 1)

In [362]:
#Reduce king
king = ['king','king,']
final_encode['king_tot'] = final_encode[king].sum(axis = 1)

In [363]:
#reduce apartment
apartment = ['apartment','apartment\xad']
final_encode['apartment_tot'] = final_encode[apartment].sum(axis = 1)

In [364]:
#reduce suite 
suite = ['suite','suite\xad']
final_encode['suite_tot'] = final_encode[suite].sum(axis = 1)

In [365]:
#Reduce studio
#'studio','studio\xad'
studio = ['studio','studio\xad']
final_encode['studio_tot'] = final_encode[studio].sum(axis = 1)

In [366]:
#final list of values 
cv_list_final = ['double_tot', 'extra-large', 'large','single_tot','apartment_tot', 'deluxe', 'one-bedroom', 'studio_tot',
       'king_tot', 'sofa', 'standard', 'twin', 'bathroom', 'suite_tot','private', 'superior', 'two-bedroom', 'entire',
       'view', 'living','budget', 'economy', 'multiple', 'city',
       'junior', 'chalet', 'queen', 'family', 'triple', 'guest', 'executive', 'classic',
       'villa', 'quadruple','non-smoking', 'three-bedroom', 'haram', 'sea','small', 'garden','pool']

#Create the final dummy encoded values 
final_ohe = final_encode[cv_list_final].where(final_encode[cv_list_final] < 1, other=1)

In [367]:
#Start building the initial model and test performence with basic ohe
#Pipeline not needed as pre proccesing of data was manual process 
X = pd.concat([final_ohe,total_beds,saudi_price,people_a_night],axis = 1)
#X = (X - X.mean(axis=0)) / X.std(axis=0)
y = saudi_review['rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [368]:
XG_model = XGBRegressor(random_state = 0)
XG_model.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [369]:
#mean absolute eorror from the model
predictions = XG_model.predict(X_test)
print("Mean Absolute Error: " + str(mean_absolute_error(predictions, y_test)))

Mean Absolute Error: 0.5767564257416962


In [370]:
#Find out which metrics preduce the lowest mae 
for i in [final_ohe,total_beds,saudi_price,people_a_night]:
    #Pipeline not needed as pre proccesing of data was manual process 
    X_single = pd.concat([i],axis = 1)
    y = saudi_review['rating']
    
    X_train_single, X_test_single, y_train_single, y_test_single = train_test_split(X_single, y, test_size=0.33, random_state=0)
    
    XG_model = XGBRegressor(random_state = 0)
    XG_model.fit(X_train_single, y_train_single)
    
    predictions = XG_model.predict(X_test_single)
    print("Mean Absolute Error: " + str(mean_absolute_error(predictions, y_test)))

#Price produces the best prediction of the rating but all togethor produce the best overall prediction 

Mean Absolute Error: 0.7798505942371559
Mean Absolute Error: 0.8057264557629501
Mean Absolute Error: 0.6704619219401394
Mean Absolute Error: 0.7976481032216367


In [371]:
#better than random guessing the gausian distribution 
diff = np.array([])

for i in range(0,len(y)):
    diff = np.append(diff,((y[i] - np.random.normal(7.56,1.0566396137869813))**2)**0.5)
    
diff = pd.DataFrame(diff)

diff.mean()


0    1.179869
dtype: float64

In [372]:
#also better than guessing the mean 
diff = np.array([])

for i in range(0,len(y)):
    diff = np.append(diff,((y[i] - 7.56)**2)**0.5)
    
diff = pd.DataFrame(diff)

diff.mean()

0    0.815151
dtype: float64

In [373]:
#Feature engineer values to increase model accuracy 

In [374]:
#usefull columns that can be manipulated
#price per person per night  = price/person per night 
#price per bed = price/beds 

In [375]:
price_per_person = X['price']/X['PPN']

price_per_person = price_per_person.rename('price_per_person')

price_per_bed = X['price']/X['beds']
price_per_bed = price_per_bed.replace([np.inf, -np.inf], np.nan)

#Some values produce infinite so correlated with price on missing values so there is no data sku from it being 0 
ppb_to_price = pd.concat([saudi_price,pd.DataFrame(price_per_bed,columns = ['price_per_bed'])],axis = 1).dropna()
mb = np.polyfit(ppb_to_price['price'],ppb_to_price['price_per_bed'], 1)
for i in range(0,len(price_per_bed)):
    if math.isnan(price_per_bed.iloc[i]):
        price_per_bed.iloc[i] = int(mb[0] * saudi_price.iloc[i] + mb[1])
        
price_per_bed = price_per_bed.rename('price_per_bed')

In [376]:
X_eng = pd.concat([final_ohe,total_beds,saudi_price,people_a_night,price_per_person,price_per_bed],axis = 1)
y = saudi_review['rating']
X_train_eng, X_test_eng, y_train_eng, y_test_eng = train_test_split(X_eng, y, test_size=0.33, random_state=0)

XG_model = XGBRegressor(random_state = 0)
XG_model.fit(X_train_eng, y_train_eng)

#mean absolute eorror from the model
predictions = XG_model.predict(X_test_eng)
print("Mean Absolute Error: " + str(mean_absolute_error(predictions, y_test_eng)))

Mean Absolute Error: 0.5283527552176452


In [377]:
#Introducing the price per bed and person increased the model accuracy 
#overall showing feature engineering worked for this case 

In [378]:
#Creating PCA values to see if there is a mae decrease using normalised values
from sklearn.decomposition import PCA

pca = PCA()

X_norm = (X_eng - X_eng.mean(axis=0)) / X_eng.std(axis=0)

X_pca = pca.fit_transform(X_norm[['beds','price','PPN','price_per_person', 'price_per_bed']])

# Convert to datafram
component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
X_pca = pd.DataFrame(X_pca, columns=component_names)

X_pca

Unnamed: 0,PC1,PC2,PC3,PC4,PC5
0,-1.476078,0.144996,0.196652,-0.107158,0.122443
1,-1.199451,0.175313,0.313663,-0.150303,0.076121
2,-1.000459,1.055593,0.668843,-0.148079,0.012600
3,-0.987796,1.057688,0.674213,-0.151925,0.010247
4,-0.376309,-0.742807,0.387913,0.084804,0.017833
...,...,...,...,...,...
1389,0.152403,-0.749544,0.610291,0.173580,-0.049401
1390,-0.998897,0.197293,0.398495,-0.181582,0.042537
1391,-1.406921,0.152575,0.225905,-0.117944,0.110863
1392,-1.555676,1.778108,0.703536,0.170616,0.100112


In [379]:
X_pca_tot = pd.concat([final_ohe,X_pca],axis = 1)
y = saudi_review['rating']
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca_tot, y, test_size=0.33, random_state=0)

XG_model = XGBRegressor(random_state = 0)
XG_model.fit(X_train_pca, y_train_pca)

#mean absolute eorror from the model
predictions = XG_model.predict(X_test_pca)
print("Mean Absolute Error: " + str(mean_absolute_error(predictions, y_test_pca)))
#PCA does not decrease the mean absolute error meaning values are not highly correlated 

Mean Absolute Error: 0.5917738520400903


In [381]:
X_norm[['beds','price','PPN','price_per_person', 'price_per_bed']].cov()
#As can be seen very week correlations observed 
#only price correlations can be seen but this is already taken into account by the model as the correlations are large.

Unnamed: 0,beds,price,PPN,price_per_person,price_per_bed
beds,1.0,-0.006225,0.077961,-0.061081,-0.400337
price,-0.006225,1.0,-0.183209,0.865532,0.803582
PPN,0.077961,-0.183209,1.0,-0.57282,-0.221492
price_per_person,-0.061081,0.865532,-0.57282,1.0,0.760257
price_per_bed,-0.400337,0.803582,-0.221492,0.760257,1.0
