In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns



In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier



from sklearn.model_selection import GridSearchCV





In [3]:
data = pd.read_csv("Radisson Blu Seaside Hotel, Helsinki.csv")

data.head()

Unnamed: 0,Review_No,Page,Rating,Name,Country,Reviewed_Date,Room_Type,Nights,Check_In_Month,Travel_Type,Review_Details
0,1,1,10.0,Muhamad,Malaysia,Reviewed: 3 December 2021,Superior Room,1,November 2021,Group,A very pleasant stay Liked · Nice place to...
1,2,1,9.0,Nadiya,Czech Republic,Reviewed: 27 February 2023,Standard Room,1,February 2023,Couple,Superb Liked · We had a lovely room with h...
2,3,1,9.0,Alexandre,Brazil,Reviewed: 26 February 2023,Standard Room,3,February 2023,Family,"Superb Liked · Staff is very nice, room wa..."
3,4,1,8.0,Helen,United Kingdom,Reviewed: 21 February 2023,Standard Room,3,February 2023,Group,"Fantastic Liked · Fantastic location, shor..."
4,5,1,9.0,Asia,United Kingdom,Reviewed: 17 February 2023,Standard Room,1,February 2023,Solo traveller,Nice clean hotel with friendly people Liked...


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5885 entries, 0 to 5884
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Review_No       5885 non-null   int64  
 1   Page            5885 non-null   int64  
 2   Rating          5885 non-null   float64
 3   Name            5885 non-null   object 
 4   Country         5876 non-null   object 
 5   Reviewed_Date   5885 non-null   object 
 6   Room_Type       5472 non-null   object 
 7   Nights          5885 non-null   int64  
 8   Check_In_Month  5885 non-null   object 
 9   Travel_Type     5885 non-null   object 
 10  Review_Details  5885 non-null   object 
dtypes: float64(1), int64(3), object(7)
memory usage: 505.9+ KB


# 1. Data Cleaning and Data Preprocessing

In [6]:
data = data.dropna()

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5464 entries, 0 to 5692
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Review_No       5464 non-null   int64  
 1   Page            5464 non-null   int64  
 2   Rating          5464 non-null   float64
 3   Name            5464 non-null   object 
 4   Country         5464 non-null   object 
 5   Reviewed_Date   5464 non-null   object 
 6   Room_Type       5464 non-null   object 
 7   Nights          5464 non-null   int64  
 8   Check_In_Month  5464 non-null   object 
 9   Travel_Type     5464 non-null   object 
 10  Review_Details  5464 non-null   object 
dtypes: float64(1), int64(3), object(7)
memory usage: 512.2+ KB


In [8]:
data_mm=data.drop(["Review_No","Page","Name","Reviewed_Date","Review_Details"],axis=1)

In [9]:
data_mm.head()

Unnamed: 0,Rating,Country,Room_Type,Nights,Check_In_Month,Travel_Type
0,10.0,Malaysia,Superior Room,1,November 2021,Group
1,9.0,Czech Republic,Standard Room,1,February 2023,Couple
2,9.0,Brazil,Standard Room,3,February 2023,Family
3,8.0,United Kingdom,Standard Room,3,February 2023,Group
4,9.0,United Kingdom,Standard Room,1,February 2023,Solo traveller


### 1.1 Preprocessing the categorical value in Country

In [10]:
data_mm.Country.value_counts()

Finland               2761
Germany                409
Estonia                276
Russia                 191
Italy                  154
                      ... 
Abkhazia, ​Georgia       1
Uzbekistan               1
Peru                     1
Madagascar               1
Reunion                  1
Name: Country, Length: 91, dtype: int64

In [11]:
# 2. based on value_counts()

filter = data_mm.Country.value_counts()
data_mm.loc[data_mm.Country.isin(filter[filter<300].index),"Country"] = "Others"

data_mm.Country.value_counts()

Finland    2761
Others     2294
Germany     409
Name: Country, dtype: int64

In [12]:
data_mm.Country.value_counts()

Finland    2761
Others     2294
Germany     409
Name: Country, dtype: int64

### 1.2 Preprocessing the categorical value in Check_In_Month

In [13]:
data_mm["Check_In_Month"]=data_mm["Check_In_Month"].map(lambda x: x[:-5])
data_mm["Check_In_Month"]=data_mm["Check_In_Month"].str.replace("\xa0","",regex=True)
data_mm.head()

Unnamed: 0,Rating,Country,Room_Type,Nights,Check_In_Month,Travel_Type
0,10.0,Others,Superior Room,1,November,Group
1,9.0,Others,Standard Room,1,February,Couple
2,9.0,Others,Standard Room,3,February,Family
3,8.0,Others,Standard Room,3,February,Group
4,9.0,Others,Standard Room,1,February,Solo traveller


### 1.3 Preprocessing the categorical value in Nights

In [14]:
data_mm.Nights.value_counts()

1     3206
2     1413
3      486
4      210
5       80
6       29
7       25
8        7
9        2
11       2
12       2
17       1
16       1
Name: Nights, dtype: int64

In [15]:
# 2. Nights 

filter_nights = data_mm.Nights.value_counts()
data_mm.loc[data_mm.Nights.isin(filter_nights[filter_nights<1000].index),"Nights"] = "More than 2 nights"
data_mm.loc[data_mm.Nights.isin(filter_nights[filter_nights>3000].index),"Nights"] = "1 night"

data_mm.loc[data_mm.Nights.isin(filter_nights[filter_nights==1413].index),"Nights"] = "2 nights"



In [16]:
data_mm.Nights.value_counts()

1 night               3206
2 nights              1413
More than 2 nights     845
Name: Nights, dtype: int64

### 1.4 Preprocessing the value of target variable Rating, Convert to categorical 

In [17]:
data_mm.head()

Unnamed: 0,Rating,Country,Room_Type,Nights,Check_In_Month,Travel_Type
0,10.0,Others,Superior Room,1 night,November,Group
1,9.0,Others,Standard Room,1 night,February,Couple
2,9.0,Others,Standard Room,More than 2 nights,February,Family
3,8.0,Others,Standard Room,More than 2 nights,February,Group
4,9.0,Others,Standard Room,1 night,February,Solo traveller


In [18]:
bins = [0,8.4,10.0]
class_name=[0,1]
rating_cat=pd.cut(data_mm.Rating,bins,labels=class_name)
data_mm["rating_cat"]=rating_cat

In [19]:
data_mm=data_mm.drop(["Rating"],axis=1)


In [20]:
data_mm.head()

Unnamed: 0,Country,Room_Type,Nights,Check_In_Month,Travel_Type,rating_cat
0,Others,Superior Room,1 night,November,Group,1
1,Others,Standard Room,1 night,February,Couple,1
2,Others,Standard Room,More than 2 nights,February,Family,1
3,Others,Standard Room,More than 2 nights,February,Group,0
4,Others,Standard Room,1 night,February,Solo traveller,1


In [22]:
pip install xgboost

Collecting xgboost
  Using cached xgboost-1.7.4-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl (1.8 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.7.4
Note: you may need to restart the kernel to use updated packages.


In [23]:
import xgboost as xgb
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer


In [2]:
# 2. Generate all possible feature subsets


0.4766355140186916

In [28]:
import itertools

# Get all possible subset of categorical columns to get_dummies.
cvar_list=["Country","Room_Type","Travel_Type","Check_In_Month","Nights"]
cvar_list_subsets = []
for i in range(0, len(cvar_list) + 1):
    cvar_list_subsets += [list(x) for x in itertools.combinations(cvar_list, i)]

# print the subsets
print(cvar_list_subsets)
print (len(cvar_list_subsets))

[[], ['Country'], ['Room_Type'], ['Travel_Type'], ['Check_In_Month'], ['Nights'], ['Country', 'Room_Type'], ['Country', 'Travel_Type'], ['Country', 'Check_In_Month'], ['Country', 'Nights'], ['Room_Type', 'Travel_Type'], ['Room_Type', 'Check_In_Month'], ['Room_Type', 'Nights'], ['Travel_Type', 'Check_In_Month'], ['Travel_Type', 'Nights'], ['Check_In_Month', 'Nights'], ['Country', 'Room_Type', 'Travel_Type'], ['Country', 'Room_Type', 'Check_In_Month'], ['Country', 'Room_Type', 'Nights'], ['Country', 'Travel_Type', 'Check_In_Month'], ['Country', 'Travel_Type', 'Nights'], ['Country', 'Check_In_Month', 'Nights'], ['Room_Type', 'Travel_Type', 'Check_In_Month'], ['Room_Type', 'Travel_Type', 'Nights'], ['Room_Type', 'Check_In_Month', 'Nights'], ['Travel_Type', 'Check_In_Month', 'Nights'], ['Country', 'Room_Type', 'Travel_Type', 'Check_In_Month'], ['Country', 'Room_Type', 'Travel_Type', 'Nights'], ['Country', 'Room_Type', 'Check_In_Month', 'Nights'], ['Country', 'Travel_Type', 'Check_In_Month',

In [25]:

# build XGboost models to try all possible categorical columns combination together with column Nights.  

n=0

for e in cvar_list_subsets[1:]:
    n+=1
    print ("The ", n, "Model. Predictors include categorical variables",e)
    data_m_c = data_mm.copy()
    
    # 
    for var in e:        
        dummies_set = pd.get_dummies(data_m_c[var], drop_first=True, prefix = var)
        data_m_c = pd.concat([data_m_c, dummies_set], axis = 1).drop(var, axis = 1)
    # print (data_m_c.columns.values)
    cvar_list_r = [x for x in cvar_list if x not in e]
    # print (cvar_list_r)
    data_m_c.drop(cvar_list_r,axis=1,inplace = True)
    # print (data_m_c.head())

    # split data for with all columns. 

    col_list = list(data_m_c.columns)
    col_list.remove("rating_cat")
    X = data_m_c[col_list]
    y = data_m_c["rating_cat"]


    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
    
    # Optimize the xgboost model.

    # Define the hyperparameter grid to search
    param_grid = {
        'max_depth': [10,20,30],
        'n_estimators': [100,200],
        'learning_rate': [0.2,0.4,0.6],
        'subsample': [0.5,0.9],
        'colsample_bytree': [0.8,1.0],
        'objective': ['binary:logistic']
    }

    # Create an XGBoost classifier
    xgb_model = xgb.XGBClassifier()

    # define recall for class 0.
    def TN_score (y_test, y_pred):
        cm = confusion_matrix(y_test, y_pred)
        # tn, fp, fn, tp = cm.ravel()
        recall_0 = cm[0][0]/(cm[0][0]+cm[0][1])
        tn_fn_ratio = cm[0][0]/ cm[1][0]
        return recall_0*tn_fn_ratio
    
    my_scorer = make_scorer(TN_score, greater_is_better=True)



    # Perform grid search on the classifier
    grid_search = GridSearchCV(xgb_model, param_grid, scoring=my_scorer, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    print("Best Hyperparameters: ", grid_search.best_params_)
    y_pred = grid_search.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy: {:.2f}%".format(accuracy * 100))
    print(classification_report(y_test,y_pred))
    cm = confusion_matrix(y_test, y_pred)
    
    print ("best score:" , (cm[0][0]/(cm[0][0]+cm[0][1]))*(cm[0][0]/ cm[1][0]) )

    print ("\n")
    print("Confusion Matrix:")
    print(cm)
    print ("\n")
    print ("*"*60)
    




The  1 Model. Predictors include categorical variables
Best Hyperparameters:  {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 10, 'n_estimators': 100, 'objective': 'binary:logistic', 'subsample': 0.5}
Accuracy: 59.88%
              precision    recall  f1-score   support

           0       0.58      0.09      0.15       337
           1       0.60      0.95      0.74       483

    accuracy                           0.60       820
   macro avg       0.59      0.52      0.45       820
weighted avg       0.59      0.60      0.50       820

best score: 0.12139196115457243


Confusion Matrix:
[[ 30 307]
 [ 22 461]]


************************************************************
The  2 Model. Predictors include categorical variables












 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]


Best Hyperparameters:  {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 10, 'n_estimators': 100, 'objective': 'binary:logistic', 'subsample': 0.5}
Accuracy: 58.90%
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       337
           1       0.59      1.00      0.74       483

    accuracy                           0.59       820
   macro avg       0.29      0.50      0.37       820
weighted avg       0.35      0.59      0.44       820

best score: nan


Confusion Matrix:
[[  0 337]
 [  0 483]]


************************************************************
The  3 Model. Predictors include categorical variables


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  print ("best score:" , (cm[0][0]/(cm[0][0]+cm[0][1]))*(cm[0][0]/ cm[1][0]) )












 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]


Best Hyperparameters:  {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 10, 'n_estimators': 100, 'objective': 'binary:logistic', 'subsample': 0.5}
Accuracy: 58.90%
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       337
           1       0.59      1.00      0.74       483

    accuracy                           0.59       820
   macro avg       0.29      0.50      0.37       820
weighted avg       0.35      0.59      0.44       820

best score: nan


Confusion Matrix:
[[  0 337]
 [  0 483]]


************************************************************
The  4 Model. Predictors include categorical variables


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  print ("best score:" , (cm[0][0]/(cm[0][0]+cm[0][1]))*(cm[0][0]/ cm[1][0]) )






 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]


Best Hyperparameters:  {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 10, 'n_estimators': 100, 'objective': 'binary:logistic', 'subsample': 0.5}
Accuracy: 58.90%
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       337
           1       0.59      1.00      0.74       483

    accuracy                           0.59       820
   macro avg       0.29      0.50      0.37       820
weighted avg       0.35      0.59      0.44       820

best score: nan


Confusion Matrix:
[[  0 337]
 [  0 483]]


************************************************************
The  5 Model. Predictors include categorical variables


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  print ("best score:" , (cm[0][0]/(cm[0][0]+cm[0][1]))*(cm[0][0]/ cm[1][0]) )
 0.20397107 0.20397107 0.20397107 0.20397107 0.20397107 0.20397107
        nan 0.20397107 0.20397107 0.20397107        nan 0.20397107
 0.20397107 0.20397107        nan 0.20397107 0.20397107 0.20397107
        nan 0.20397107 0.20397107 0.20397107        nan 0.20397107
 0.20397107 0.20397107        nan 0.20397107 0.20397107 0.20397107
 0.20397107 0.20397107 0.20397107 0.20397107 0.20397107 0.20397107
 0.20397107 0.20397107 0.20397107 0.20397107 0.20397107 0.20397107
 0.20397107 0.20397107 0.20397107 0.20397107 0.20397107 0.20397107
 0.20397107 0.20397107 0.20397107 0.20397107 0.20397107 0.20397107
 0.20397107 0.20397107 0.20397107 0.20397107 0.20397107 0.20397107
 0.20397107 0.20397107 0.20397107 0.20397107 0.20397107 0.20397107]


Best Hyperparameters:  {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 10, 'n_estimators': 100, 'objective': 'binary:logistic', 'subsample': 0.5}
Accuracy: 57.68%
              precision    recall  f1-score   support

           0       0.46      0.17      0.25       337
           1       0.60      0.86      0.71       483

    accuracy                           0.58       820
   macro avg       0.53      0.52      0.48       820
weighted avg       0.54      0.58      0.52       820

best score: 0.14389476947606183


Confusion Matrix:
[[ 57 280]
 [ 67 416]]


************************************************************
The  6 Model. Predictors include categorical variables
Best Hyperparameters:  {'colsample_bytree': 1.0, 'learning_rate': 0.4, 'max_depth': 10, 'n_estimators': 100, 'objective': 'binary:logistic', 'subsample': 0.5}
Accuracy: 59.51%
              precision    recall  f1-score   support

           0       0.56      0.07      0.12       337
           1       

 0.14269398        nan        nan        nan 0.14269398        nan
        nan        nan 0.09461296        nan        nan        nan
 0.09461296        nan        nan        nan 0.09461296        nan
        nan 0.16669034 0.1208194         nan        nan 0.16669034
 0.1208194         nan        nan 0.16669034 0.1208194         nan
        nan 0.1521462  0.1255352         nan        nan 0.1521462
 0.1255352         nan        nan 0.1521462  0.1255352         nan
        nan 0.16669034 0.04332645        nan        nan 0.16669034
 0.04332645        nan        nan 0.16669034 0.04332645        nan
        nan 0.16669034 0.06906017        nan        nan 0.16669034
 0.06906017        nan        nan 0.16669034 0.06906017        nan]


Best Hyperparameters:  {'colsample_bytree': 0.8, 'learning_rate': 0.6, 'max_depth': 10, 'n_estimators': 100, 'objective': 'binary:logistic', 'subsample': 0.9}
Accuracy: 58.54%
              precision    recall  f1-score   support

           0       0.20      0.00      0.01       337
           1       0.59      0.99      0.74       483

    accuracy                           0.59       820
   macro avg       0.39      0.50      0.37       820
weighted avg       0.43      0.59      0.44       820

best score: 0.000741839762611276


Confusion Matrix:
[[  1 336]
 [  4 479]]


************************************************************
The  11 Model. Predictors include categorical variables
Best Hyperparameters:  {'colsample_bytree': 0.8, 'learning_rate': 0.6, 'max_depth': 20, 'n_estimators': 200, 'objective': 'binary:logistic', 'subsample': 0.5}
Accuracy: 59.39%
              precision    recall  f1-score   support

           0       0.51      0.33      0.40       337
           1     

Best Hyperparameters:  {'colsample_bytree': 1.0, 'learning_rate': 0.4, 'max_depth': 20, 'n_estimators': 200, 'objective': 'binary:logistic', 'subsample': 0.5}
Accuracy: 56.34%
              precision    recall  f1-score   support

           0       0.46      0.37      0.41       337
           1       0.61      0.70      0.65       483

    accuracy                           0.56       820
   macro avg       0.54      0.53      0.53       820
weighted avg       0.55      0.56      0.55       820

best score: 0.3146628466182339


Confusion Matrix:
[[124 213]
 [145 338]]


************************************************************
The  23 Model. Predictors include categorical variables
Best Hyperparameters:  {'colsample_bytree': 1.0, 'learning_rate': 0.6, 'max_depth': 10, 'n_estimators': 100, 'objective': 'binary:logistic', 'subsample': 0.9}
Accuracy: 58.17%
              precision    recall  f1-score   support

           0       0.48      0.18      0.26       337
           1       