## Modeling
Preprocess:

- Missing Value
- One Hot Encoding
- Binary Encoding
- TF-IDF
- Count Vectorizer

# Study case: Hotel Bookings

In [None]:
# Column groups used to explore the listings data. Each list holds column
# names of the `air_bnb` frame; `var` is their concatenation (text columns
# are kept separate in `text`).
host = [
    'host_since_duration int', 'host_is_superhost',
    'host_neighbourhood', 'host_total_listings_count',
    'host_has_profile_pic', 'host_identity_verified',
    'calculated_host_listings_count', 'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms', 'calculated_host_listings_count_shared_rooms',
]

location = ['neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude', 'longitude']

Types = ['property_type new', 'room_type', 'bed_type']

guest_people = ['guests_included', 'extra_people']

maximum_minimum = [
    'minimum_nights', 'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights',
    'minimum_maximum_nights', 'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm',
]

availability = ['availability_30', 'availability_60', 'availability_90', 'availability_365']

# 'first_review_duration int' used to be listed twice, which made
# air_bnb[var] select the same column twice. It is listed once now.
# NOTE(review): the second entry may have been meant to be a different
# column (e.g. a last-review duration) -- confirm against the dataset.
reviews = [
    'first_review_duration int', 'review_scores_rating', 'review_scores_accuracy',
    'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication',
    'review_scores_location', 'review_scores_value',
]

# Free-text columns; intentionally not part of `var`.
text = ['description10', 'amenities new']

var = host + location + Types + guest_people + maximum_minimum + availability + reviews

In [None]:
# Count missing values in each free-text column.
air_bnb[text].isna().sum()

In [None]:
# Summary statistics for the selected feature columns (default dtype selection).
air_bnb.loc[:, var].describe()

In [None]:
# Summary of the object-dtype columns among the selected features.
air_bnb.loc[:, var].describe(include=object)

In [None]:
# Keep only listings whose price is non-zero and below 500; this filtered
# frame is what the models are trained and evaluated on.
air_bnb_prep = air_bnb[air_bnb['price'].ne(0) & air_bnb['price'].lt(500)]

In [None]:
# --- Per-column-group preprocessing pipelines -------------------------------

# Categorical columns destined for binary encoding: missing values become the
# placeholder category 'NC' before encoding.
binary_encoder_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='NC')),
    ('binary encoder', ce.BinaryEncoder()),
])

# Categorical columns destined for one-hot encoding. Unknown categories are
# ignored at transform time: with handle_unknown='error' any category that
# appears only in the test split or in a CV validation fold aborts transform.
one_hot_encoder_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='NC')),
    ('one hot encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Single-step wrappers around the text vectorizers. The ColumnTransformer
# below instantiates its own vectorizers rather than using these objects.
text_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
])

tags_pipeline = Pipeline([
    ('countvectorizer', CountVectorizer()),
])

# Numeric columns: fill gaps with the column median.
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
])

# Review scores: a missing score is encoded as the sentinel -1.
reviews_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=-1)),
])

# --- Column groups -----------------------------------------------------------

binary_feature = ['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified',
                  'neighbourhood_cleansed new', 'property_type new']
one_hot_feature = ['host_neighbourhood', 'neighbourhood_group_cleansed', 'room_type', 'bed_type']
numeric_feature = ['host_since_duration int', 'host_total_listings_count', 'calculated_host_listings_count',
                   'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count_private_rooms',
                   'calculated_host_listings_count_shared_rooms',
                   'latitude', 'longitude',
                   'guests_included', 'extra_people',
                   'minimum_nights', 'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights',
                   'minimum_maximum_nights', 'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm',
                   'availability_30', 'availability_60', 'availability_90', 'availability_365']
reviews_feature = ['review_scores_rating', 'review_scores_accuracy',
                   'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication',
                   'review_scores_location', 'review_scores_value']
text = ['description10', 'amenities new']

# --- Combined preprocessor ---------------------------------------------------

# Each pipeline is paired with the column list of the same name. Previously the
# one-hot pipeline received `binary_feature` and the binary pipeline received
# `one_hot_feature`, i.e. the two lists were cross-wired.
# NOTE(review): if the cross-wiring was intentional, rename the lists instead.
# The text vectorizers take a single column name (a string, not a list) so
# they receive a 1-D sequence of documents.
transformer = ColumnTransformer([
    ('one hot encoder', one_hot_encoder_pipeline, one_hot_feature),
    ('binary encoder', binary_encoder_pipeline, binary_feature),
    ('numerical', numeric_pipeline, numeric_feature),
    ('reviews', reviews_pipeline, reviews_feature),
    ('text', TfidfVectorizer(), 'description10'),
    ('tags', CountVectorizer(), 'amenities new'),
])

In [None]:
# Report how many input columns feed the encoders/imputers.
# NOTE(review): the two text columns passed to the vectorizers are not
# included in this count -- confirm whether that is intended.
print('number of features used',
      sum(len(group) for group in (binary_feature, one_hot_feature, numeric_feature, reviews_feature)))

## Data Splitting

In [None]:
# Target is the listing price. X is the whole filtered frame (it still holds
# the 'price' column); the column lists given to `transformer` decide which
# columns are actually consumed.
y = air_bnb_prep.loc[:, 'price']
X = air_bnb_prep

In [None]:
# Hold out a test split; the remainder is used for training and
# cross-validation. The seed is fixed so the split is reproducible.
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, random_state=42)

## Modeling Benchmark

In [None]:
# Fit the preprocessor on the training portion only, then apply the fitted
# transformer to both splits.
X_train_val_preprocessed = transformer.fit(X_train_val).transform(X_train_val)
X_test_preprocessed = transformer.transform(X_test)

In [None]:
# Benchmark regressor: XGBoost with fixed hyperparameters and a fixed seed.
model = XGBRegressor(max_depth=3, learning_rate=0.1, n_estimators=200, random_state=42)

In [None]:
# Train the benchmark model on the preprocessed training data.
model.fit(X_train_val_preprocessed, y_train_val)

In [None]:
# Benchmark predictions for the preprocessed test split.
y_pred = model.predict(X_test_preprocessed)

In [None]:
def evaluation(model, X_train, X_test, y_train, y_test):
    """Print train/test R2 of a fitted model and plot its test residuals.

    Args:
        model: fitted estimator exposing ``predict``.
        X_train: features the model was trained on.
        X_test: held-out features.
        y_train: targets matching ``X_train``.
        y_test: targets matching ``X_test``.

    The scatter plot shows test predictions (x) against residuals
    ``y_test - prediction`` (y) for the model passed in.
    """
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)
    print('R2 test:',r2_score(y_test, y_pred_test))
    print('R2 train',r2_score(y_train, y_pred_train))
    # Use this model's own test predictions. The previous code read the
    # notebook-global `y_pred`, so every call plotted the benchmark model's
    # residuals regardless of which model was being evaluated.
    # x/y are passed by keyword because newer seaborn releases no longer
    # accept them positionally.
    sns.scatterplot(x=y_pred_test, y=y_test - y_pred_test)

In [None]:
# Score the benchmark model; inputs are the already-preprocessed matrices
# because `model` is the bare regressor, not a pipeline.
evaluation(model, X_train_val_preprocessed, X_test_preprocessed, y_train_val, y_test)

## Model Selection

In [None]:
# Candidate regressors, all limited to depth-3 trees.
model_tree = DecisionTreeRegressor(max_depth=3)

# AdaBoost built on top of the depth-3 tree defined above.
model_ada = AdaBoostRegressor(model_tree, n_estimators=200, learning_rate=0.1, random_state=10)

model_rf = RandomForestRegressor(n_estimators=200, max_depth=3, random_state=42)

model_gbc = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, max_depth=3, random_state=42)

model_xgb = XGBRegressor(n_estimators=200, learning_rate=0.1, max_depth=3, random_state=42)

# Preprocessing + regressor. The 'clf' step is only a placeholder here:
# the grid below substitutes each candidate model into it.
estimator = Pipeline([
    ('preprocess', transformer),
    ('clf', model),
])

# The only "hyperparameter" searched is which regressor fills the 'clf' step.
hyperparam_space = {
    'clf': [model_tree, model_rf, model_ada, model_gbc, model_xgb],
}

# Cross-validated comparison scored by R2 on all available cores; `cv` is not
# passed, so GridSearchCV's own default splitting applies.
grid_search = GridSearchCV(
    estimator,
    param_grid=hyperparam_space,
    scoring='r2',
    n_jobs=-1,
)

In [None]:
# Run the model-selection search on the raw training frame; preprocessing
# happens inside the pipeline for each CV fold.
grid_search.fit(X_train_val, y_train_val)

In [None]:
# Best mean cross-validated R2 and the regressor that achieved it.
print('best score',grid_search.best_score_)
print('best param',grid_search.best_params_)

In [None]:
# Tabulate the per-candidate cross-validation results.
cv_result_df = pd.DataFrame(data=grid_search.cv_results_)

In [None]:
# Express the mean CV R2 as a percentage. The value is recomputed from
# grid_search.cv_results_ rather than from the column itself, so re-running
# this cell does not multiply already-scaled scores by 100 again.
# Assumes cv_result_df still has one row per candidate in cv_results_ order.
cv_result_df['mean_test_score'] = grid_search.cv_results_['mean_test_score'] * 100
cv_result_df[['param_clf','mean_test_score']]

In [None]:
# Persist the mean test scores to 'model.csv'.
# NOTE(review): only the score column is written, so the candidate model is
# identifiable solely by row index -- confirm that is sufficient downstream.
cv_result_df[['mean_test_score']].to_csv(path_or_buf='model.csv')

## Hyperparameter Tuning for the Selected Model

In [None]:
# Fresh, unfitted XGBoost regressor to tune.
model_xgb = XGBRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)

# The pipeline wraps `model_xgb`. It previously wrapped the notebook-global
# `model` (the benchmark regressor fitted earlier), which left `model_xgb`
# unused and tied this cell to whatever `model` happened to be bound to.
estimator = Pipeline([
    ('preprocess', transformer),
    ('clf', model_xgb),
])

# Each dict pairs one learning rate with one tree count (smaller rate, more
# trees) and varies max_depth over 3, 4 and 5.
hyperparam_space = [
    {'clf__learning_rate': [0.1], 'clf__n_estimators': [200], 'clf__max_depth': [3, 4, 5]},
    {'clf__learning_rate': [0.05], 'clf__n_estimators': [400], 'clf__max_depth': [3, 4, 5]},
    {'clf__learning_rate': [0.01], 'clf__n_estimators': [2000], 'clf__max_depth': [3, 4, 5]},
    {'clf__learning_rate': [0.005], 'clf__n_estimators': [4000], 'clf__max_depth': [3, 4, 5]},
]

# Cross-validated search scored by R2 on all cores; `cv` is left at the
# GridSearchCV default.
grid_search_xgb = GridSearchCV(
    estimator,  # model to tune
    param_grid=hyperparam_space,  # hyperparameter space
    scoring='r2',  # metric
    n_jobs=-1,  # use all cores
)

In [None]:
# Run the XGBoost hyperparameter search on the raw training frame.
grid_search_xgb.fit(X_train_val, y_train_val)

In [None]:
# Best mean cross-validated R2 and the hyperparameter combination behind it.
print('best score',grid_search_xgb.best_score_)
print('best param',grid_search_xgb.best_params_)

## Comparison Before and After Hyperparameter Tuning

In [None]:
# Untuned baseline for the before/after comparison: XGBoost with the
# original fixed hyperparameters.
model_xgb = XGBRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)

# Wrap the freshly built `model_xgb`. The pipeline previously wrapped the
# notebook-global `model`, so fitting `estimator` refitted the benchmark
# object in place and `model_xgb` was never used.
estimator = Pipeline([
    ('preprocess', transformer),
    ('clf', model_xgb),
])

In [None]:
# Fit the untuned pipeline (preprocessing + regressor) on the training frame.
estimator.fit(X_train_val, y_train_val)

In [None]:
# Score the untuned pipeline; raw frames are passed because the pipeline
# preprocesses internally.
evaluation(estimator, X_train_val, X_test, y_train_val, y_test)

In [None]:
# Fit the best pipeline found by the search on the training frame.
# NOTE(review): `refit` is not overridden when grid_search_xgb is built, so
# the best estimator may already be fitted on this same data -- confirm
# whether this explicit refit is needed.
grid_search_xgb.best_estimator_.fit(X_train_val, y_train_val)

In [None]:
# Score the tuned pipeline on the same splits as the untuned one.
evaluation(grid_search_xgb.best_estimator_, X_train_val, X_test, y_train_val, y_test)

## Final Model Evaluation

In [None]:
# Refit the tuned pipeline on all filtered data (X, y), which includes the
# rows previously held out as X_test; scores computed on X_test after this
# point are no longer out-of-sample.
grid_search_xgb.best_estimator_.fit(X,y)

In [None]:
# Split the listings on the 'availablity_22_30 indicator' flag: rows with 0
# are kept unchanged, rows with 1 are adjusted in the following cells.
# Explicit copies are taken because `air_bnb_lower` has columns reassigned
# later; assigning into a filtered selection of `air_bnb` otherwise triggers
# pandas' SettingWithCopy warning and leaves it ambiguous which frame changes.
air_bnb_stay = air_bnb[air_bnb['availablity_22_30 indicator'] == 0].copy()
air_bnb_lower = air_bnb[air_bnb['availablity_22_30 indicator'] == 1].copy()

In [None]:
# Inspect the distribution of minimum_nights among the flagged listings.
air_bnb_lower['minimum_nights'].value_counts()

In [None]:
# Scenario: for flagged listings requiring more than 10 nights, lower the
# minimum stay by 3 nights; all other values are left as they are.
air_bnb_lower['minimum_nights'] = np.where(
    air_bnb_lower['minimum_nights'].gt(10),
    air_bnb_lower['minimum_nights'].sub(3),
    air_bnb_lower['minimum_nights'],
)

In [None]:
# Scenario: assume 10 fewer available (i.e. 10 more booked) days in the next
# 30 for the flagged listings, then inspect the resulting distribution.
# NOTE(review): nothing here prevents negative values -- confirm the flagged
# rows always have availability_30 >= 10.
air_bnb_lower['availability_30'] = air_bnb_lower['availability_30'].sub(10)
air_bnb_lower['availability_30'].value_counts()

In [None]:
# Floor for the new price: 80% of each listing's current price.
price_min = air_bnb_lower['price'].mul(0.8)
# Replace the price with the tuned model's prediction for the adjusted rows.
air_bnb_lower['price'] = grid_search_xgb.best_estimator_.predict(air_bnb_lower)
# Where the prediction falls below the floor, use the floor instead.
air_bnb_lower['price'] = np.where(
    price_min > air_bnb_lower['price'],
    price_min,
    air_bnb_lower['price'],
)

In [None]:
# Recombine the untouched and the adjusted listings (row order: stay, then
# lower; original index labels are kept). pd.concat replaces
# DataFrame.append, which was deprecated and then removed in pandas 2.0.
air_bnb_new = pd.concat([air_bnb_stay, air_bnb_lower])

In [None]:
# 30-day revenue under the scenario: price times the number of days that are
# not available (30 - availability_30), summed over all listings.
current_omzet_30 = (air_bnb_new['price'] * (30 - air_bnb_new['availability_30'])).sum()
print(current_omzet_30)

In [None]:
# Upper bound on 30-day revenue: every listing charged for all 30 days.
max_omzet_30 = (air_bnb_new['price'] * 30).sum()
print(max_omzet_30)

In [None]:
# Scenario revenue as a percentage of the 30-day maximum.
print(100 * current_omzet_30 / max_omzet_30)

In [None]:
# Pie chart of scenario revenue versus the remainder up to the 30-day maximum.
labels = ('Current Potential Income', 'Current Loss Income')
sizes = [current_omzet_30, max_omzet_30 - current_omzet_30]
explode = (0, 0.1)  # offset only the second ("loss") slice

fig1, ax1 = plt.subplots()
ax1.pie(
    sizes,
    labels=labels,
    explode=explode,
    autopct='%1.1f%%',
    startangle=90,
    shadow=True,
)
# Equal aspect ratio so the pie is drawn as a circle.
ax1.axis('equal')

plt.show()