## Learning to Rank

Pointwise: One instance of the set is considered at a time; any kind of classifier or regressor can be used to predict how relevant it is to the current query. Each point's predicted relevance is then used to order the set.

Pairwise: A pair of instances is chosen and the order of those two is predicted. Repeat this for each pair of the query to find the final order of the entire query.

Listwise: Many or all instances are considered at once. Try to find the optimal order.

- https://www.kaggle.com/code/prashant111/lightgbm-classifier-in-python
- https://www.kaggle.com/code/samratp/lightgbm-xgboost-catboost
- https://developer.nvidia.com/blog/learning-to-rank-with-xgboost-and-gpu/#:~:text=XGBoost%20is%20a%20widely%20used,descent%20using%20an%20objective%20function.
- https://medium.com/predictly-on-tech/learning-to-rank-using-xgboost-83de0166229d

In [92]:
pip install catboost

Note: you may need to restart the kernel to use updated packages.Collecting catboost

  Downloading catboost-1.0.6-cp38-none-win_amd64.whl (73.9 MB)
Collecting plotly
  Downloading plotly-5.8.2-py2.py3-none-any.whl (15.2 MB)
Collecting graphviz
  Downloading graphviz-0.20-py3-none-any.whl (46 kB)
Collecting tenacity>=6.2.0
  Downloading tenacity-8.0.1-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, plotly, graphviz, catboost
Successfully installed catboost-1.0.6 graphviz-0.20 plotly-5.8.2 tenacity-8.0.1


In [90]:
import pandas as pd
import numpy as np

from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_absolute_error

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import regularizers

import matplotlib.pyplot as plt
import seaborn as sns

ModuleNotFoundError: No module named 'xgboost'

# Data Preparation (Pre-processing)

In [93]:
import pandas as pd

# Load the raw hotel listing and keep a separate working copy (`df_cp`) for
# the per-category facility-list experiment below.
df = pd.read_csv('hotels3.csv')
df_cp = df.copy()

# Preview goes last so the notebook actually renders it.
df.head()

In [None]:
facilities_columns = ['Food and Drinks','Hotel Services','In-room Facilities', 'Business Facilities', 'Nearby Facilities',
                      'Public Facilities', 'General', 'Things to Do', 'Accessibilty', 'Connectivity', 'Transportation',
                      'Kids and Pets', 'Sports and Recreations', 'Shuttle Service']

# `ind` collects one {category header -> [sub-facility, ...]} dict per hotel,
# in row order.
ind = []
for index, row in df_cp.iterrows():
    facils = [facil for facil in row['Facil + Akomod'].splitlines() if facil != '']

    # A fresh dict per row: reusing a single dict would make every entry of
    # `ind` point at the same object, holding only the last hotel's data.
    list_subfacil = {}
    main_facil = None
    for facil in facils:
        if facil in facilities_columns:
            # A category header opens a new (initially empty) group.
            main_facil = facil
            list_subfacil[main_facil] = []
        elif main_facil is not None:
            list_subfacil[main_facil].append(facil)
        # Lines that appear before any header belong to no category and are skipped.
    ind.append(list_subfacil)

# print(ind)

In [None]:
# Store each hotel's sub-facilities as a newline-joined string in a
# "<category>_list" column of the working copy. The enumerate position is
# used as the row label.
for row_label, groups in enumerate(ind):
    for header, subfacilities in groups.items():
        column = "{}_list".format(header)
        df_cp.at[row_label, column] = '\n'.join(subfacilities)

In [None]:
# Display the working copy, including any "<category>_list" columns written to it.
df_cp

In [None]:
# The "<category>_list" columns are written to the working copy `df_cp`,
# never to `df`, so the column has to be read from `df_cp`.
df_cp['Food and Drinks_list']

In [None]:
# Scratch check: a dict whose 'facil' entry is a list holding one item.
dicti = {'facil': ['ha']}

dicti['facil']

In [99]:
facilities_columns = ['Food and Drinks','Hotel Services','In-room Facilities', 'Business Facilities', 'Nearby Facilities', 'Public Facilities', 'General', 'Things to Do', 'Accessibilty', 'Connectivity', 'Transportation', 'Kids and Pets', 'Sports and Recreations', 'Shuttle Service']

# Headers are only ever tested for membership, so a set is enough. The list
# itself is left untouched, which keeps this cell safe to re-run.
facility_headers = set(facilities_columns)

for index, row in df.iterrows():
    # Non-empty lines of the combined facilities text.
    arr = [ar for ar in row['Facil + Akomod'].splitlines() if ar != '']

    # Walk the lines bottom-up: every non-header line seen since the previous
    # header belongs to the header reached next, so its count is written to
    # that header's column and the counter restarts.
    count = 0
    for item in reversed(arr):
        if item in facility_headers:
            df.at[index, item] = count
            count = 0
        else:
            count += 1

In [101]:
# Spot-check one facility-count column; NaN marks rows where no count was written.
df['Shuttle Service']

0     1.0
1     NaN
2     1.0
3     NaN
4     1.0
     ... 
84    1.0
85    1.0
86    NaN
87    1.0
88    1.0
Name: Shuttle Service, Length: 89, dtype: float64

In [None]:
# Replace every NaN currently in `df` with 0 (applies to all columns, not only
# the facility counts).
df = df.fillna(0)

In [94]:
def _distance_to_meters(line):
    """Parse a '<integer> <unit>' line into whole meters, or return None.

    A line counts as a distance only when its first token is an unsigned
    integer and a second (unit) token follows. A unit of "km" is scaled by
    1000; any other unit is taken as meters, as in the original cell.
    """
    tokens = line.split()
    if len(tokens) < 2 or not tokens[0].isdigit():
        return None
    value = int(tokens[0])
    # Numeric multiplication: multiplying the *string* by 1000 would repeat
    # its digits 1000 times instead of converting kilometres to metres.
    return value * 1000 if tokens[1] == "km" else value


for index, row in df.iterrows():
    # One entry per line of the "Places Nearby" text.
    arr = row['Places Nearby'].splitlines()

    # `pos` rather than `ind`, so the list named `ind` built by the
    # facility-parsing cell is not overwritten by this loop.
    for pos, item in enumerate(arr):
        meters = _distance_to_meters(item)
        # The line just above a distance is used as the column name; the very
        # first line has no predecessor (arr[-1] would wrap to the last line).
        if meters is None or pos == 0:
            continue
        df.at[index, arr[pos - 1]] = meters

In [95]:
# List the column labels of `df` after the parsing steps.
print(df.columns)

Index(['Hotel', 'Star', 'Rating', 'Reviews', 'Harga', 'Places Nearby',
       'Facil + Akomod', 'Fast Food', 'Shop & Gifts', 'Business',
       'Transportation Hub', 'Casual Dining', 'Nightlife', 'Park & Zoo',
       'Public Service', 'Arts & Sciences', 'Fine Dining', 'Sport',
       'Quick Bites', 'Education', 'Street Food', 'Activity & Games', 'Cafe',
       'Entertainment', 'Food Court', 'Sight & Landmark'],
      dtype='object')


In [None]:
def _parse_localized_int(series):
    """Convert strings such as '1.234.567,89' to integers.

    Assumes '.' is a thousands separator and ',' the decimal mark, which is
    what the two replacements imply. Both replacements are literal
    (`regex=False`): as a regular expression, '.' matches every character
    and would wipe the whole string.
    """
    cleaned = (
        series.str.replace('.', '', regex=False)
              .str.replace(',', '.', regex=False)
    )
    # Go through float first so values with a decimal part truncate to int.
    return cleaned.astype(float).astype(int)


df['Harga'] = _parse_localized_int(df['Harga'])
df['Reviews'] = _parse_localized_int(df['Reviews'])

In [None]:
# Replace every remaining NaN in `df` with 10000.
# NOTE(review): presumably a "far away" distance in meters for nearby-place
# categories a hotel does not list; it is applied to all columns. Confirm.
df = df.fillna(10000)

In [None]:
# Convert every object-dtype column to numbers; anything that cannot be
# parsed becomes NaN.
c = df.select_dtypes(include=object).columns
df[c] = df[c].apply(lambda column: pd.to_numeric(column, errors='coerce'))

In [None]:
# Inspect the per-column dtypes after the numeric coercion.
df.dtypes

In [None]:
# Preview the first rows of the cleaned frame.
df.head()

# Modelling

## Declare feature vector and target variable

In [None]:
# Summary of the dataset: column names, non-null counts and dtypes.
df.info()

## Data Splitting

In [None]:
# Feature matrix: hotel basics, then the per-category facility counts, then
# the nearby-place columns. The target is the hotel's rating.
feature_columns = [
    'Star', 'Reviews', 'Harga',
    'Shuttle Service', 'Sports and Recreations', 'Kids and Pets',
    'Transportation', 'Connectivity', 'Accessibilty', 'Things to Do',
    'General', 'Public Facilities', 'Nearby Facilities',
    'Business Facilities', 'In-room Facilities', 'Hotel Services',
    'Food and Drinks',
    'Fast Food', 'Shop & Gifts', 'Business', 'Transportation Hub',
    'Casual Dining', 'Nightlife', 'Park & Zoo', 'Public Service',
    'Arts & Sciences', 'Fine Dining', 'Sport', 'Quick Bites', 'Education',
    'Street Food', 'Activity & Games', 'Cafe', 'Entertainment',
    'Food Court', 'Sight & Landmark',
]
X = df[feature_columns]
y = df['Rating']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible.
dev_X, val_X, dev_y, val_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Light GBM

In [None]:
#import lightgbm as lgb
#clf = lgb.lambdarank()
#clf.fit(X_train, y_train)

def run_lgb(X_train, X_test, y_train, y_test):
    """Train a LightGBM regressor with early stopping on a validation set.

    NOTE: the arguments are consumed by *position*, in the order the notebook
    passes them -- (train features, train labels, validation features,
    validation labels) -- which is also the order `run_xgb` declares. The
    parameter names are historical and do not describe that order.

    Returns:
        (model, evals_result): the trained booster and a dict of the metric
        history recorded for each evaluation set.
    """
    # Use the data handed in instead of notebook globals. Training on the
    # full `X` would include the validation rows and make early stopping
    # meaningless.
    train_X, train_y, val_X, val_y = X_train, X_test, y_train, y_test

    params = {
        "objective": "regression",
        "metric": "rmse",
        "num_leaves": 40,
        "learning_rate": 0.004,
        "bagging_fraction": 0.6,
        "feature_fraction": 0.6,
        "bagging_freq": 6,  # LightGBM's name; "bagging_frequency" is not recognised
        "bagging_seed": 42,
        "verbosity": -1,
        "seed": 42,
    }

    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    evals_result = {}

    # Callbacks replace the early_stopping_rounds / verbose_eval /
    # evals_result keyword arguments, which newer LightGBM releases no
    # longer accept.
    model = lgb.train(
        params,
        lgtrain,
        num_boost_round=5000,
        valid_sets=[lgtrain, lgval],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=150),
            lgb.record_evaluation(evals_result),
        ],
    )

    return model, evals_result

In [None]:
# Train LightGBM. Arguments are passed positionally in the order
# (train features, train labels, validation features, validation labels).
model, evals_result = run_lgb(dev_X, dev_y, val_X, val_y)
print("LightGBM Training Completed...")

In [None]:
# Predict for the full feature matrix `X` (training and validation rows alike).
y_pred = model.predict(X)
print(y_pred)

In [None]:
'''
# feature importance
print("Features Importance...")
gain = model.feature_importance('Rating')
featureimp = pd.DataFrame({'Rating':model.feature_name(), 
                   'split':model.feature_importance('split'), 
                   'Rating':100 * gain / gain.sum()}).sort_values('Rating', ascending=False)
print(featureimp[:50])
'''

## XGBoost























In [None]:
def run_xgb(train_X, train_y, val_X, val_y):
    """Train an XGBoost regressor with early stopping on the validation set.

    Args:
        train_X, train_y: training features and labels.
        val_X, val_y: validation features and labels, used for monitoring
            and early stopping.

    Returns:
        The trained booster returned by `xgb.train`.
    """
    params = {
        'objective': 'reg:squarederror',  # current name of the deprecated 'reg:linear'
        'eval_metric': 'rmse',
        'eta': 0.001,
        'max_depth': 10,
        'subsample': 0.6,
        'colsample_bytree': 0.6,
        'alpha': 0.001,
        'random_state': 42,
        'verbosity': 0,  # replaces the deprecated 'silent': True
    }

    tr_data = xgb.DMatrix(train_X, label=train_y)
    va_data = xgb.DMatrix(val_X, label=val_y)

    # Both sets are evaluated during training and reported every 100 rounds.
    watchlist = [(tr_data, 'train'), (va_data, 'valid')]

    model_xgb = xgb.train(
        params,
        tr_data,
        num_boost_round=2000,
        evals=watchlist,
        maximize=False,
        early_stopping_rounds=100,
        verbose_eval=100,
    )

    return model_xgb

In [None]:
# Train XGBoost on the development split, monitoring the validation split.
model_xgb = run_xgb(dev_X, dev_y, val_X, val_y)
print("XGB Training Completed...")

## Catboost

In [None]:
# CatBoost regressor configuration, collected in one place so the settings
# are easy to scan. Metrics are reported every 50 iterations.
cb_params = dict(
    iterations=500,
    learning_rate=0.05,
    depth=10,
    eval_metric='RMSE',
    random_seed=42,
    bagging_temperature=0.2,
    od_type='Iter',
    metric_period=50,
    od_wait=20,
)
cb_model = CatBoostRegressor(**cb_params)

In [None]:
# Fit on the development split and keep the iteration that scores best on the
# validation split.
cb_model.fit(dev_X, dev_y,
             eval_set=(val_X, val_y),
             use_best_model=True,
             verbose=50)

# The target was never log1p-transformed, so the predictions are already on
# the rating scale. Applying np.expm1 to them would distort every value.
pred_test_cat = cb_model.predict(X)
print(pred_test_cat)

## Tensorflow Regression

In [None]:
# Remove the identifier and raw-text columns before the neural-network
# experiments. errors='ignore' keeps the cell safe to re-run after the
# columns are already gone.
df = df.drop(columns=['Hotel', 'Places Nearby', 'Facil + Akomod'], errors='ignore')
df.head()

In [None]:
# Both names are plain aliases of `df`: no copy is made, so a change made
# through any of the three names is visible through the others.
df2 = df
df3 = df

## Data Preparation

In [None]:
# Label-based slice: every row up to and including index label 70 is used for
# training; the remaining rows form the test set.
train_dataset = df.loc[:70]
test_dataset = df.drop(index=train_dataset.index)

# Work on copies so popping the target leaves the two datasets intact.
train_features, test_features = train_dataset.copy(), test_dataset.copy()

train_labels = train_features.pop('Rating')
test_labels = test_features.pop('Rating')

In [None]:
# `info` is a method and must be called; it prints its own summary.
# (Printing the attribute only shows the bound-method repr.)
train_features.info()

# TENSORFLOW PIPELINE

In [None]:
# Normalization layer whose statistics are computed from the training
# features; the printed array is the mean it learned.
normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(np.array(train_features))
print(normalizer.mean.numpy())

In [None]:
# Hyperparameters read by the model builders, the optimizer and the
# learning-rate scheduler.
regularizer = 1e-6   # L2 penalty factor for every hidden Dense layer
dropout = 0          # dropout rate after every hidden Dense layer
schedul = -1e-4      # exponent used by the scheduler: lr *= exp(schedul)
lr = 1e-3            # initial learning rate

In [None]:
def _build_mlp(norm, hidden_units):
    """Build `norm -> [Dense(relu, L2) -> Dropout] per width -> Dense(1)`.

    Args:
        norm: the first layer of the network (the notebook passes a
            Normalization layer).
        hidden_units: width of each hidden Dense layer, in order.

    The L2 factor and dropout rate come from the notebook-level
    `regularizer` and `dropout` values at call time.
    """
    stack = [norm]
    for units in hidden_units:
        stack.append(layers.Dense(units, activation='relu',
                                  kernel_regularizer=regularizers.l2(regularizer)))
        stack.append(layers.Dropout(dropout))
    stack.append(layers.Dense(1))  # single linear output for regression
    return keras.Sequential(stack)


def mikro_model(norm):
    """Two hidden layers of 8 units each."""
    return _build_mlp(norm, (8, 8))


def small_model(norm):
    """Two hidden layers of 16 units each."""
    return _build_mlp(norm, (16, 16))


def medium_model(norm):
    """Hidden layers of 64 and 16 units."""
    return _build_mlp(norm, (64, 16))


def large_model(norm):
    """Three hidden layers of 128 units each."""
    return _build_mlp(norm, (128, 128, 128))


def scale_model(norm):
    """Seven hidden layers: 4 x 512, 2 x 128, then 64 units."""
    return _build_mlp(norm, (512, 512, 512, 512, 128, 128, 64))

## OPTIMIZER

In [None]:
# Adam optimizer configured with the notebook-level learning rate `lr`.
optimizer2 = tf.optimizers.Adam(learning_rate=lr)

## SCHEDULER

In [None]:
def scheduler(epoch, lr):
    """Learning-rate schedule: constant for 10 epochs, then exponential decay.

    Args:
        epoch: zero-based epoch index supplied by the callback.
        lr: the current learning rate.

    Returns:
        `lr` unchanged while `epoch < 10`; afterwards `lr * exp(schedul)`,
        where `schedul` is the notebook-level decay exponent. The result is
        a plain Python float rather than a tensor.
    """
    import math  # local import keeps this cell self-contained

    if epoch < 10:
        return lr
    return lr * math.exp(schedul)

In [None]:
def DNN_Pipeline(model):
    """Compile and fit `model`, plot its loss curves, and return it.

    The model is trained for 100 epochs on the notebook-level
    `train_features` / `train_labels` with a 20% validation split, using
    mean absolute error as the loss and `scheduler` to adjust the learning
    rate. The last ten rows of the training history are printed.

    Args:
        model: an uncompiled Keras model.

    Returns:
        The same model object, now trained.
    """
    # A fresh optimizer per model: one shared Adam instance would carry its
    # step count and scheduler-adjusted learning rate from one training run
    # into the next.
    model.compile(optimizer=tf.optimizers.Adam(learning_rate=lr),
                  loss='mean_absolute_error')

    history = model.fit(
        train_features,
        train_labels,
        validation_split=0.2,
        callbacks=[tf.keras.callbacks.LearningRateScheduler(scheduler)],
        verbose=0,
        epochs=100,
    )

    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch

    # An explicit figure per call keeps the curves of different models apart.
    fig, ax = plt.subplots()
    ax.plot(history.history['loss'], label='loss')
    ax.plot(history.history['val_loss'], label='val_loss')
    ax.set_ylim([0, 2])
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Error [MAE]')  # the loss is mean absolute error
    ax.legend()
    ax.grid(True)
    print(hist.tail(10))

    return model

In [None]:
# Build and train the two-layer, 16-unit network.
small = DNN_Pipeline(small_model(normalizer))

In [None]:
# Build and train the 64/16-unit network.
medium = DNN_Pipeline(medium_model(normalizer))

In [None]:
# Build and train the three-layer, 128-unit network.
large = DNN_Pipeline(large_model(normalizer))

In [None]:
# Build and train the two-layer, 8-unit network.
mikro = DNN_Pipeline(mikro_model(normalizer))

In [None]:
# Build and train the deepest network (seven hidden layers).
scale = DNN_Pipeline(scale_model(normalizer))

In [None]:
# Score the held-out features with every trained network, in the fixed order
# mikro, small, medium, large, scale.
mikro_predict, small_predict, medium_predict, large_predict, scale_predict = (
    net.predict(test_features)
    for net in (mikro, small, medium, large, scale)
)

In [None]:
# Test-set MAE of the mikro network (arguments in y_true, y_pred order).
mean_absolute_error(test_labels, mikro_predict)

In [None]:
# Test-set MAE of the small network (arguments in y_true, y_pred order).
mean_absolute_error(test_labels, small_predict)

In [None]:
# Test-set MAE of the medium network (arguments in y_true, y_pred order).
mean_absolute_error(test_labels, medium_predict)

In [None]:
# Test-set MAE of the large network (arguments in y_true, y_pred order).
mean_absolute_error(test_labels, large_predict)

In [None]:
# Test-set MAE of the scale network (arguments in y_true, y_pred order).
mean_absolute_error(test_labels, scale_predict)

In [None]:
# Raw test-set predictions of the scale network.
print(scale_predict)

In [None]:
# Held-out target values, for comparison with the predictions.
print (test_labels)

In [None]:
# Predictions of the medium network on the training features.
print(medium.predict(train_features))

# RANKING WITH REGRESSION

In [None]:
# Rank = 1-based position of each hotel in the frame, derived from the row
# count instead of a hard-coded 89.
# NOTE(review): this presumes the rows are already in rank order. Confirm.
a = list(range(1, len(df) + 1))

# Work on a copy so the extra 'Rank' column does not leak back into `df`.
df2 = df.copy()
df2['Rank'] = a

df2.head()

In [None]:
# Same label-based split as before, now on the frame carrying 'Rank': rows up
# to and including index label 70 train, the rest test.
train_dataset = df2.loc[:70]
test_dataset = df2.drop(index=train_dataset.index)

# Copies, so popping the target leaves the two datasets intact.
train_features, test_features = train_dataset.copy(), test_dataset.copy()

train_labels = train_features.pop('Rank')
test_labels = test_features.pop('Rank')

In [None]:
# Second normalization layer, adapted to the rank-task training features.
normalizer2 = tf.keras.layers.Normalization(axis=-1)
normalizer2.adapt(np.array(train_features))
# Print the mean of the layer just adapted, not of the first `normalizer`.
print(normalizer2.mean.numpy())

In [None]:
# Rebind `large` to a new network built on `normalizer2` and trained on the
# current train_features / train_labels.
large = DNN_Pipeline(large_model(normalizer2))


In [None]:
# Rebind `scale` to a new network built on `normalizer2` and trained on the
# current train_features / train_labels.
scale = DNN_Pipeline(scale_model(normalizer2))


In [None]:
# Rebind `medium` to a new network built on `normalizer2`; the cells below
# use this model for scoring and export.
medium = DNN_Pipeline(medium_model(normalizer2))

In [None]:
# Score the held-out hotels. The copy keeps `test_dataset` free of the extra
# column, and ravel() flattens the (n, 1) prediction array to one value per row.
df_rank = test_dataset.copy()
df_rank['score'] = medium.predict(test_features).ravel()
df_rank

In [None]:
# Order the scored rows by predicted score, ascending.
new_sorted = df_rank.sort_values(by=['score'])

# Rank error per item (0 is perfect)

In [None]:
# Positions 72..89 are the ranks the sorted test rows would have if the
# ordering were perfect; subtracting the actual ranks position by position
# gives the per-item error.
b = list(range(72, 90))

c = b - new_sorted['Rank']

print(c)

In [None]:
# Index labels of the scored rows.
print(df_rank.index)

In [None]:
# True ranks of the held-out rows.
test_labels

## Rank Train Datasets

In [None]:
# Predicted scores of the medium network for the training features.

medium.predict(train_features)

In [None]:
# Score the training hotels. `train_dataset` is a slice of the full frame, so
# take a copy before adding a column; ravel() flattens the (n, 1) predictions.
df_rank = train_dataset.copy()
df_rank['score'] = medium.predict(train_features).ravel()

# Order by predicted score, ascending, and show the result.
new_sorted2 = df_rank.sort_values(by=['score'])
new_sorted2

# Rank error per item (0 is perfect)

In [None]:
# Positions 1..71 are the ranks the sorted training rows would have if the
# ordering were perfect; the difference to the actual rank is printed per item.
K = list(range(1, 72))

c = K - new_sorted2['Rank']

for rank_error in c:
    print(rank_error)

## Export the model

In [None]:
# Persist the model currently bound to `medium` to an HDF5 file.
medium.save('TestV1.h5')