In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor,
    AdaBoostRegressor,
)
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.naive_bayes import BernoulliNB
from xgboost import XGBRegressor

# ruff .toml
import pandas as pd
import numpy as np
import json

from rich import print
from rich.table import Table

In [22]:
# Load the room listings from the JSON export into a DataFrame.
# The encoding is given explicitly: JSON is UTF-8, while open() otherwise
# falls back to the platform locale (not UTF-8 on Windows), which would
# garble or reject non-ASCII text.
with open("Booking_Hotels_Paris_cleaned.json", "r", encoding="utf-8") as read_content:
    # Build the frame directly so `df` is never bound to the raw parsed JSON.
    df = pd.DataFrame(json.load(read_content))

In [23]:
# Keep only the rows that have a value in every column.
df = df.dropna(axis="index", how="any")

In [8]:
# Empty rich table with one column each for the variable name, its dtype and
# its missing-value count; rows are added by the loop in the next cell.
summary = Table('Variable', 'type', 'NA')

In [9]:
# Build the per-column overview: name, dtype and number of missing values.
# The table is (re)created here so that re-running this cell does not append
# a second copy of every row to a table created in another cell.
summary = Table('Variable', 'type', 'NA')

# Count the missing values of every column in a single pass.
for col, n_missing in df.isnull().sum().items():
    summary.add_row(
        str(col),
        str(df[col].dtype),
        # '/' marks a column without any missing value.
        str(n_missing) if n_missing != 0 else '/',
    )
summary

In [17]:
# Collect every distinct facility name, in first-seen order.
# The column is iterated by value instead of being looked up with
# df["Hotel_facilities"][i]: after dropna() the index can have gaps, so a
# label lookup on 0..len(df)-1 raises KeyError or hits the wrong row.
# dict.fromkeys de-duplicates while preserving order, without the repeated
# linear `in` scans of a list.
list_facilities = list(
    dict.fromkeys(
        facility
        for hotel_facilities in df["Hotel_facilities"]
        for facility in hotel_facilities
    )
)

# Identifier, free-text and nested columns dropped before modelling.
df_updated = df.drop(
    [
        "Room_id",
        "Room_name",
        "Room_promo",
        "Room_breakfast",
        "Room_cancellation",
        "Room_prepayment",
        "Hotel_id",
        # "Hotel_name",
        "Hotel_Name",
        "Hotel_address",
        "Hotel_type",
        "Hotel_facilities",
        "Hotel_categories",
        "Hotel_Street",
        "Hotel_City",
        "Hotel_Country",
    ],
    axis=1,
)
# Drop the per-facility columns from the reduced frame as well.
# Previously they were dropped from `df`, which left `df_updated` unused and
# silently mutated `df` for every later cell.
# NOTE(review): assumes every facility name is a column of the frame, as the
# original code did — a missing one raises KeyError.
df_updated = df_updated.drop(columns=list_facilities)

# Target and features both come from the reduced frame, so X no longer
# carries the identifier / text / dict columns.
y = df_updated["Room_price"]
X = df_updated.drop("Room_price", axis=1)

X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=85)

In [24]:
# Explicit list of the predictors fed to the models.
feature_columns = [
    "Room_sleeps",
    "Room_size",
    "Hotel_grade",
    "Hotel_nb_reviews",
    "Hotel_stars",
    "Lift_bin",
    "Luggage storage_bin",
    "No parking available._bin",
    "Staff",
    "Facilities",
    "Cleanliness",
    "Comfort",
    "Value for money",
    "Location",
    "Free WiFi",
    "Room_promo_bin",
    "Room_cancellation_bin",
    "Room_prepayment_bin",
    "Room_breakfast_bin",
    "Room_breakfast_price",
]

# Target first, then the feature matrix restricted to the columns above.
y = df["Room_price"]
X = df[feature_columns]

# Fixed seed so the train/test partition is the same on every run.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=85)

In [25]:
# Display the column labels of the feature matrix X.
X.columns

Index(['Room_sleeps', 'Room_size', 'Hotel_grade', 'Hotel_nb_reviews',
       'Hotel_stars', 'Lift_bin', 'Luggage storage_bin',
       'No parking available._bin', 'Staff', 'Facilities', 'Cleanliness',
       'Comfort', 'Value for money', 'Location', 'Free WiFi', 'Room_promo_bin',
       'Room_cancellation_bin', 'Room_prepayment_bin', 'Room_breakfast_bin',
       'Room_breakfast_price'],
      dtype='object')

In [27]:
# Linear regression fitted on the training split.
lr = LinearRegression()
lr.fit(X_tr, y_tr)
# fit() returns the estimator itself, so both names refer to the same fitted model.
lr_final = lr

In [40]:
# Exhaustive grid search over the random-forest hyper-parameters, scored with
# GridSearchCV's default cross-validation on the training split.
# The forest is seeded (same seed as the train/test split) so that repeated
# runs select the same parameters and report the same scores.
rfr = RandomForestRegressor(random_state=85)
rfr_gs = GridSearchCV(
    rfr,
    {
        "n_estimators": (16, 32, 64, 128, 256),
        "max_depth": (1, 10, 50, 100, None),
        "min_samples_leaf": (1, 2, 5, 10),
        # 'auto' is intentionally absent: for a regressor it meant "use all
        # features", i.e. exactly what None already covers, and scikit-learn
        # deprecated it in 1.1 and removed it in 1.3, where it is rejected.
        "max_features": ["sqrt", "log2", None],
    },
)
# fit() returns the fitted search object itself.
rfr_gs_final = rfr_gs.fit(X_tr, y_tr)

In [41]:
# Show the best score found by the grid search together with the parameter
# combination that produced it.
rfr_gs_final.best_score_, rfr_gs_final.best_params_

(0.9627612743059915,
 {'max_depth': 50, 'min_samples_leaf': 1, 'n_estimators': 128})

In [30]:
# Comparison table: one row per fitted model, evaluated on the training split.
overview = Table(
    "Model",
    "Train score",
    "Mean CV score",
    "Dispersion CV score",
    "Best score",
    "Best params",
    title="Synthèse des modèles",
)

# Fitted models to compare; commented entries are disabled candidates.
models = [
    lr_final,
    # gpr_final,
    rfr_gs_final,
    # svr_gs_final,
    # mlp_gs_final,
    # naive_gs_final,
    # log_gs_final,
    # xgb_gs_final,
    # gb_gs_final,
    # adb_gs_final,
]

# Numeric mean CV score of each model, kept alongside the table so that the
# best model can be chosen without parsing the table's string cells.
mean_cv_scores = []

for model in models:
    # Type checks instead of substring matching on str(model).
    if isinstance(model, GridSearchCV):
        base_estimator = model.estimator
        if isinstance(base_estimator, Pipeline):
            # Report the final step of the pipeline (the actual model).
            base_estimator = base_estimator[-1]
        model_name = str(base_estimator)
        best_score = model.best_score_
        best_params = model.best_params_
    else:
        model_name = str(model)
        best_score = "/"
        best_params = "/"

    # NOTE: for a GridSearchCV object this re-runs the whole search inside
    # each of the 5 folds (nested cross-validation), which can be slow.
    cv_scores = cross_val_score(model, X_tr, y_tr, cv=5)
    mean_cv_scores.append(cv_scores.mean())
    overview.add_row(
        str(model_name),
        str(model.score(X_tr, y_tr)),
        str(cv_scores.mean()),
        str(cv_scores.std()),
        str(best_score),
        str(best_params),
    )
print(overview)

# Select the model with the highest mean cross-validated score.
# The previous loop compared the *stringified* train scores of neighbouring
# rows (row 0 against the last row), could leave `best_row` undefined, and
# ranked models on their training score, which rewards over-fitting.
# nanargmax ignores models whose CV score is NaN (e.g. failed fits).
best_row = int(np.nanargmax(mean_cv_scores))
best_model = models[best_row]
print(best_model)

In [39]:
# Display the class of the model selected as best_model.
type(best_model)

sklearn.model_selection._search.GridSearchCV

In [38]:
# Compare the tuned random forest's predictions with the true prices of the
# test split for a hand-picked set of rooms.
y_true, y_pred = y_te, rfr_gs_final.predict(X_te)

true = np.array(y_true)
# Predictions are rounded to one decimal before the error is computed.
pred = np.around(np.array(y_pred), decimals=1)

# Relative error of each prediction, in percent of the true value.
ecart = np.around((pred - true) / true * 100, decimals=2)

t = Table(
    "Réalité",
    "Prédiction",
    "Ecart",
    title="Résulat de la prédiction",
    show_header=True,
)

# Rooms to display, identified by their Room_id.
list_id = ["112030409_91947049_0_2_0", "112030415_213798395_0_1_0"]

# `true`, `pred` and `ecart` are positional arrays over the test split only,
# so labels from df.index cannot be used to index them. Instead, look up the
# Room_id of each test row (in test order) and keep the *positions* that match.
# isin() performs the membership test; `== list_id` would attempt an
# element-wise comparison between the column and the list.
# NOTE(review): assumes df's index is unique and still carries Room_id.
is_selected = df.loc[y_true.index, "Room_id"].isin(list_id)
x = [position for position, selected in enumerate(is_selected) if selected]

# The table stays empty when none of the listed rooms fell into the test split.
for i in x:
    t.add_row(
        f"{true[i]:.2f}",
        f"{pred[i]:.2f}",
        f"{ecart[i]:.2f} %",
    )
print(t)

SyntaxError: invalid syntax (Temp/ipykernel_25048/690053080.py, line 29)

In [None]:
"112030409_91947049_0_2_0"