In [4]:
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split,GridSearchCV
#from skopt import BayesSearchCV
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder,StandardScaler,LabelEncoder
import joblib
import pandas as pd
import numpy as np  

In [6]:
Tourism_df = pd.read_csv('Dataset_Tourism_final.csv')

In [7]:
# Selecting Relevant Feature for model training include 

# Select Features
selected_features = ["VisitYear","VisitMonth","VisitModeName","AttractionId","Attraction","AttractionType","CountryId","RegionId"]
# ENCODING CATEGORICAL FEATURES USING ONE-HOT ENCODING AND TARGET ENCODING  
categorical_features = ["VisitModeName", "AttractionType"]

df_selected = Tourism_df[selected_features + ["Rating"]].copy()

ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
encoded_features = ohe.fit_transform(df_selected[categorical_features])
encoded_df = pd.DataFrame(encoded_features, columns=ohe.get_feature_names_out(categorical_features))
# CONVERTING BOOLEAN COLUMNS TO INTEGER TYPE FOR COMPATIBILITY  
bool_cols = df_selected.select_dtypes(include=["bool"]).columns
df_selected[bool_cols] = df_selected[bool_cols].astype(int)

# DROPPING ORIGINAL CATEGORICAL COLUMNS AFTER ENCODING AND CONCATENATING ENCODED FEATURES  
target_enc = TargetEncoder()
df_selected["Attraction"] = target_enc.fit_transform(df_selected["Attraction"], df_selected["Rating"])

df_selected = df_selected.drop(columns=categorical_features)
df_selected = pd.concat([df_selected, encoded_df], axis=1)

In [8]:
#Selecting Features (X) any target (Y)
# SEPARATING FEATURES (X) AND TARGET VARIABLE (Y) FOR MODEL TRAINING  
X = df_selected.drop(columns=["Rating"])
y = df_selected["Rating"]

In [9]:
# APPLYING STANDARD SCALER TO NORMALIZE THE FEATURE MATRIX (X) FOR BETTER MODEL PERFORMANCE  
scaler = StandardScaler()
x = scaler.fit_transform(X)

In [10]:
# SPLITTING THE DATA INTO TRAINING (80%) AND TESTING (20%) SETS WITH A FIXED RANDOM STATE FOR REPRODUCIBILITY  
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [12]:
# Hyperparameter Tuning
param_grid = {
    "max_depth": [5, 10, 15, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 3, 5]
}

grid_search = GridSearchCV(DecisionTreeRegressor(random_state=42), param_grid, cv=5, scoring="r2", n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best Model
best_dt = grid_search.best_estimator_

# ROUNDING PREDICTIONS TO THE NEAREST INTEGER AND CLIPPING VALUES TO ENSURE THEY FALL WITHIN THE VALID RATING RANGE (1 TO 5)  
# CALCULATING MEAN ABSOLUTE ERROR (MAE) AND ROOT MEAN SQUARED ERROR (RMSE) TO EVALUATE MODEL PERFORMANCE  
dt_pred = best_dt.predict(X_test)
dt_pred = np.round(dt_pred).astype(int)
dt_pred = np.clip(dt_pred, 1, 5)

# Evaluation Metrics
dt_mae = mean_absolute_error(y_test, dt_pred)
dt_rmse = np.sqrt(mean_squared_error(y_test, dt_pred))

print(f"Optimized Decision Tree Regressor → MAE: {dt_mae:.2f}, RMSE: {dt_rmse:.2f}")

Optimized Decision Tree Regressor → MAE: 0.67, RMSE: 0.93


In [13]:
# PERFORMING RANDOMIZED SEARCH CROSS-VALIDATION TO FIND THE BEST HYPERPARAMETERS FOR THE RANDOM FOREST REGRESSOR  
# SEARCHING OVER DIFFERENT VALUES FOR NUMBER OF ESTIMATORS, MAX DEPTH, MIN SAMPLES SPLIT, MIN SAMPLES LEAF, AND MAX FEATURES  
# SELECTING THE BEST MODEL BASED ON MINIMIZING MEAN ABSOLUTE ERROR (MAE)  
rf = RandomForestRegressor(random_state=42)

param_dist = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

rf_random = RandomizedSearchCV(rf, param_dist, n_iter=20, cv=5, scoring='neg_mean_absolute_error', random_state=42, n_jobs=-1)
rf_random.fit(X_train, y_train)

best_rf = rf_random.best_estimator_

# EVALUATING THE BEST RANDOM FOREST MODEL WITH ROUNDED AND CLIPPED PREDICTIONS  
rf_pred = best_rf.predict(X_test)
rf_pred = np.round(rf_pred).astype(int)
rf_pred = np.clip(rf_pred, 1, 5) 

rf_mae = mean_absolute_error(y_test, rf_pred)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))

print(f"Random Forest Regressor → MAE: {rf_mae}, RMSE: {rf_rmse}")

Random Forest Regressor → MAE: 0.6719251842055545, RMSE: 0.9318857810976099


In [14]:
# PERFORMING RANDOMIZED SEARCH TO TUNE XGBOOST HYPERPARAMETERS AND SELECT THE BEST MODEL  
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 10],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9]
}

grid_search = GridSearchCV(XGBRegressor(random_state=42), param_grid, cv=5, scoring="r2", n_jobs=-1)
grid_search.fit(X_train, y_train)

best_xgb = grid_search.best_estimator_

# EVALUATING THE TUNED XGBOOST MODEL USING MAE AND RMSE  
# Predictions
xgb_pred = best_xgb.predict(X_test)
xgb_pred = np.round(xgb_pred).astype(int)
xgb_pred = np.clip(xgb_pred, 1, 5)

# Evaluation Metrics
xgb_mae = mean_absolute_error(y_test, xgb_pred)
xgb_rmse = np.sqrt(mean_squared_error(y_test, xgb_pred))

print(f"Optimized XGBoost → MAE: {xgb_mae:.2f}, RMSE: {xgb_rmse:.2f}")

Optimized XGBoost → MAE: 0.67, RMSE: 0.93


In [16]:
# Saving model
joblib.dump(target_enc,'Target Encoder Model(regression).pkl')
print("Target encoded model saved")

joblib.dump(ohe,'One-Hot Endcoder Model(regression).pkl')
print("One-Hot Encoded model saved")

joblib.dump(scaler,'Scaler(regression).pkl')
print("Scaler has been saved")

joblib.dump(best_xgb,'XGBoost model(regression).pkl')
print("Best XGBoost Model has been saved")

Target encoded model saved
One-Hot Encoded model saved
Scaler has been saved
Best XGBoost Model has been saved


In [1]:
import xgboost
print(xgboost.__version__)

3.0.2


In [2]:
import xgboost
print(xgboost.__version__)

3.0.2
