In [1]:
# IMPORTING PANDAS AND NUMPY FOR DATA MANIPULATION AND NUMERICAL COMPUTATIONS  
import pandas as pd 
import numpy as np

In [3]:
# LOADING THE TOURISM DATASET FROM A CSV FILE INTO A DATAFRAME  
Tourism_df = pd.read_csv(r"D:\1final ds\final_ds2.csv")

In [None]:
# DISPLAYING THE FIRST FIVE ROWS OF THE TOURISM DATAFRAME  
Tourism_df.head()

In [None]:
# CHECKING FOR MISSING VALUES IN EACH COLUMN OF THE TOURISM DATAFRAME  
Tourism_df.isna().sum()

In [6]:
# COUNTING THE NUMBER OF DUPLICATE ROWS IN THE TOURISM DATAFRAME  
Tourism_df.duplicated().sum()

np.int64(0)

In [None]:
pip install xgboost

In [9]:
# IMPORTING REGRESSION MODELS (XGBOOST, DECISION TREE, RANDOM FOREST) AND RANDOMIZED SEARCH FOR HYPERPARAMETER TUNING  
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

In [11]:
# IMPORTING FUNCTIONS FOR DATA SPLITTING, MODEL EVALUATION METRICS (MAE, MSE), AND OTHER PERFORMANCE MEASURES  
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import r2_score

In [None]:
# INSTALLING THE CATEGORY_ENCODERS LIBRARY FOR ENCODING CATEGORICAL VARIABLES  
pip install category_encoders

In [13]:
# IMPORTING TARGET ENCODER FOR HANDLING CATEGORICAL VARIABLES, ONE-HOT ENCODER FOR CREATING DUMMY VARIABLES,  
# AND STANDARD SCALER FOR FEATURE SCALING  
from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [14]:
# SELECTING RELEVANT FEATURES FOR MODEL TRAINING, INCLUDING VISIT DETAILS, ATTRACTION INFORMATION, AND LOCATION DATA  
selected_features = [
    "VisitYear",        
    "VisitMonth",       
    "VisitModeName",    
    "AttractionId",     
    "Attraction",      
    "AttractionType",  
    "CountryId",        
    "RegionId"          
]


In [15]:

# ENCODING CATEGORICAL FEATURES USING ONE-HOT ENCODING AND TARGET ENCODING  
# CONVERTING BOOLEAN COLUMNS TO INTEGER TYPE FOR COMPATIBILITY  
# DROPPING ORIGINAL CATEGORICAL COLUMNS AFTER ENCODING AND CONCATENATING ENCODED FEATURES  
categorical_features = ["VisitModeName", "AttractionType"]

df_selected = Tourism_df[selected_features + ["Rating"]].copy()

ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
encoded_features = ohe.fit_transform(df_selected[categorical_features])
encoded_df = pd.DataFrame(encoded_features, columns=ohe.get_feature_names_out(categorical_features))

bool_cols = df_selected.select_dtypes(include=["bool"]).columns
df_selected[bool_cols] = df_selected[bool_cols].astype(int)

target_enc = TargetEncoder()
df_selected["Attraction"] = target_enc.fit_transform(df_selected["Attraction"], df_selected["Rating"])

df_selected = df_selected.drop(columns=categorical_features)
df_selected = pd.concat([df_selected, encoded_df], axis=1)


In [16]:
# SEPARATING FEATURES (X) AND TARGET VARIABLE (Y) FOR MODEL TRAINING  
X = df_selected.drop(columns=["Rating"])
y = df_selected["Rating"]

In [None]:
# DISPLAYING THE FIRST FIVE ROWS OF THE FEATURE MATRIX (X)  
X.head()

In [18]:
# APPLYING STANDARD SCALER TO NORMALIZE THE FEATURE MATRIX (X) FOR BETTER MODEL PERFORMANCE  
scaler = StandardScaler()
x = scaler.fit_transform(X)

In [19]:
# SPLITTING THE DATA INTO TRAINING (80%) AND TESTING (20%) SETS WITH A FIXED RANDOM STATE FOR REPRODUCIBILITY  
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [20]:
# TRAINING A DECISION TREE REGRESSOR WITH A MAX DEPTH OF 5 AND MAKING PREDICTIONS ON THE TEST SET  
dt_model = DecisionTreeRegressor(max_depth=5, random_state=42)
dt_model.fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)

In [21]:
# ROUNDING PREDICTIONS TO THE NEAREST INTEGER AND CLIPPING VALUES TO ENSURE THEY FALL WITHIN THE VALID RATING RANGE (1 TO 5)  
# CALCULATING MEAN ABSOLUTE ERROR (MAE) AND ROOT MEAN SQUARED ERROR (RMSE) TO EVALUATE MODEL PERFORMANCE  
dt_pred = np.round(dt_pred).astype(int)
dt_pred = np.clip(dt_pred, 1, 5) 

dt_mae = mean_absolute_error(y_test, dt_pred)
dt_rmse = np.sqrt(mean_squared_error(y_test, dt_pred))

print(f"Decision Tree Regressor → MAE: {dt_mae}, RMSE: {dt_rmse}")

Decision Tree Regressor → MAE: 0.6875118080483658, RMSE: 0.9525885264963009


In [None]:
# TRAINING A RANDOM FOREST REGRESSOR WITH 50 TREES AND A MAX DEPTH OF 5 FOR PREDICTIVE MODELING  
rf_model = RandomForestRegressor(n_estimators=50, max_depth=5, random_state=42)
rf_model.fit(X_train, y_train)

In [None]:
# ROUNDING AND CLIPPING RANDOM FOREST PREDICTIONS TO ENSURE THEY FALL WITHIN THE VALID RATING RANGE (1 TO 5)  
# CALCULATING MAE AND RMSE TO EVALUATE THE PERFORMANCE OF THE RANDOM FOREST REGRESSOR  
rf_pred = rf_model.predict(X_test)
rf_pred = np.round(rf_pred).astype(int)
rf_pred = np.clip(rf_pred, 1, 5) 

rf_mae = mean_absolute_error(y_test, rf_pred)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))

print(f"Random Forest Regressor → MAE: {rf_mae}, RMSE: {rf_rmse}")

In [24]:
# PERFORMING RANDOMIZED SEARCH CROSS-VALIDATION TO FIND THE BEST HYPERPARAMETERS FOR THE RANDOM FOREST REGRESSOR  
# SEARCHING OVER DIFFERENT VALUES FOR NUMBER OF ESTIMATORS, MAX DEPTH, MIN SAMPLES SPLIT, MIN SAMPLES LEAF, AND MAX FEATURES  
# SELECTING THE BEST MODEL BASED ON MINIMIZING MEAN ABSOLUTE ERROR (MAE)  
rf = RandomForestRegressor(random_state=42)

param_dist = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

rf_random = RandomizedSearchCV(rf, param_dist, n_iter=20, cv=5, scoring='neg_mean_absolute_error', random_state=42, n_jobs=-1)
rf_random.fit(X_train, y_train)

best_rf = rf_random.best_estimator_

In [25]:
# EVALUATING THE BEST RANDOM FOREST MODEL WITH ROUNDED AND CLIPPED PREDICTIONS  
rf_pred = best_rf.predict(X_test)
rf_pred = np.round(rf_pred).astype(int)
rf_pred = np.clip(rf_pred, 1, 5) 

rf_mae = mean_absolute_error(y_test, rf_pred)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))

print(f"Random Forest Regressor → MAE: {rf_mae}, RMSE: {rf_rmse}")

Random Forest Regressor → MAE: 0.6868505573398829, RMSE: 0.9480657369624302


In [None]:
# TRAINING AN XGBOOST REGRESSOR WITH 100 ESTIMATORS, LEARNING RATE OF 0.1, AND MAX DEPTH OF 6  
model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
model.fit(X_train, y_train)


In [None]:
# MAKING PREDICTIONS WITH XGBOOST, ROUNDING AND CLIPPING THEM TO VALID RATING RANGE (1 TO 5)  
# CALCULATING MAE AND RMSE TO EVALUATE MODEL PERFORMANCE  
y_pred_xgb = model.predict(X_test)

dt_pred = np.round(y_pred_xgb).astype(int)
dt_pred = np.clip(dt_pred, 1, 5) 
print(dt_pred)

dt_mae = mean_absolute_error(y_test, dt_pred)
dt_rmse = np.sqrt(mean_squared_error(y_test, dt_pred))

print(f"Xgboost Regressor → MAE: {dt_mae}, RMSE: {dt_rmse}")

In [28]:
# PERFORMING RANDOMIZED SEARCH TO TUNE XGBOOST HYPERPARAMETERS AND SELECT THE BEST MODEL  
param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7, 9],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0]
}

xgb = XGBRegressor(random_state=42)
random_search = RandomizedSearchCV(xgb, param_distributions=param_grid, n_iter=20, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
random_search.fit(X_train, y_train)

best_xgb = random_search.best_estimator_

In [29]:
# EVALUATING THE TUNED XGBOOST MODEL USING MAE AND RMSE  
y_pred = best_xgb.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"XGBoost Regressor (Tuned) → MAE: {mae:.3f}, RMSE: {rmse:.3f}")

XGBoost Regressor (Tuned) → MAE: 0.709, RMSE: 0.904


In [31]:
# IMPORTING JOBLIB FOR SAVING AND LOADING MODELS  
import joblib

In [None]:
# SAVING THE TARGET ENCODER MODEL FOR FUTURE USE  
joblib.dump(target_enc, r"D:\1final ds\predrating_target.pkl")


In [None]:
# SAVING THE STANDARD SCALER MODEL FOR FUTURE USE  
joblib.dump(scaler, r"D:\1final ds\predscalar.pkl")

In [None]:
# SAVING THE ONE-HOT ENCODER MODEL FOR FUTURE USE  
joblib.dump(ohe, r"D:\1final ds\preddump.pkl")

In [None]:
# SAVING THE BEST XGBOOST MODEL FOR TOURISM RECOMMENDATION  
joblib.dump(best_xgb, r"D:\1final ds\predxgb.pkl")