In [1]:
#IMPORTING LIBRARIES FOR HANDLING DATAFRAMES AND NUMERICAL ARRAYS
import pandas as pd 
import numpy as np

In [2]:
#LOAD THE RAW TOURISM DATASET INTO A DATAFRAME
# Absolute Windows-style location of the CSV export used by this notebook.
tourism_csv_path = r"D:\TRANSACTION PROJECT\Full Tourism Data.csv"
Tourism_df = pd.read_csv(tourism_csv_path)

In [None]:
#PREVIEW THE FIRST FIVE ROWS OF THE LOADED TABLE
Tourism_df.head(n=5)

In [None]:
#COUNT MISSING VALUES PER COLUMN
Tourism_df.isnull().sum(axis=0)

In [None]:
#COUNT ROWS THAT REPEAT AN EARLIER ROW EXACTLY
Tourism_df.duplicated(keep="first").sum()

In [None]:
#IMPORTING LIBRARIES FOR TRAINING THE MODEL
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

In [None]:
#IMPORTING LIBRARIES FOR SPLITTING THE DATA AND EVALUATING THE MODEL
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import r2_score

In [None]:
#INSTALLING THE ENCODER FOR ENCODING
pip install category_encoders

In [None]:
#IMPORTING LIBRARIES FOR ENCODING 
from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [7]:
#COLUMNS USED AS MODEL INPUTS
# Grouped by theme; the concatenation keeps the column order used downstream.
selected_features = (
    ["VisitYear", "VisitMonth", "VisitModeName"]          # visit details
    + ["AttractionId", "Attraction", "AttractionType"]    # attraction details
    + ["CountryId", "RegionId"]                           # location ids
)


In [10]:
#CONVERTING CATEGORICAL DATA TO NUMERIC DATA
# "VisitModeName" and "AttractionType" are one-hot encoded; "Attraction" is
# target encoded against "Rating".
categorical_features = ["VisitModeName", "AttractionType"]

# Work on a copy so the encoding steps never modify Tourism_df itself.
df_selected = Tourism_df[selected_features + ["Rating"]].copy()

ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
encoded_features = ohe.fit_transform(df_selected[categorical_features])
# Reuse df_selected's index: pd.concat(axis=1) aligns on index labels, so a
# fresh 0..n-1 index here would misalign rows (and introduce NaNs) whenever
# df_selected does not carry a default RangeIndex.
encoded_df = pd.DataFrame(
    encoded_features,
    columns=ohe.get_feature_names_out(categorical_features),
    index=df_selected.index,
)

# Cast any boolean columns to 0/1 integers.
bool_cols = df_selected.select_dtypes(include=["bool"]).columns
df_selected[bool_cols] = df_selected[bool_cols].astype(int)

# NOTE(review): the target encoder is fitted on every row, including rows that
# later end up in the test split, so test-set ratings leak into this feature.
# Consider fitting it on the training rows only - confirm and restructure.
target_enc = TargetEncoder()
df_selected["Attraction"] = target_enc.fit_transform(df_selected["Attraction"], df_selected["Rating"])

# Swap the raw categorical columns for their one-hot encoded counterparts.
df_selected = df_selected.drop(columns=categorical_features)
df_selected = pd.concat([df_selected, encoded_df], axis=1)


In [11]:
#SEPARATE THE TARGET (y) FROM THE FEATURE MATRIX (X)
y = df_selected["Rating"]
X = df_selected.drop(columns="Rating")

In [None]:
#PREVIEW THE ENCODED FEATURE MATRIX
X.head(n=5)

In [13]:
#STANDARDIZE EVERY FEATURE COLUMN
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so its statistics include the test rows - confirm whether that is intended.
scaler = StandardScaler()
scaler.fit(X)
x = scaler.transform(X)

In [14]:
#HOLD OUT 20% OF THE ROWS FOR TESTING (FIXED SEED FOR A REPEATABLE SPLIT)
split_parts = train_test_split(x, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
#UPDATE THE SKLEARN LIBRARY
pip install --upgrade xgboost scikit-learn


In [None]:
#FIT A DEPTH-LIMITED DECISION TREE AND PREDICT ON THE TEST SET
dt_params = {"max_depth": 5, "random_state": 42}
dt_model = DecisionTreeRegressor(**dt_params)
# fit() returns the fitted estimator, so prediction can be chained onto it.
dt_pred = dt_model.fit(X_train, y_train).predict(X_test)

In [None]:
#SCORE THE DECISION TREE ON THE TEST SET
# Snap predictions to whole numbers, then keep them inside the 1..5 range.
dt_pred = np.clip(np.round(dt_pred).astype(int), 1, 5)

dt_mse = mean_squared_error(y_test, dt_pred)
dt_rmse = np.sqrt(dt_mse)
dt_mae = mean_absolute_error(y_test, dt_pred)

print(f"Decision Tree Regressor → MAE: {dt_mae}, RMSE: {dt_rmse}")

In [None]:
#FIT THE BASELINE RANDOM FOREST (50 SHALLOW TREES)
rf_params = {"n_estimators": 50, "max_depth": 5, "random_state": 42}
rf_model = RandomForestRegressor(**rf_params)
rf_model.fit(X_train, y_train)

In [None]:
#SCORE THE BASELINE RANDOM FOREST ON THE TEST SET
# Snap predictions to whole numbers, then keep them inside the 1..5 range.
rf_pred = np.clip(np.round(rf_model.predict(X_test)).astype(int), 1, 5)

rf_mse = mean_squared_error(y_test, rf_pred)
rf_rmse = np.sqrt(rf_mse)
rf_mae = mean_absolute_error(y_test, rf_pred)

print(f"Random Forest Regressor → MAE: {rf_mae}, RMSE: {rf_rmse}")

In [None]:
#TUNE THE RANDOM FOREST WITH A RANDOMIZED HYPERPARAMETER SEARCH

rf = RandomForestRegressor(random_state=42)

# Candidate values sampled by the search.
param_dist = dict(
    n_estimators=[50, 100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

# 20 sampled configurations, 5-fold CV each, ranked by (negated) MAE,
# using all available cores.
rf_random = RandomizedSearchCV(
    rf,
    param_dist,
    n_iter=20,
    cv=5,
    scoring='neg_mean_absolute_error',
    random_state=42,
    n_jobs=-1,
)
rf_random.fit(X_train, y_train)

best_rf = rf_random.best_estimator_

In [None]:
#EVALUATING THE TUNED RF MODEL
# Results are stored under tuned-specific names so the baseline random forest's
# rf_pred / rf_mae / rf_rmse stay available for comparison.
best_rf_pred = best_rf.predict(X_test)
# Snap predictions to whole numbers, then keep them inside the 1..5 range.
best_rf_pred = np.round(best_rf_pred).astype(int)
best_rf_pred = np.clip(best_rf_pred, 1, 5)

best_rf_mae = mean_absolute_error(y_test, best_rf_pred)
best_rf_rmse = np.sqrt(mean_squared_error(y_test, best_rf_pred))

# Labelled "(Tuned)" so this line can be told apart from the baseline output.
print(f"Random Forest Regressor (Tuned) → MAE: {best_rf_mae}, RMSE: {best_rf_rmse}")

In [None]:
#FIT THE BASELINE XGBOOST REGRESSOR
xgb_params = {
    "n_estimators": 100,
    "learning_rate": 0.1,
    "max_depth": 6,
    "random_state": 42,
}
model = XGBRegressor(**xgb_params)
model.fit(X_train, y_train)


In [None]:
#EVALUATE THE XGBOOST MODEL
y_pred_xgb = model.predict(X_test)

# Store results under XGBoost-specific names; reusing dt_pred / dt_mae /
# dt_rmse here would overwrite the decision tree's results.
# Snap predictions to whole numbers, then keep them inside the 1..5 range.
xgb_pred = np.round(y_pred_xgb).astype(int)
xgb_pred = np.clip(xgb_pred, 1, 5)
print(xgb_pred)

xgb_mae = mean_absolute_error(y_test, xgb_pred)
xgb_rmse = np.sqrt(mean_squared_error(y_test, xgb_pred))

print(f"Xgboost Regressor → MAE: {xgb_mae}, RMSE: {xgb_rmse}")

In [15]:
#TRAINING THE XGBOOST WITH RANDOMIZEDSEARCHCV HYPERPARAMETER TUNING
# Candidate values sampled by the search.
param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7, 9],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0]
}

xgb = XGBRegressor(random_state=42)
# random_state fixes which 20 configurations are sampled, so the selected
# "best" model is reproducible across runs (same seed as the RF search).
random_search = RandomizedSearchCV(
    xgb,
    param_distributions=param_grid,
    n_iter=20,
    cv=5,
    scoring='neg_mean_absolute_error',
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

best_xgb = random_search.best_estimator_

In [None]:
#EVALUATE THE TUNED XGBOOST MODEL
y_pred = best_xgb.predict(X_test)

# Error of the raw (continuous) predictions.
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"XGBoost Regressor (Tuned) → MAE: {mae:.4f}, RMSE: {rmse:.4f}")

# The decision tree, random forest and baseline XGBoost cells score predictions
# that were rounded and clipped to 1..5. Report the same variant here so the
# tuned model can be compared with them like for like.
best_xgb_pred = np.clip(np.round(y_pred).astype(int), 1, 5)
best_xgb_mae = mean_absolute_error(y_test, best_xgb_pred)
best_xgb_rmse = np.sqrt(mean_squared_error(y_test, best_xgb_pred))

print(f"XGBoost Regressor (Tuned, rounded) → MAE: {best_xgb_mae:.4f}, RMSE: {best_xgb_rmse:.4f}")

In [None]:
#TRAINING LIGHTGBM MODEL
import lightgbm as lgb
# subsample (row bagging) only takes effect when subsample_freq is non-zero;
# with the default subsample_freq=0 the subsample=0.8 setting is ignored.
# subsample_freq=1 re-samples the rows at every boosting iteration.
lgb_model = lgb.LGBMRegressor(boosting_type='gbdt',
                               n_estimators=200,
                               learning_rate=0.1,
                               max_depth=7,
                               subsample=0.8,
                               subsample_freq=1,
                               colsample_bytree=0.8,
                               random_state=42)

lgb_model.fit(X_train, y_train)

In [None]:
#EVALUATE THE LIGHTGBM MODEL
# Results use LightGBM-specific names so the tuned XGBoost cell's
# y_pred / mae / rmse are not overwritten.
lgb_pred = lgb_model.predict(X_test)

# Error of the raw (continuous) predictions.
lgb_mae = mean_absolute_error(y_test, lgb_pred)
lgb_rmse = np.sqrt(mean_squared_error(y_test, lgb_pred))

print(f"LightGBM Regressor → MAE: {lgb_mae:.4f}, RMSE: {lgb_rmse:.4f}")

# The decision tree, random forest and baseline XGBoost cells score predictions
# that were rounded and clipped to 1..5. Report the same variant here so the
# models can be compared like for like.
lgb_pred_rounded = np.clip(np.round(lgb_pred).astype(int), 1, 5)
lgb_mae_rounded = mean_absolute_error(y_test, lgb_pred_rounded)
lgb_rmse_rounded = np.sqrt(mean_squared_error(y_test, lgb_pred_rounded))

print(f"LightGBM Regressor (rounded) → MAE: {lgb_mae_rounded:.4f}, RMSE: {lgb_rmse_rounded:.4f}")

In [17]:
#IMPORTING JOBLIB TO SAVE THE MODEL
import joblib

In [None]:
# PERSIST THE FITTED TARGET ENCODER TO DISK
target_encoder_path = r"D:\TRANSACTION PROJECT\TARGET ENCODING RATING.pkl"
joblib.dump(target_enc, target_encoder_path)


In [None]:
#PERSIST THE FITTED STANDARD SCALER TO DISK
scaler_path = r"D:\TRANSACTION PROJECT\STANDARD SCALAR RATING.pkl"
joblib.dump(scaler, scaler_path)

In [None]:
#PERSIST THE FITTED ONE-HOT ENCODER TO DISK
one_hot_encoder_path = r"D:\TRANSACTION PROJECT\ONE HOT ENCODING RATING.pkl"
joblib.dump(ohe, one_hot_encoder_path)

In [None]:
#PERSIST THE TUNED XGBOOST MODEL TO DISK
best_model_path = r"D:\TRANSACTION PROJECT\BEST MODEL FOR TOURISM.pkl"
joblib.dump(best_xgb, best_model_path)