In [9]:
#IMPORTING LIBRARIES FOR HANDLING DATAFRAMES AND NUMERICAL ARRAYS
import pandas as pd 
import numpy as np

In [10]:
#LOAD THE FULL TOURISM DATASET FROM ITS CSV FILE INTO A DATAFRAME
tourism_csv_path = r"D:\TRANSACTION PROJECT\Full Tourism Data.csv"
Tourism_df = pd.read_csv(tourism_csv_path)

In [None]:
#PREVIEW THE FIRST FIVE ROWS OF THE DATAFRAME
Tourism_df.head(n=5)

In [None]:
#COUNT ROWS THAT REPEAT AN EARLIER ROW IN EVERY COLUMN
duplicate_row_mask = Tourism_df.duplicated()
duplicate_row_mask.sum()

In [None]:
#COUNT MISSING VALUES IN EACH COLUMN
missing_value_mask = Tourism_df.isna()
missing_value_mask.sum()

In [None]:
#COUNT DISTINCT VALUES IN EACH COLUMN
distinct_value_counts = Tourism_df.nunique()
distinct_value_counts

In [None]:
#IMPORTING LIBRARIES FOR CONVERTING THE CATEGORICAL TO NUMERIC AND BALANCING THE DATA
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder, LabelEncoder,StandardScaler
from category_encoders import TargetEncoder

In [None]:
#CONVERTING CATEGORICAL DATA TO NUMERIC AND BALANCING THE DATA
# NOTE(review): "VisitMode" is used as a feature while "VisitModeName" is the
# target. If VisitMode is simply the id of VisitModeName this is target leakage
# and the column should be dropped from the features - confirm against the
# dataset schema before trusting the scores.
selected_features = ["UserId", "VisitYear", "VisitMonth", "VisitMode", "AttractionId",
                     "ContenentId", "RegionId", "Attraction", "AttractionType", "AttractionTypeId"]
categorical_features = ["VisitMode", "AttractionType"]

# Start from a fresh positional index (0..n-1) so that X, y and the one-hot
# frame line up row-for-row even if Tourism_df carries a non-default index.
X = Tourism_df[selected_features].reset_index(drop=True)

# Encode the class names as integer labels, indexed exactly like X.
label_encoder = LabelEncoder()
y = pd.Series(label_encoder.fit_transform(Tourism_df["VisitModeName"]), index=X.index)

# One-hot encode the low-cardinality categorical columns.
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
encoded_features = ohe.fit_transform(X[categorical_features])
encoded_df = pd.DataFrame(
    encoded_features,
    columns=ohe.get_feature_names_out(categorical_features),
    index=X.index,
)

# Target-encode the attraction name.
# NOTE(review): the encoder is fitted on every row (including rows that later
# become test data) against integer-coded class labels - confirm this is intended.
target_enc = TargetEncoder()
X["Attraction"] = target_enc.fit_transform(X["Attraction"], y)

# Cast any boolean columns to 0/1 integers.
bool_cols = X.select_dtypes(include=["bool"]).columns
X[bool_cols] = X[bool_cols].astype(int)

# Swap the raw categorical columns for their one-hot expansion.
X = pd.concat([X.drop(columns=categorical_features), encoded_df], axis=1)

# Oversample the minority classes.
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

# Keep the target 1-D and name it explicitly. Building a DataFrame from a Series
# with `columns=[...]` only works while the Series is unnamed (a differently
# named Series yields an all-NaN column), and a single-column frame is a
# column-vector target when it is later handed to the estimators.
y_smote = pd.Series(y_smote, name="VisitModeName")
balanced_tourism_data = pd.concat([X_smote, y_smote], axis=1)

print("Before SMOTE:")
print(y.value_counts())
print("\nAfter SMOTE:")
print(y_smote.value_counts())

In [17]:
#IMPORTING THE FUNCTION FOR SPLITTING THE DATA INTO TRAIN AND TEST SETS
from sklearn.model_selection import train_test_split

In [18]:
#IMPORTING LIBRARIES FOR TRAINING
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost
from xgboost import XGBClassifier

In [19]:
#IMPORTING METRICS FOR EVALUATING THE MODELS
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

In [20]:
#SPLITTING THE DATA FOR TRAINING AND TESTING
# Split the original (un-resampled) X, y first and oversample ONLY the training
# fold. Splitting after SMOTE places synthetic rows - interpolated from rows that
# end up in the training set - into the test set, which inflates every metric.
# `stratify=y` keeps the class proportions of the held-out data the same as in
# the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Balance the training fold; the test fold keeps its original class distribution.
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

In [None]:
#FIT A DEPTH-LIMITED DECISION TREE CLASSIFIER ON THE TRAINING SPLIT
dt_params = {"max_depth": 5, "random_state": 42}
dt_model = DecisionTreeClassifier(**dt_params)
dt_model.fit(X_train, y_train)

In [None]:
#PREDICT THE TEST DATA WITH THE FITTED DECISION TREE
dt_pred = dt_model.predict(X_test)
# Printed explicitly, so the cell output is the plain-text form of the predictions.
print(dt_pred)

In [None]:
#EVALUATE THE DECISION TREE MODEL
# Precision, recall and F1 use weighted averaging: per-class scores are weighted
# by the number of true instances of each class in y_test.
accuracy = accuracy_score(y_test, dt_pred)
precision = precision_score(y_test, dt_pred, average='weighted')
recall = recall_score(y_test, dt_pred, average='weighted')
# F1 is reported for the XGBoost model, so report it here too and compare all
# models on the same set of metrics.
f1 = f1_score(y_test, dt_pred, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

In [None]:
#FIT A RANDOM FOREST CLASSIFIER (100 TREES, DEPTH 5) ON THE TRAINING SPLIT
rf_params = {"n_estimators": 100, "max_depth": 5, "random_state": 42}
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)

In [None]:
#PREDICT THE TEST DATA WITH THE FITTED RANDOM FOREST
rf_pred = rf_model.predict(X_test)
# Bare last expression: the notebook shows the predictions as the cell output.
rf_pred

In [None]:
#EVALUATE THE RANDOM FOREST MODEL
# Precision, recall and F1 use weighted averaging: per-class scores are weighted
# by the number of true instances of each class in y_test.
accuracy = accuracy_score(y_test, rf_pred)
precision = precision_score(y_test, rf_pred, average='weighted')
recall = recall_score(y_test, rf_pred, average='weighted')
# F1 is reported for the XGBoost model, so report it here too and compare all
# models on the same set of metrics.
f1 = f1_score(y_test, rf_pred, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

In [21]:

#FIT THE XGBOOST CLASSIFIER ON THE TRAINING SPLIT, THEN PREDICT THE TEST DATA
xgb_params = {"n_estimators": 50, "max_depth": 3, "learning_rate": 0.2, "random_state": 42}
xgb_model = XGBClassifier(**xgb_params)
# fit() returns the fitted estimator, so prediction can be chained onto it.
xgb_pred = xgb_model.fit(X_train, y_train).predict(X_test)

In [None]:
#HELPER THAT PRINTS THE EVALUATION METRICS OF A MODEL (USED FOR XGBOOST)
def evaluate_model(model_name, y_true, y_pred):
    """Print accuracy and weighted precision, recall and F1 for one model.

    Args:
        model_name: Label shown in the report header.
        y_true: True target values.
        y_pred: Predicted target values, in the same order as ``y_true``.

    Returns:
        None. The scores are printed with four decimal places.
    """
    print(f"\n{model_name} Performance:")
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")

    # These three metrics share the same call signature and averaging mode.
    weighted_metrics = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1-Score", f1_score),
    )
    for metric_label, metric_fn in weighted_metrics:
        print(f"{metric_label}: {metric_fn(y_true, y_pred, average='weighted'):.4f}")

# Report the XGBoost scores on the held-out test split.
evaluate_model(model_name="XGBoost", y_true=y_test, y_pred=xgb_pred)

In [23]:
#IMPORTING JOBLIB TO SAVE THE MODEL
import joblib

In [None]:
#SAVE THE XGBOOST MODEL (TREATED AS THE BEST MODEL) TO DISK
best_model_path = r"D:\TRANSACTION PROJECT\BEST MODEL VISITMODE.pkl"
joblib.dump(xgb_model, best_model_path)

In [None]:
#SAVE THE FITTED ONE-HOT ENCODER TO DISK
ohe_path = r"D:\TRANSACTION PROJECT\ohe_for_visitmode.pkl"
joblib.dump(ohe, ohe_path)

In [None]:
# SAVE THE FITTED LABEL ENCODER TO DISK
label_encoder_path = r"D:\TRANSACTION PROJECT\label_encoding_for_visitmode.pkl"
joblib.dump(label_encoder, label_encoder_path)


In [None]:
# SAVE THE FITTED TARGET ENCODER TO DISK
target_encoder_path = r"D:\TRANSACTION PROJECT\target_encode_for_visitmode.pkl"
joblib.dump(target_enc, target_encoder_path)