In [1]:
#IMPORTING LIBRARIES FOR HANDLING DATAFRAME AND FOR DATA VISUALIZATION
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#LOAD THE TOURISM CSV FILE INTO A DATAFRAME
tourism_csv_path = r"D:\TRANSACTION PROJECT\Full Tourism Data.csv"
Tourism_df = pd.read_csv(tourism_csv_path)

In [3]:
#PREVIEW THE FIRST FIVE ROWS OF THE TABLE
Tourism_df.head(n=5)

Unnamed: 0,TransactionId,UserId,VisitYear,VisitMonth,VisitMode,AttractionId,Rating,ContenentId,RegionId,CountryId,CityId,Contenent,Region,Country,CityName,Attraction,AttractionAddress,AttractionTypeId,AttractionType,VisitModeName
0,3,70456,2022,10,2,640,5,5,21,163,4341,Europe,Western Europe,United Kingdom,Guildford,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia",63,Nature & Wildlife Areas,Couples
1,8,7567,2022,10,4,640,5,2,8,48,464,America,Northern America,Canada,Ontario,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia",63,Nature & Wildlife Areas,Friends
2,9,79069,2022,10,3,640,5,2,9,54,774,America,South America,Brazil,Brazil,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia",63,Nature & Wildlife Areas,Family
3,10,31019,2022,10,3,640,3,5,17,135,583,Europe,Central Europe,Switzerland,Zurich,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia",63,Nature & Wildlife Areas,Family
4,15,43611,2022,10,2,640,3,5,21,163,1396,Europe,Western Europe,United Kingdom,Manchester,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia",63,Nature & Wildlife Areas,Couples


In [None]:
#COUNT THE ROWS THAT REPEAT AN EARLIER ROW IN EVERY COLUMN
is_duplicate_row = Tourism_df.duplicated()
is_duplicate_row.sum()

In [None]:
#COUNT THE MISSING VALUES IN EACH COLUMN
Tourism_df.isnull().sum()

In [None]:
#IMPORTING LIBRARIES FOR CONVERTING THE CATEGORICAL TO NUMERIC AND BALANCING THE DATA
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import pandas as pd

In [None]:
#CONVERTING THE CATEGORICAL TO NUMERIC AND BALANCING THE DATA
#TARGET: THE VISIT MODE LABEL (E.G. Couples / Friends / Family)
y = Tourism_df['VisitModeName']

#COLUMNS KEPT OUT OF THE FEATURE MATRIX.
#'VisitMode' IS THE NUMERIC CODE OF THE TARGET (IN THE PREVIEW ROWS 2=Couples,
#3=Family, 4=Friends), SO LEAVING IT IN X LEAKS THE LABEL INTO THE FEATURES.
excluded_columns = ['TransactionId', 'VisitMode', 'Rating', 'ContenentId', 'RegionId',
                    'CountryId', 'CityId', 'AttractionAddress',
                    'AttractionTypeId', 'VisitModeName']
#'Unnamed: 0' LOOKS LIKE THE ROW COUNTER SAVED WITH THE CSV, NOT A REAL FEATURE;
#GUARDED SO THE CELL STILL RUNS IF THE FILE IS EVER READ WITH index_col=0
if 'Unnamed: 0' in Tourism_df.columns:
    excluded_columns.append('Unnamed: 0')
X = Tourism_df.drop(columns=excluded_columns)

#ONE-HOT ENCODE THE REMAINING TEXT COLUMNS; drop_first REMOVES ONE REDUNDANT
#DUMMY COLUMN PER CATEGORICAL FEATURE
X = pd.get_dummies(X, drop_first=True)

#MAP THE CLASS NAMES TO INTEGER CODES; label_encoder.classes_ MAPS THEM BACK
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

#OVERSAMPLE THE MINORITY CLASSES SO EVERY CLASS HAS THE SAME NUMBER OF ROWS
#NOTE(review): RESAMPLING HAPPENS BEFORE THE TRAIN/TEST SPLIT, SO SYNTHETIC ROWS
#DERIVED FROM TRAINING ROWS CAN END UP IN THE TEST SET AND INFLATE THE SCORES.
#PREFER SPLITTING FIRST AND RESAMPLING ONLY THE TRAINING PART.
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

#KEEP THE RESAMPLED LABELS AS A ONE-COLUMN FRAME AND JOIN THEM TO THE FEATURES
y_smote = pd.DataFrame(y_smote, columns=['VisitModeName'])
balanced_tourism_data = pd.concat([X_smote, y_smote], axis=1)

#CLASS COUNTS BEFORE AND AFTER RESAMPLING
print("Before SMOTE:")
print(pd.Series(y).value_counts())

print("\nAfter SMOTE:")
print(y_smote['VisitModeName'].value_counts())


In [None]:
#IMPORTING LIBRARIES FOR TRAINING AND EVALUATE THE MODEL
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

In [None]:
#SPLITTING THE DATA FOR TRAINING AND TESTING
#FLATTEN THE LABELS TO 1-D FIRST: A ONE-COLUMN FRAME REACHES THE ESTIMATORS AS A
#COLUMN VECTOR, WHICH MAKES RandomForestClassifier.fit EMIT DataConversionWarning
y_labels = np.ravel(y_smote)
#80/20 SPLIT; stratify KEEPS THE CLASS PROPORTIONS THE SAME IN BOTH PARTS
X_train, X_test, y_train, y_test = train_test_split(
    X_smote, y_labels, test_size=0.2, random_state=42, stratify=y_labels)

In [None]:
#FIT A DECISION TREE (DEPTH CAPPED AT 5) ON THE TRAINING SPLIT
dt_model = DecisionTreeClassifier(random_state=42, max_depth=5).fit(X_train, y_train)
dt_model

In [None]:
#PREDICT THE CLASS CODES FOR THE TEST FEATURES WITH THE DECISION TREE AND PRINT THEM
dt_pred = dt_model.predict(X_test)
print(dt_pred)

In [None]:
#EVALUATE THE DECISION TREE MODEL
#REPORTS THE SAME FOUR METRICS, IN THE SAME LAYOUT, AS THE XGBOOST EVALUATION SO
#THE MODELS CAN BE COMPARED LINE BY LINE. average='weighted' AVERAGES THE
#PER-CLASS SCORES WEIGHTED BY HOW MANY TRUE ROWS EACH CLASS HAS.
accuracy = accuracy_score(y_test, dt_pred)
precision = precision_score(y_test, dt_pred, average='weighted')
recall = recall_score(y_test, dt_pred, average='weighted')
f1 = f1_score(y_test, dt_pred, average='weighted')

print("\nDecision Tree Performance:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

In [None]:
#IMPORTING JOBLIB TO SAVE THE MODEL
import joblib

In [None]:
#WRITE THE FITTED DECISION TREE TO DISK
dt_model_path = r"D:\TRANSACTION PROJECT\dc_model.pkl"
joblib.dump(dt_model, dt_model_path)

['D:\\TRANSACTION PROJECT\\dc_model.pkl']

In [None]:
#FIT A RANDOM FOREST (100 TREES, DEPTH CAPPED AT 5) ON THE TRAINING SPLIT
rf_model = RandomForestClassifier(max_depth=5, n_estimators=100, random_state=42).fit(X_train, y_train)
rf_model

In [None]:
#PREDICT THE CLASS CODES FOR THE TEST FEATURES WITH THE RANDOM FOREST;
#THE BARE NAME ON THE LAST LINE SHOWS THE PREDICTIONS AS THE CELL OUTPUT
rf_pred = rf_model.predict(X_test)
rf_pred

In [None]:
#EVALUATE THE RANDOM FOREST MODEL
#REPORTS THE SAME FOUR METRICS, IN THE SAME LAYOUT, AS THE XGBOOST EVALUATION SO
#THE MODELS CAN BE COMPARED LINE BY LINE. average='weighted' AVERAGES THE
#PER-CLASS SCORES WEIGHTED BY HOW MANY TRUE ROWS EACH CLASS HAS.
accuracy = accuracy_score(y_test, rf_pred)
precision = precision_score(y_test, rf_pred, average='weighted')
recall = recall_score(y_test, rf_pred, average='weighted')
f1 = f1_score(y_test, rf_pred, average='weighted')

print("\nRandom Forest Performance:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

In [None]:

#FIT AN XGBOOST CLASSIFIER ON THE TRAINING SPLIT, THEN PREDICT THE TEST SPLIT
xgb_params = dict(n_estimators=50, max_depth=3, learning_rate=0.2, random_state=42)
xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)

In [None]:
#EVALUATE THE XGBOOST MODEL
def evaluate_model(model_name, y_true, y_pred):
    """Print accuracy and weighted precision, recall and F1 for one model.

    Parameters
    ----------
    model_name : text shown in the report header line.
    y_true : true labels.
    y_pred : predicted labels to score against ``y_true``.

    Returns
    -------
    None. The report is written to standard output.
    """
    print(f"\n{model_name} Performance:")

    # Each score is computed right before its line is printed.
    acc = accuracy_score(y_true, y_pred)
    print(f"Accuracy: {acc:.4f}")

    prec = precision_score(y_true, y_pred, average='weighted')
    print(f"Precision: {prec:.4f}")

    rec = recall_score(y_true, y_pred, average='weighted')
    print(f"Recall: {rec:.4f}")

    f1_weighted = f1_score(y_true, y_pred, average='weighted')
    print(f"F1-Score: {f1_weighted:.4f}")


evaluate_model("XGBoost", y_test, xgb_pred)