In [121]:
import pandas as pd
import numpy as np

In [122]:
raw_data = pd.read_csv('../data/tourism_dynamic_routes_1345.csv')
print(f'The raw_data.shape is {raw_data.shape}')
print(raw_data.head())

The raw_data.shape is (1345, 18)
   Route_ID  User_ID                   Sequence  Total_Duration  Total_Cost  \
0         1       23      42->32->48->20->29->9             466        1541   
1         2      124          7->44->43->50->30             151        3817   
2         3      135   27->3->32->28->2->36->31             559        8087   
3         4       86  38->2->20->19->40->39->29             159        2218   
4         5      137                  27->24->3             466        1388   

  Weather Traffic_Level Crowd_Density Event_Impact   Optimal_Route_Preference  \
0   Sunny        Medium           Low     Festival      9->20->29->32->42->48   
1   Rainy          High          High     Festival          7->30->43->44->50   
2   Rainy           Low          High          NaN   2->3->27->28->31->32->36   
3   Snowy        Medium          High     Festival  2->19->20->29->38->39->40   
4   Snowy           Low        Medium      Holiday                  3->24->27   

   Sa

In [123]:
# Check column types and missing values
print("=== Raw Data Info ===")
print(raw_data.info())

print("\n=== Missing Values ===")
print(raw_data.isnull().sum())

print("\n=== Descriptive Statistics ===")
print(raw_data.describe())

=== Raw Data Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1345 entries, 0 to 1344
Data columns (total 18 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Route_ID                  1345 non-null   int64 
 1   User_ID                   1345 non-null   int64 
 2   Sequence                  1345 non-null   object
 3   Total_Duration            1345 non-null   int64 
 4   Total_Cost                1345 non-null   int64 
 5   Weather                   1345 non-null   object
 6   Traffic_Level             1345 non-null   object
 7   Crowd_Density             1345 non-null   object
 8   Event_Impact              885 non-null    object
 9   Optimal_Route_Preference  1345 non-null   object
 10  Satisfaction_Score        1345 non-null   int64 
 11  Age                       1345 non-null   int64 
 12  Gender                    1345 non-null   object
 13  Nationality               1345 non-null   object
 14  Tr

In [124]:
# Handle missing values
df = raw_data.copy()
df['Event_Impact'] = df['Event_Impact'].fillna('None')

In [125]:
y = df['Preferred_Theme']

X = df[['Age', 'Gender', 'Nationality', 'Travel_Companions', 'Budget_Category']].copy()

print("The first 5 rows of features:")
display(X.head())

The first 5 rows of features:


Unnamed: 0,Age,Gender,Nationality,Travel_Companions,Budget_Category
0,61,Other,USA,Family,Low
1,18,Male,India,Group,High
2,68,Male,USA,Group,Medium
3,38,Female,USA,Friends,Low
4,35,Female,China,Solo,High


In [126]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler

# Map Budget_Category to numerical values
budget_map = {'Low': 0, 'Medium': 1, 'High': 2}
X['Budget_Category'] = X['Budget_Category'].map(budget_map)

# Encode target variable
le = LabelEncoder()
y_encoded = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, shuffle=True
)

# Define feature lists
onehot_features = ['Gender', 'Nationality', 'Travel_Companions']
numeric_features = ['Age', 'Budget_Category']

# Create the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), onehot_features)
    ]
)

X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)
print(f'X_train_processed shape: {X_train_processed.shape}')


X_train_processed shape: (1076, 16)


In [129]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, auc, roc_curve

rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')

rf.fit(X_train_processed, y_train)

y_pred = rf.predict(X_test_processed)

print(classification_report(y_test, y_pred, target_names=le.classes_))

              precision    recall  f1-score   support

   Adventure       1.00      1.00      1.00        34
    Cultural       1.00      1.00      1.00        29
        Food       0.96      0.99      0.97        72
      Nature       1.00      0.93      0.96        42
  Relaxation       0.98      1.00      0.99        45
    Shopping       1.00      1.00      1.00        47

    accuracy                           0.99       269
   macro avg       0.99      0.99      0.99       269
weighted avg       0.99      0.99      0.99       269



In [131]:
print(y.unique())

['Relaxation' 'Shopping' 'Adventure' 'Cultural' 'Food' 'Nature']
