In [None]:
import pandas as pd

# Load the raw dataset; the path is relative to the notebook's working directory.
raw_data = pd.read_csv('../data/tourism_dynamic_routes_1345.csv')
print(f'The raw_data.shape is {raw_data.shape}')
# Use the notebook's rich table rendering for the preview instead of the
# plain-text repr that print() produces.
display(raw_data.head())

In [None]:
# Check column types and missing values
print("=== Raw Data Info ===")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
raw_data.info()

# Per-column count of missing entries.
print("\n=== Missing Values ===")
print(raw_data.isnull().sum())

# Summary statistics as computed by DataFrame.describe() with default options.
print("\n=== Descriptive Statistics ===")
print(raw_data.describe())

In [None]:
# Handle missing values
# Build the working frame as a new object so raw_data stays untouched; missing
# Event_Impact entries are replaced by the literal string 'None'.
# NOTE(review): presumably the CSV stores the text "None", which read_csv may
# parse as NaN by default - confirm against the data file.
df = raw_data.assign(Event_Impact=raw_data['Event_Impact'].fillna('None'))

In [None]:
# Prediction target column.
y = df['Preferred_Theme']

# Input columns used as model features; .copy() gives X its own data so later
# edits to X do not write back into df.
feature_columns = ['Age', 'Gender', 'Nationality', 'Travel_Companions', 'Budget_Category']
X = df.loc[:, feature_columns].copy()

print("The first 5 rows of features:")
display(X.head())

In [None]:
# List the distinct values present in the Travel_Companions feature.
travel_companion_values = X['Travel_Companions'].unique()
print(travel_companion_values)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler

# Map Budget_Category to numerical values (Low < Medium < High).
# The mapping reads from `df` rather than from `X` itself so the cell is
# idempotent: re-running it never maps the already-encoded column a second
# time, which would silently turn every value into NaN.
# NOTE(review): this mapping happens outside the ColumnTransformer, so it is
# not part of the fitted `preprocessor`; any code that reuses the preprocessor
# must apply the same budget_map first.
budget_map = {'Low': 0, 'Medium': 1, 'High': 2}
X['Budget_Category'] = df['Budget_Category'].map(budget_map)

# Series.map yields NaN for any value that is not a key of budget_map; surface
# that instead of letting it pass unnoticed into the scaler.
n_unmapped_budget = int(X['Budget_Category'].isna().sum())
if n_unmapped_budget:
    print(f'Warning: {n_unmapped_budget} rows have a missing or unrecognised Budget_Category')

# Encode target variable as integer class ids; `le` is kept so the ids can be
# translated back to the original labels.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Fixed random_state so the train/test partition (and therefore the trained
# model and its evaluation report) is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, shuffle=True, random_state=42
)

# Define feature lists
onehot_features = ['Gender', 'Nationality', 'Travel_Companions']
numeric_features = ['Age', 'Budget_Category']

# Create the column transformer: standardise the numeric columns and one-hot
# encode the categorical ones. handle_unknown='ignore' makes categories not
# seen during fit encode as all zeros instead of raising.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), onehot_features)
    ]
)

# Fit on the training split only, then apply the fitted transform to the test
# split so no test-set statistics leak into preprocessing.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)
print(f'X_train_processed shape: {X_train_processed.shape}')


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, auc, roc_curve

# Random forest with a fixed seed; class_weight='balanced' weights classes
# inversely to their frequency in the training labels.
rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')

rf.fit(X_train_processed, y_train)

y_pred = rf.predict(X_test_processed)

# Pass the complete list of encoded class ids explicitly. Without `labels`,
# classification_report infers the classes from y_test/y_pred and raises when
# a class is absent from both, because target_names would then have a
# different length. LabelEncoder assigns ids 0..n_classes-1.
all_class_ids = list(range(len(le.classes_)))
print(classification_report(y_test, y_pred, labels=all_class_ids, target_names=le.classes_))

In [None]:
import joblib
import os

# Make sure the output directory exists before writing any artifact.
os.makedirs('../../ml_logic/models/theme_predict/', exist_ok=True)

# Persist the fitted feature preprocessor, the trained classifier and the
# target label encoder, in that order.
for artifact, artifact_path in (
    (preprocessor, '../../ml_logic/models/theme_predict/theme_preprocessor.pkl'),
    (rf, '../../ml_logic/models/theme_predict/theme_rf_model.pkl'),
    (le, '../../ml_logic/models/theme_predict/theme_label_encoder.pkl'),
):
    joblib.dump(artifact, artifact_path)

print("Theme predict components are saved successfully!")

In [None]:
# List the distinct values present in the target column.
theme_values = y.unique()
print(theme_values)

In [None]:
def classify_travel_companion(babies, children, adults, olders):
    """Label a travelling party from its head count per age band.

    Args:
        babies: Number of babies in the party.
        children: Number of children in the party.
        adults: Number of adults in the party.
        olders: Number of older travellers in the party.

    Returns:
        One of the strings 'Solo', 'Family', 'Group' or 'Friends'. The rules
        are checked in this order:

        1. 'Solo'    - the party has exactly one member in total.
        2. 'Family'  - at least one adult plus at least one baby, child or
                       older traveller (regardless of party size).
        3. 'Group'   - more than 6 members in total.
        4. 'Friends' - everything else.

    NOTE(review): a party with a total of zero falls through to 'Friends';
    confirm whether callers can ever pass all-zero counts.
    """
    headcount = sum((adults, children, babies, olders))

    # A single traveller of any age band.
    if headcount == 1:
        return 'Solo'

    # An adult accompanied by someone from another age band takes precedence
    # over the size-based labels below.
    if adults >= 1 and any(count > 0 for count in (babies, children, olders)):
        return 'Family'

    # Remaining parties are split purely by size.
    return 'Group' if headcount > 6 else 'Friends'