In [1]:
import pandas as pd
import numpy as np

In [19]:
dataset_path = r"D:\UOM\L4S1\FYP\Datasets\demographic_filter_data.csv"

df = pd.read_csv(dataset_path, sep=';')
#df = df.drop(columns=['Traveler Type'])

In [11]:
df = df.drop('Traveler Type', axis=1)

In [20]:
df.head()

Unnamed: 0,Age Group,Gender,Country,Preferred Destination Category
0,18-30 years,Male,Sri Lanka,Beaches & Coastal Areas
1,51+ years,Female,USA/Canada,Nature & Wildlife
2,51+ years,Female,Middle East,Adventure & Unique Experiences
3,51+ years,Female,USA/Canada,Adventure & Unique Experiences
4,51+ years,Female,Australia/New Zealand,Nature & Wildlife


In [21]:
df = df.sample(n=5000, random_state=42)

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder

# Separate features and target
X = df.drop(columns=['Preferred Destination Category'])  # Replace with actual target column
y = df['Preferred Destination Category']

le = LabelEncoder()
#X['Gender'] = le.fit_transform(X['Gender'])

oe1 = OrdinalEncoder(categories=[['Solo traveler', 'Traveling with a partner', 'Traveling with friends', 
                                 'Traveling with young kids (under 12)', 
                                 'Traveling with teenagers (12-18)', 
                                 'Traveling with extended family (multi-generational)']])
X['Travel Group'] = oe1.fit_transform(X[['Travel Group']])

# Automatically get unique categories for each column to encode
for col in ['Budget', 'Accommodation', 'Activity Interest', 'Physical Activity Level', 'Experience Level']:
    categories = [sorted(df[col].unique())]
    oe = OrdinalEncoder(categories=categories)
    X[col] = oe.fit_transform(X[[col]])


#X['Country'] = le.fit_transform(X['Country'])

y = le.fit_transform(y)

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder

# Separate features and target
X = df.drop(columns=['Preferred Destination Category'])  # Replace with actual target column
y = df['Preferred Destination Category']

le = LabelEncoder()
X['Gender'] = le.fit_transform(X['Gender'])

oe = OrdinalEncoder(categories=[['18-30 years', '31-50 years', '51+ years']])
X['Age Group'] = oe.fit_transform(X[['Age Group']])

X['Country'] = le.fit_transform(X['Country'])

y = le.fit_transform(y)

In [16]:
# Show mapping between original and encoded 'Travel Group' values
original = df['Budget']
encoded = X['Budget']

mapping_df = pd.DataFrame({'Original': original, 'Encoded': encoded})
mapping_df.drop_duplicates().sort_values('Encoded')

Unnamed: 0,Original,Encoded
51441,Budget/Backpacking,0.0
43197,Luxury,1.0
65022,Mid-range,2.0


In [17]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB


model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params' : {
            'C': [10,20,30],
            'kernel': ['rbf','sigmoid','poly']
        }
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params' : {
            'n_estimators': [10,50,100]
        }
    },
    'gaussian_nb': {
        'model': GaussianNB(),
        'params': {}
    },
    'multinomial_nb': {
        'model': MultinomialNB(),
        'params': {
            'alpha': [0.5, 1.0, 1.5]
        }
    },
    'bernoulli_nb': {
        'model': BernoulliNB(),
        'params': {
            'alpha': [0.5, 1.0, 1.5]
        }
    }
}

In [23]:
from sklearn.model_selection import GridSearchCV

scores = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False, n_jobs=-1)
    clf.fit(X, y)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df

KeyboardInterrupt: 

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans

model_params = {
    'knn': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [3, 5, 7],
            'weights': ['uniform', 'distance'],
            'metric': ['euclidean', 'manhattan']
        }
    },
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params': {
            'C': [10, 20, 30],
            'kernel': ['rbf', 'sigmoid', 'poly']
        }
    }
}

# For supervised models (KNN, SVM)
from sklearn.model_selection import GridSearchCV

scores = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, n_jobs=-1)
    clf.fit(X, y)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

results_df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
print(results_df)

# For KMeans (unsupervised)
kmeans_scores = []
for k in [2, 3, 4, 5, 6]:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X)
    inertia = kmeans.inertia_
    kmeans_scores.append({'n_clusters': k, 'inertia': inertia})

kmeans_df = pd.DataFrame(kmeans_scores)
print(kmeans_df)