# SL-VOYAGER: Tourist Attraction Recommendation System

In [1]:
import pandas as pd
import numpy as np
import requests
from datetime import datetime
import ast
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, accuracy_score
import joblib

# --- Constants and Setup ---
WEATHER_API_KEY = '714502160807c4d7a00552387f3748f7'  # Replace with your OpenWeatherMap API key

# Month mapping for parsing Best Weather
month_map = {
    'january': 1, 'february': 2, 'march': 3, 'april': 4, 'may': 5, 'june': 6,
    'july': 7, 'august': 8, 'september': 9, 'october': 10, 'november': 11, 'december': 12,
    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
}

In [2]:
# Season detection for Sri Lanka
def get_current_season(month=None):
    if month is None:
        month = datetime.now().month
    if month in [1, 2, 3]:
        return 'Dry Season'
    elif month in [4, 5]:
        return 'First Inter-Monsoon'
    elif month in [6, 7, 8, 9]:
        return 'Southwest Monsoon'
    else:  # 10, 11, 12
        return 'Second Inter-Monsoon'

# Parse Best Weather to list of months
def parse_best_weather(weather):
    if pd.isna(weather) or weather.lower() == 'all year':
        return list(range(1, 13))
    weather = weather.lower().replace(',', ' ').split()
    months = [month_map.get(w, None) for w in weather if month_map.get(w, None)]
    return months if months else [1]

# Parse latitude/longitude
def parse_lat_lon(coord):
    try:
        return float(str(coord).split('°')[0])
    except:
        return None

## Data Preprocessing

In [3]:
# Load dataset
try:
    df = pd.read_csv('final_attractions.csv', encoding='latin')
except FileNotFoundError:
    raise FileNotFoundError("Please ensure 'final_attractions.csv' exists.")

# Verify columns
expected_columns = ['Name', 'Best Weather', 'Traveler Type', 'Activity Type', 'Historic Term', 'Latitude', 'Longitude', 'Description']
if not all(col in df.columns for col in expected_columns):
    raise ValueError(f"Dataset must contain: {expected_columns}")

# Handle missing values
df['Historic Term'] = df['Historic Term'].fillna('Not Applicable')
df['Description'] = df['Description'].fillna('Description not available.')

# Parse coordinates
df['Latitude'] = df['Latitude'].apply(parse_lat_lon)
df['Longitude'] = df['Longitude'].apply(parse_lat_lon)

# Parse Best Weather
df['Best Months'] = df['Best Weather'].apply(parse_best_weather)

# Generate Seasons
df['Seasons'] = df['Best Months'].apply(lambda months: list(set([get_current_season(m) for m in months])))

# Split Traveler Type and Activity Type into lists
df['Traveler Type'] = df['Traveler Type'].apply(lambda x: [t.strip() for t in str(x).split(',')])
df['Activity Type'] = df['Activity Type'].apply(lambda x: [a.strip() for a in str(x).split(',')])

# Save preprocessed data
df.to_csv('Places.csv', index=False)
print(f"Places.csv created with {len(df)} attractions")

Places.csv created with 79 attractions


## Model Training

In [4]:
# Load preprocessed data
data = pd.read_csv('Places.csv')
data['Best Months'] = data['Best Months'].apply(ast.literal_eval)
data['Seasons'] = data['Seasons'].apply(ast.literal_eval)
data['Traveler Type'] = data['Traveler Type'].apply(ast.literal_eval)
data['Activity Type'] = data['Activity Type'].apply(ast.literal_eval)

# Initialize encoders
traveler_mlb = MultiLabelBinarizer()
activity_mlb = MultiLabelBinarizer()

# Fit encoders
traveler_mlb.fit(data['Traveler Type'])
activity_mlb.fit(data['Activity Type'])

# Encode features
data['Traveler Type Vec'] = list(traveler_mlb.transform(data['Traveler Type']))
data['Activity Type Vec'] = list(activity_mlb.transform(data['Activity Type']))

# Generate synthetic training data
def score_attraction(attraction, tt, at, month):
    score = 0
    score += sum(t in attraction['Traveler Type'] for t in tt) / len(tt) * 0.4  # Weight traveler type
    score += sum(a in attraction['Activity Type'] for a in at) / len(at) * 0.4  # Weight activity type
    score += 1 if month in attraction['Best Months'] or attraction['Best Months'] == list(range(1, 13)) else 0  # Bonus for matching month
    return score * 0.2  # Normalize to [0,1]

training_data = []
for _, attr in data.iterrows():
    for _ in range(100):  # Increased to 100 samples per attraction for better training
        tt = np.random.choice(traveler_mlb.classes_, np.random.randint(1, 3), replace=False)
        at = np.random.choice(activity_mlb.classes_, np.random.randint(1, 3), replace=False)
        month = np.random.randint(1, 13)
        score = score_attraction(attr, tt, at, month)
        training_data.append((tt, at, month, attr['Name'], score))

# Prepare features
X_rf, y_rf, X_knn, y_knn = [], [], [], []
for tt, at, month, name, score in training_data:
    tt_vec = traveler_mlb.transform([tt])[0]
    at_vec = activity_mlb.transform([at])[0]
    month_vec = np.zeros(12)
    month_vec[month-1] = 1
    user_vec = np.concatenate([tt_vec, at_vec, month_vec])
    attr_vec = np.concatenate([
        data.loc[data['Name'] == name, 'Traveler Type Vec'].iloc[0],
        data.loc[data['Name'] == name, 'Activity Type Vec'].iloc[0]
    ])
    X_rf.append(np.concatenate([user_vec, attr_vec]))
    y_rf.append(score)
    X_knn.append(user_vec)
    y_knn.append(name)

# Train models
rf_model = RandomForestRegressor(n_estimators=50, random_state=42)
knn_model = KNeighborsClassifier(n_neighbors=3)

# Split data
X_rf_train, X_rf_test, y_rf_train, y_rf_test = train_test_split(X_rf, y_rf, test_size=0.2, random_state=42)
X_knn_train, X_knn_test, y_knn_train, y_knn_test = train_test_split(X_knn, y_knn, test_size=0.2, random_state=42)

# Train and evaluate with cross-validation
rf_model.fit(X_rf_train, y_rf_train)
knn_model.fit(X_knn_train, y_knn_train)

rf_pred = rf_model.predict(X_rf_test)
knn_pred = knn_model.predict(X_knn_test)
print(f"Random Forest MSE: {mean_squared_error(y_rf_test, rf_pred):.4f}")
rf_cv_scores = cross_val_score(rf_model, X_rf, y_rf, cv=5, scoring='neg_mean_squared_error')
print(f"Random Forest Cross-validated MSE: {-rf_cv_scores.mean():.4f} (+/- {rf_cv_scores.std() * 2:.4f})")
print(f"KNN Accuracy: {accuracy_score(y_knn_test, knn_pred):.2f}")

# Optional: Hyperparameter tuning (uncomment to use)

from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [20, 50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}
grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_rf, y_rf)
print(f"Best parameters: {grid_search.best_params_}")
rf_model = grid_search.best_estimator_


# Save models and encoders
joblib.dump({
    'rf_model': rf_model,
    'knn_model': knn_model,
    'traveler_mlb': traveler_mlb,
    'activity_mlb': activity_mlb
}, 'model_data.pkl')
print("Model and encoders saved as model_data.pkl")

Random Forest MSE: 0.0004
Random Forest Cross-validated MSE: 0.0043 (+/- 0.0071)
KNN Accuracy: 0.01
Best parameters: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 20}
Model and encoders saved as model_data.pkl
