In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
# LOADING THE TOURISM DATASET FROM A CSV FILE INTO A DATAFRAME  
Tourism_df = pd.read_csv('Dataset_Tourism_final.csv')

In [None]:
Tourism_df.head()

Unnamed: 0,TransactionId,UserId,VisitYear,VisitMonth,VisitMode,AttractionId,Rating,ContinentId,RegionId,CountryId,CityId,Continent,Region,Country,CityName,Attraction,AttractionAddress,AttractionTypeId,AttractionType,VisitModeName
0,8,7567,2022,10,4,640,5,2,8,48,464.0,America,Northern America,Canada,Ontario,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia",63,Nature & Wildlife Areas,Friends
1,10,31019,2022,10,3,640,3,5,17,135,583.0,Europe,Central Europe,Switzerland,Zurich,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia",63,Nature & Wildlife Areas,Family
2,23,20977,2022,10,4,640,5,5,21,163,8258.0,Europe,Western Europe,United Kingdom,Edinburgh,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia",63,Nature & Wildlife Areas,Friends
3,28,18655,2022,9,2,640,3,1,4,22,114.0,Africa,Southern Africa,South Africa,Durban,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia",63,Nature & Wildlife Areas,Couples
4,44,2307,2022,9,2,640,4,4,15,109,4303.0,Australia & Oceania,Australia,Australia,Gold Coast,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia",63,Nature & Wildlife Areas,Couples


In [4]:
Tourism_df.duplicated().sum()

0

In [5]:
Tourism_df.isnull().sum()

TransactionId        0
UserId               0
VisitYear            0
VisitMonth           0
VisitMode            0
AttractionId         0
Rating               0
ContinentId          0
RegionId             0
CountryId            0
CityId               8
Continent            0
Region               0
Country              0
CityName             0
Attraction           0
AttractionAddress    0
AttractionTypeId     0
AttractionType       0
VisitModeName        0
dtype: int64

In [6]:
Tourism_df.shape

(52930, 20)

In [7]:
Tourism_df.dropna(inplace=True)

In [8]:
Tourism_df.shape

(52922, 20)

In [9]:
# Normalize Ratings
Tourism_df["Rating"] = MinMaxScaler().fit_transform(Tourism_df[["Rating"]])

# Create User-Attraction Matrix
user_attraction_matrix = Tourism_df.pivot_table(index="UserId", columns="AttractionId", values="Rating", fill_value=0)

In [10]:
# Apply Truncated SVD for Dimensionality Reduction
n_components = min(25, user_attraction_matrix.shape[1])  # Increased components for better reconstruction
svd = TruncatedSVD(n_components=n_components, random_state=42)
user_attraction_matrix_reduced = svd.fit_transform(user_attraction_matrix)

# Train KNN Model with Cosine Similarity
knn_model = NearestNeighbors(n_neighbors=7, metric='cosine', algorithm='auto')  # Increased neighbors for better diversity
knn_model.fit(user_attraction_matrix_reduced)

In [11]:
# Function to Recommend Attractions
def recommend_attractions(user_id, num_recommendations=5):
    if user_id not in user_attraction_matrix.index:
        return "User ID not found! Try with a different ID."

    user_idx = user_attraction_matrix.index.get_loc(user_id)
    distances, indices = knn_model.kneighbors([user_attraction_matrix_reduced[user_idx]], n_neighbors=7)
    
    similar_users = user_attraction_matrix.index[indices.flatten()[1:]]  # Exclude self

    user_ratings = user_attraction_matrix.loc[user_id]
    unseen_attractions = user_ratings[user_ratings == 0].index  # Attractions user has not rated

    attraction_scores = {}
    for sim_user, dist in zip(similar_users, distances.flatten()[1:]):  # Weighted Similarity
        for attraction in unseen_attractions:
            attraction_scores[attraction] = attraction_scores.get(attraction, 0) + (
                user_attraction_matrix.loc[sim_user, attraction] * (1 - dist)
            )

    recommended_attractions = sorted(attraction_scores, key=attraction_scores.get, reverse=True)[:num_recommendations]
    
    return recommended_attractions if recommended_attractions else "No new recommendations found."

In [12]:
# Convert Attraction ID to Names
def map_attractions(recommended_ids):
    attraction_mapping = dict(zip(Tourism_df["AttractionId"], Tourism_df["Attraction"]))
    return [attraction_mapping[att_id] for att_id in recommended_ids if att_id in attraction_mapping]

In [13]:
# Get Recommendations
user_id = 16
recommended_ids = recommend_attractions(user_id)
recommended_names = map_attractions(recommended_ids)

print(f"Recommended Attractions for User {user_id}: {recommended_names}")

Recommended Attractions for User 16: ['Tanah Lot Temple', 'Tegenungan Waterfall', 'Sanur Beach', 'Kuta Beach - Bali', 'Seminyak Beach']


In [14]:
# Evaluate SVD Reconstruction Quality
reconstructed_matrix = svd.inverse_transform(user_attraction_matrix_reduced)
original = user_attraction_matrix.to_numpy()
reconstructed = reconstructed_matrix

rmse = np.sqrt(mean_squared_error(original, reconstructed))
mse = mean_squared_error(original, reconstructed)
mae = mean_absolute_error(original, reconstructed)
r2 = r2_score(original, reconstructed)

print(f"Reconstruction RMSE: {rmse:.4f}")
print(f"Reconstruction MSE: {mse:.4f}")
print(f"Reconstruction MAE: {mae:.4f}")
print(f"Reconstruction R² Score: {r2:.4f}")

Reconstruction RMSE: 0.0107
Reconstruction MSE: 0.0001
Reconstruction MAE: 0.0004
Reconstruction R² Score: 0.8490


In [15]:
import joblib
joblib.dump(svd,"svd(recommend).plk")
joblib.dump(user_attraction_matrix_reduced,'user attraction matrix reduce(recommend).plk')
joblib.dump(knn_model,'KNN(recommend).plk')
joblib.dump(user_attraction_matrix,'use attraction matrix(recommend).plk')
print('All models has been saved')

All models has been saved
