In [222]:
import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [223]:
df = pd.read_csv('new_bali_dataset.csv', delimiter=';')
df.head()

Unnamed: 0,index,place,url,is_accessibility,rating,n_reviews,address,price,idx_category,category,description,lat,long,Coordinate,Is_accessibility
0,0,Pura Puseh Desa Adat Kutri,https://www.google.com/maps/place/Pura+Puseh+D...,0,3.0,1,"F863+W53, Buruan, Kec. Blahbatuh, Kabupaten Gi...",15000,0,Sejarah,Pura Puseh is one of the oldest temples in Bal...,-8.514987,115.298791,"-8.514986706754593, 115.29879103124946",False
1,1,Pura Dalem Begawan,https://www.google.com/maps/place/Pura+Dalem+B...,0,3.0,2,"8682+87P, Dauh Puri Kauh, Kec. Denpasar Bar., ...",15000,0,Sejarah,"Located near the Petanu River, Pura Dalem Bega...",-8.682596,115.20061,"-8.682595632877103, 115.20061015733499",False
2,2,D'tukad Desa Budaya Kertalangu,https://www.google.com/maps/place/D'tukad+Desa...,0,3.6,9,"9744+WJQ, Kesiman Kertalangu, Kec. Denpasar Ti...",20000,0,Sejarah,D'tukad Desa Budaya Kertalangu is a cultural t...,-8.639347,115.257474,"-8.639347199121982, 115.25747408289917",False
3,3,Desa Adat Cangkup,https://www.google.com/maps/place/Desa+Adat+Ca...,0,3.7,3,"Pupuan Sawah, Kec. Selemadeg, Kabupaten Tabana...",15000,0,Sejarah,Desa Adat Cangkup is a traditional village tha...,-8.439189,115.051934,"-8.43918883821229, 115.05193408611053",False
4,4,Monumen Perjuangan Desa Tegaljadi,https://www.google.com/maps/place/Monumen+Perj...,0,4.0,1,"G585+J3, Tegaljadi, Kec. Marga, Kabupaten Taba...",15000,0,Sejarah,This monument was erected to commemorate the s...,-8.483242,115.157706,"-8.4832418767373, 115.15770565767143",False


In [224]:
df.shape

(195, 15)

# Preproses

In [225]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   index             195 non-null    int64  
 1   place             195 non-null    object 
 2   url               195 non-null    object 
 3   is_accessibility  195 non-null    int64  
 4   rating            195 non-null    float64
 5   n_reviews         195 non-null    int64  
 6   address           195 non-null    object 
 7   price             195 non-null    int64  
 8   idx_category      195 non-null    int64  
 9   category          195 non-null    object 
 10  description       195 non-null    object 
 11  lat               163 non-null    float64
 12  long              163 non-null    float64
 13  Coordinate        163 non-null    object 
 14  Is_accessibility  163 non-null    object 
dtypes: float64(3), int64(5), object(7)
memory usage: 23.0+ KB


In [226]:
df['price'].fillna(0, inplace=True)
df['category'].fillna('unknown', inplace=True)


In [227]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   index             195 non-null    int64  
 1   place             195 non-null    object 
 2   url               195 non-null    object 
 3   is_accessibility  195 non-null    int64  
 4   rating            195 non-null    float64
 5   n_reviews         195 non-null    int64  
 6   address           195 non-null    object 
 7   price             195 non-null    int64  
 8   idx_category      195 non-null    int64  
 9   category          195 non-null    object 
 10  description       195 non-null    object 
 11  lat               163 non-null    float64
 12  long              163 non-null    float64
 13  Coordinate        163 non-null    object 
 14  Is_accessibility  163 non-null    object 
dtypes: float64(3), int64(5), object(7)
memory usage: 23.0+ KB


In [228]:

# Function for removing NonAscii characters
def _removeNonAscii(text):
    return "".join(i for i in text if  ord(i)<128)

# Function for converting into lower case
def make_lower_case(text):
    return text.lower()

# Function for removing stop words
def remove_stop_words(text):
    text = text.split()
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops]
    text = " ".join(text)
    return text

# Function for removing punctuation
def remove_punctuation(text):
    tokenizer = RegexpTokenizer(r'\w+')
    text = tokenizer.tokenize(text)
    text = " ".join(text)
    return text

# Function for removing the html tags
def remove_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

# Applying all the functions in description and storing as a cleaned_desc
df['cleaned_desc'] = df['description'].apply(_removeNonAscii)
df['cleaned_desc'] = df.cleaned_desc.apply(func = make_lower_case)
df['cleaned_desc'] = df.cleaned_desc.apply(func = remove_stop_words)
df['cleaned_desc'] = df.cleaned_desc.apply(func=remove_punctuation)
df['cleaned_desc'] = df.cleaned_desc.apply(func=remove_html)

# Recommendation

In [229]:
def recommendation(index):
    
    place = df.loc[index, 'place']
    category = df.loc[index, 'category']
    # Matching the category with the dataset and reset the index
    data_category = df[df['category'] == category].reset_index(drop=True)
  
    # Convert the index into series
    indices = pd.Series(data_category.index, index=data_category['place'])
    
    # Converting the place description into vectors
    tf = TfidfVectorizer(analyzer='word', ngram_range=(2, 2), min_df=1, stop_words='english')
    tfidf_matrix = tf.fit_transform(data_category['cleaned_desc'])
    
    # Calculating the similarity measures based on Cosine Similarity
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
    
    # Get the index corresponding to the place
    idx = indices[place]
    
    # Get the pairwise similarity scores 
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the places
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Scores of the 5 most similar places
    sim_scores = sim_scores[1:10]
    
    # Place indices
    place_indices = [i[0] for i in sim_scores]
   
    # Top 5 place recommendations
    records = data_category['place'].iloc[place_indices]
    return records

In [230]:
recommendation(160)

5      Pura Dalem Desa Adat Negara Batuan
2                Pura Jagatnatha Jembrana
0                      Pura Luhur Srijong
1                           Pura Perancak
3    Nista mandala pura luhur rambut siwi
4                        Pura Geger Beach
Name: place, dtype: object

In [231]:
recommendation(160)

5      Pura Dalem Desa Adat Negara Batuan
2                Pura Jagatnatha Jembrana
0                      Pura Luhur Srijong
1                           Pura Perancak
3    Nista mandala pura luhur rambut siwi
4                        Pura Geger Beach
Name: place, dtype: object

In [232]:
def sort_place(affordable_places,lat_user, long_user):
    # get latitude and longitude from affordable_places

    # calculate euclidean distance from user to places
    distances = np.sqrt(((affordable_places['lat'] - lat_user)**2) + ((affordable_places['long'] - long_user)**2))
    affordable_places['distance'] = distances
    
    # sort dataframe based on distance
    affordable_places = affordable_places.sort_values(by='distance', ascending=False)
    
    return affordable_places

    

In [257]:
def create_recommendation(idx_selected, budget, days, lat_user, long_user, is_accessibility=0):
    # create recommendation
    recommended_places = set()
    # selected_places = df.iloc[idx_selected][['place', 'category']].values
    for idx in idx_selected:
        recommended_places.update(recommendation(idx))

    filtered_places = df[df['place'].isin(recommended_places)]
    
    # filter based on accessibility
    if is_accessibility == 1:
        filtered_places = filtered_places[filtered_places['is_accessibility'] == 1]
   
    #filter based on budget
    affordable_places = filtered_places[filtered_places['price'] <= budget / days]

    # Sort places by Google Maps Rating and then by Review Count
    affordable_places = affordable_places.sort_values(by=['rating', 'n_reviews'], ascending=[False, False])
    # return affordable_places
    # sort by distance
    affordable_places = sort_place(affordable_places, lat_user, long_user)

    # Create itinerary
    itinerary = {}
    list_of_dest = []
    total_cost = 0
    places_per_day = 3
    all_places = filtered_places.sort_values(by=['rating', 'n_reviews'], ascending=[False, False])

    used_places = set()
    for day in range(1, days + 1):
        daily_itinerary = affordable_places[~affordable_places['place'].isin(used_places)].head(places_per_day)
        
        # Fallback if no affordable places are left
        if daily_itinerary.empty:
            daily_itinerary = all_places[~all_places['place'].isin(used_places)].head(places_per_day)
        
        # Ensure at least one place per day
        while len(daily_itinerary) < places_per_day and not all_places[~all_places['place'].isin(used_places)].empty:
            additional_place = all_places[~all_places['place'].isin(used_places)].head(1)
            daily_itinerary = pd.concat([daily_itinerary, additional_place])
        
        for row in daily_itinerary.iterrows():
            dest_dict = {}
            dest_dict['place'] = row[1]['place']
            dest_dict['url'] = row[1]['url']
            dest_dict['address'] = row[1]['address']
            dest_dict['is_accessibility'] = row[1]['is_accessibility']
            dest_dict['rating'] = row[1]['rating']
            dest_dict['n_reviews'] = row[1]['n_reviews']
            dest_dict['price'] = row[1]['price']
            dest_dict['category'] = row[1]['category']
            dest_dict['description'] = row[1]['description']
            list_of_dest.append(dest_dict)
            
        # itinerary[f'Day {day}'] = {
        #     'place' : daily_itinerary['place'].tolist(),
        #     'url' : daily_itinerary['url'].tolist(),
        #     'is_accessibility' : daily_itinerary['is_accessibility'].tolist(),
        #     'rating' : daily_itinerary['rating'].tolist(),
        #     'n_reviews' : daily_itinerary['n_reviews'].tolist(),
        #     'address' : daily_itinerary['address'].tolist(),
        #     'price' : daily_itinerary['price'].tolist(),
        #     'category' : daily_itinerary['category'].tolist(),
        #     'description' : daily_itinerary['description'].tolist(),
        #     'lat' : daily_itinerary['lat'].tolist(),
        #     'long' : daily_itinerary['long'].tolist()
        # }
        # print(list_of_dest)
        # itinerary[f'Day {day}'] = daily_itinerary['place'].tolist()
        itinerary[f'Day {day}'] = list_of_dest
        daily_cost = daily_itinerary['price'].sum()
        total_cost += daily_cost
        
        # Remove selected places from affordable_places and all_places to avoid duplicates
        used_places.update(daily_itinerary['place'])

    #     print(f"Jadwal Hari {day} :")
    #     for i, place in enumerate(daily_itinerary['place'], 1):
    #         print(f"{i}. {place}")
    #     print(f"Biaya yang dikeluarkan untuk hari {day} : Rp {daily_cost}\n")
    
    # print(f"Total biaya yang dikeluarkan selama {days} hari : Rp {total_cost}\n")
    return itinerary
    

    

In [258]:
### TRY TO IMPLEMENT THE RECOMMENDATION SYSTEM HERE ###
idx_selected = [80,21,24]
budget = 100_000
days = 3
lat_user = -8.409518
long_user = 115.188919
is_accessibility = 0

get_recommendation = create_recommendation(idx_selected,budget, days, lat_user, long_user, is_accessibility)
get_recommendation

{'Day 1': [{'place': 'Pantai Teluk Terima',
   'url': 'https://www.google.com/maps/place/Pantai+Karang+Sewu/@-8.154945,114.5221425,17z/data=!3m1!4b1!4m6!3m5!1s0x2dd14275c7b4da25:0x63c1a19ef6d47c7d!8m2!3d-8.154945!4d114.5221425!16s%2Fg%2F11fx8bkzxl?entry=ttu',
   'address': 'RGWC+2VC, Sumber Klampok, Kec. Gerokgak, Kabupaten Buleleng, Bali 81155',
   'is_accessibility': 0,
   'rating': 4.2,
   'n_reviews': 124,
   'price': 15000,
   'category': 'Pantai',
   'description': 'A tranquil and secluded beach in West Bali, Teluk Terima offers stunning ocean views and a peaceful atmosphere. This beach is ideal for snorkeling and diving with its clear water and beautiful coral reefs.'},
  {'place': 'Pantai Candikusuma',
   'url': 'https://www.google.com/maps/place/Pantai+Candikusuma/@-8.3080765,113.9412762,10z/data=!4m11!1m3!2m2!1stujuan+wisata+di+bali!6e1!3m6!1s0x2dd15e1bffae1d07:0x9a02e805abc254f8!8m2!3d-8.3080765!4d114.5180584!15sChV0dWp1YW4gd2lzYXRhIGRpIGJhbGlaFyIVdHVqdWFuIHdpc2F0YSBkaSBiYWx

In [237]:
def to_json(idx_recom):
    # decode all list

    # create json
    
    return None