In [1]:
# Mount Google Drive at /content/drive; the dataset paths used by the later cells live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Load the destination catalogue and the user ratings from Google Drive.
destinations = pd.read_csv('/content/drive/MyDrive/data/Data_Destination_Tourism_West_java.csv', encoding = 'ISO-8859-1')
ratings = pd.read_csv('/content/drive/MyDrive/data/tourism_rating.csv')

# Dataset sizes: number of distinct places and number of rating rows.
print('Jumlah tempat wisata: ', len(destinations.Place_Id.unique()))
print('Jumlah rating: ', len(ratings.Place_Ratings))

# Convert 'Latitude' to a numeric dtype; values that cannot be parsed become NaN.
destinations['Latitude'] = pd.to_numeric(destinations['Latitude'], errors='coerce')

# Convert 'Rating' to a numeric dtype; values that cannot be parsed become NaN.
destinations['Rating'] = pd.to_numeric(destinations['Rating'], errors = 'coerce')

# Re-check the dataset summary. DataFrame.info() prints the report itself and
# returns None, so wrapping it in print() only adds a stray "None" line.
destinations.info()


# Quick look at both frames. Bare expressions are rendered by the notebook
# only when they are the last statement of a cell.
destinations.head()
ratings.info()
ratings.head()
ratings.describe()

# Discard the destination columns that the following steps do not use.
destinations = destinations.drop(
    columns = ['Description', 'Price', 'Rating', 'No.Telepon', 'Coordinate', 'Latitude', 'Longitude']
)

# Count of missing values per column in the destination data.
destinations.isna().sum()

# Count of missing values per column in the user ratings.
ratings.isna().sum()

# Report how many fully duplicated rows each frame contains.
print(f'Jumlah data destinasi wisata yang duplikat: {destinations.duplicated().sum()}')
print(f'Jumlah data rating pengguna wisata yang duplikat: {ratings.duplicated().sum()}')

# Remove the duplicated rating rows (only the ratings frame is de-duplicated).
ratings = ratings.drop_duplicates()

from sklearn.feature_extraction.text import TfidfVectorizer

# Normalise 'City': replace underscores with spaces, then collapse any run of
# whitespace into a single space (also trims leading/trailing whitespace).
destinations['City'] = destinations['City'].str.replace('_', ' ', regex = False)
destinations['City'] = destinations['City'].str.split().str.join(' ')

# The vectorizer is deliberately NOT called `tf`: that name is the alias of the
# `tensorflow` module imported at the top of the notebook, and rebinding it
# would break every later `tf.<...>` TensorFlow call.
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary from 'City' and build the TF-IDF matrix in one pass
# (a separate fit() followed by fit_transform() would fit the data twice).
tfidf_matrix = tfidf_vectorizer.fit_transform(destinations['City'])

# Terms extracted from 'City'; these label the columns of the TF-IDF matrix.
feature_names = tfidf_vectorizer.get_feature_names_out()

tfidf_matrix.shape

# Dense view of the sparse matrix as a plain ndarray.
tfidf_matrix.toarray()

# TF-IDF weights as a table: one row per place, one column per term;
# a random sample of 15 rows is taken for inspection.
pd.DataFrame(
    tfidf_matrix.toarray(),
    columns = feature_names,
    index = destinations.Place_Name
).sample(15, axis = 0)

from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of rows of the TF-IDF matrix.
cosine_simlrty = cosine_similarity(tfidf_matrix)
cosine_simlrty

# Wrap the similarity matrix in a DataFrame whose rows and columns are both
# labelled with the place names.
cosine_sim_df = pd.DataFrame(
    data = cosine_simlrty,
    index = destinations.Place_Name,
    columns = destinations.Place_Name,
)

print('Shape: ', cosine_sim_df.shape)
# Random sample of 20 rows of the similarity table for inspection.
cosine_sim_df.sample(n = 20, axis = 0)

def place_recommendation(place_name, similarity_data = cosine_sim_df, items = destinations[['Place_Name', 'Category', 'City']], k =10):
  """Return up to ``k`` places most similar to ``place_name``.

  Parameters
  ----------
  place_name : label
      Column label of the query place in ``similarity_data``.
  similarity_data : pandas.DataFrame
      Similarity table. The scores in the ``place_name`` column are ranked and
      the resulting row positions are used to pick labels from
      ``similarity_data.columns``, so rows and columns are assumed to be in
      the same order. The default is bound when the function is defined.
  items : pandas.DataFrame
      Metadata merged onto the selected labels (pandas joins on the column
      names the two frames share). The default is bound when the function is
      defined.
  k : int
      Maximum number of recommendations to return.

  Returns
  -------
  pandas.DataFrame
      At most ``k`` rows of ``items`` for the most similar places, highest
      score first; the query place itself is excluded.

  Raises
  ------
  KeyError
      If ``place_name`` is not a column of ``similarity_data``.
  """
  scores = similarity_data.loc[:, place_name]
  # When the column label is duplicated, .loc returns a DataFrame; keep the
  # first matching column so the scores stay one-dimensional.
  if isinstance(scores, pd.DataFrame):
    scores = scores.iloc[:, 0]
  scores = scores.to_numpy()
  # Full stable sort, highest score first. The previous
  # argpartition(range(-1, -k, -1)) only placed the top k-1 slots, while k+1
  # slots were read afterwards, so the last results were not guaranteed to be
  # the next-best matches. The stable sort also makes ties deterministic
  # (original column order) and works when fewer than k+1 places exist.
  ranked = (-scores).argsort(kind = 'stable')
  # Take k+1 candidates because the query place normally ranks first and is
  # removed on the next line.
  clossest = similarity_data.columns[ranked[:k + 1]]
  clossest = clossest.drop(place_name, errors = 'ignore')
  # NOTE(review): the merge matches on names, so a name that occurs more than
  # once in `items` contributes all of its rows, including rows that were not
  # selected by similarity -- confirm whether that is intended.
  return pd.DataFrame(clossest).merge(items).head(k)

# Example query: look up the place's own record, then request recommendations.
place_name = 'Jalan Braga'
destinations.loc[destinations['Place_Name'] == place_name]

place_recommendation(place_name)

Jumlah tempat wisata:  523
Jumlah rating:  10300
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 523 entries, 0 to 522
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Place_Id     523 non-null    int64  
 1   Place_Name   523 non-null    object 
 2   Description  523 non-null    object 
 3   Category     523 non-null    object 
 4   City         523 non-null    object 
 5   Price        523 non-null    int64  
 6   Rating       431 non-null    float64
 7   No.Telepon   523 non-null    object 
 8   Coordinate   523 non-null    object 
 9   Latitude     521 non-null    float64
 10  Longitude    523 non-null    float64
dtypes: float64(3), int64(2), object(6)
memory usage: 45.1+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10300 entries, 0 to 10299
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   User_Id        10300 non-null  int6

Unnamed: 0,Place_Name,Category,City
0,Taman Hutan Raya Ir. H. Juanda,Cagar Alam,Bandung
1,Taman Begonia,Cagar Alam,Bandung
2,Tafso Barn,Cagar Alam,Bandung
3,Curug Cimahi,Cagar Alam,Bandung
4,The Lodge Maribaya,Cagar Alam,Bandung
5,Kawah Rengganis Cibuni,Cagar Alam,Bandung
6,Taman Kupu-Kupu Cihanjuang,Cagar Alam,Bandung
7,Taman Kupu-Kupu Cihanjuang,Cagar Alam,Cimahi
8,Glamping Lakeside Rancabali,Taman Hiburan,Bandung
9,Situ Patenggang,Cagar Alam,Bandung


In [5]:
# Save the destinations DataFrame to a .pkl file on the mounted Drive.
destinations.to_pickle(path = '/content/drive/MyDrive/data/destinations_data.pkl')

# Save the ratings DataFrame to a .pkl file on the mounted Drive.
ratings.to_pickle(path = '/content/drive/MyDrive/data/ratings_data.pkl')
