# **1. Import Library**

In [18]:
import pandas as pd
import re
import string
import joblib
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
import numpy as np

from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score


# **2. Load Dataset**

In [19]:
nltk.download('stopwords')
stop_words = set(stopwords.words('indonesian'))
stemmer = PorterStemmer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r'\s+', ' ', text).strip()
    text = ' '.join([stemmer.stem(word) for word in text.split() if word not in stop_words])
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\03ann\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
def preprocess_data(wisata):
    wisata['Rating'] = wisata['Rating'].str.replace(',', '.').astype(float)
    wisata['Clean_Penjelasan'] = wisata['Penjelasan'].astype(str).apply(clean_text)
    
    # Gabungkan kategori duplikat
    wisata['Kategori'] = wisata['Kategori'].replace({"Pantai ": "Pantai"})
    
    # Gabungkan kategori minoritas agar tidak error di SMOTE
    category_counts = wisata['Kategori'].value_counts()
    rare_categories = category_counts[category_counts < 6].index.tolist()
    wisata['Kategori'] = wisata['Kategori'].apply(lambda x: 'Lainnya' if x in rare_categories else x)
    return wisata

wisata = pd.read_csv('Dataset/Tempat_Wisata.csv')
wisata = preprocess_data(wisata)
wisata.head()

Unnamed: 0,LinkGmaps,NameLocation,Rating,Foto,Provinsi,Alamat,Penjelasan,Kategori,Clean_Penjelasan
0,https://www.google.com/maps/place/Taman+Nasion...,Taman Nasional Gunung Leuser,4.5,https://lh5.googleusercontent.com/p/AF1QipNor-...,Aceh,"Taman Nasional Gunung Leuser, Aceh dan Sumater...",kawasan pelestarian alam yang terletak di Aceh...,Gunung,kawasan pelestarian alam terletak aceh sumater...
1,https://www.google.com/maps/place/Pucok+Krueng...,Pucok Krueng,4.5,https://lh5.googleusercontent.com/p/AF1QipPVqA...,Aceh,"Pucok Krueng, Aceh Besar.",Pucok Krueng adalah destinasi wisata alam yang...,Gunung,pucok krueng destinasi wisata alam menawarkan ...
2,https://www.google.com/maps/place/Pantai+Ujong...,Pantai Ujong Kareung,4.3,https://lh5.googleusercontent.com/p/AF1QipNY4o...,Aceh,"Gampong Ujong Kareung, Kecamatan Sukajaya, Kot...",Pantai Ujong Kareung terkenal dengan pasir put...,Pantai,pantai ujong kareung terken pasir putihnya air...
3,https://www.google.com/maps/place/Pantai+Lanco...,Pantai Lancok,4.0,https://lh5.googleusercontent.com/p/AF1QipNf-N...,Aceh,"Pantai Lancok, Aceh Utara.",Pantai Lancok menawarkan suasana tenang dengan...,Pantai,pantai lancok menawarkan suasana tenang pemand...
4,https://www.google.com/maps/place/Air+Terjun+P...,Air Terjun Peukan Biluy,4.0,https://lh5.googleusercontent.com/p/AF1QipOVja...,Aceh,"Peukan Biluy, Aceh Besar.",Air Terjun Peukan Biluy adalah tempat yang ide...,Taman & Alam,air terjun peukan biluy ideal menikmati keinda...


# **3. Memahami Struktur Data**

In [21]:
wisata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2212 entries, 0 to 2211
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   LinkGmaps         2212 non-null   object 
 1   NameLocation      2212 non-null   object 
 2   Rating            2211 non-null   float64
 3   Foto              2212 non-null   object 
 4   Provinsi          2212 non-null   object 
 5   Alamat            2212 non-null   object 
 6   Penjelasan        2212 non-null   object 
 7   Kategori          2212 non-null   object 
 8   Clean_Penjelasan  2212 non-null   object 
dtypes: float64(1), object(8)
memory usage: 155.7+ KB


# **4. Mengecheck data kosong**

In [22]:
#mengecheck data hilang
wisata.isnull().sum()

LinkGmaps           0
NameLocation        0
Rating              1
Foto                0
Provinsi            0
Alamat              0
Penjelasan          0
Kategori            0
Clean_Penjelasan    0
dtype: int64

In [23]:
# Mengisi nilai kosong
wisata["Rating"] = wisata["Rating"].fillna(wisata["Rating"].astype(float).mean())

wisata.describe()


Unnamed: 0,Rating
count,2212.0
mean,4.428991
std,0.248042
min,2.0
25%,4.3
50%,4.5
75%,4.6
max,5.0


In [25]:
# Model Berbasis Filter: Rekomendasi berdasarkan Provinsi & Kategori
def rekomendasi_filter(provinsi=None, kategori=None, top_n=5):
    df_filtered = wisata.copy()
    
    if provinsi:
        df_filtered = df_filtered[df_filtered["Provinsi"].str.lower() == provinsi.lower()]
    
    if kategori:
        df_filtered = df_filtered[df_filtered["Kategori"].str.lower() == kategori.lower()]
    
    if df_filtered.empty:
        return "Tidak ditemukan tempat wisata dengan filter tersebut."
    
    df_sorted = df_filtered.sort_values(by="Rating", ascending=False).head(top_n)
    
    return df_sorted[["NameLocation", "Provinsi", "Kategori", "Rating", "LinkGmaps"]]
#test latihan
rekomendasi_filter(provinsi="Sumatera Selatan", kategori="Gunung/Bukit", top_n=5)


Unnamed: 0,NameLocation,Provinsi,Kategori,Rating,LinkGmaps
1982,Bukit Pendape,Sumatera Selatan,Gunung/Bukit,4.5,https://www.google.com/maps/place/Bukit+Pendap...
1959,Taman Wisata Bukit Cogong,Sumatera Selatan,Gunung/Bukit,4.3,https://www.google.com/maps/place/Taman+Wisata...
1990,Wisata Alam Bukit Sulap Taman Nasional Kerinci...,Sumatera Selatan,Gunung/Bukit,4.2,https://www.google.com/maps/place/Wisata+Alam+...


In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(wisata["Penjelasan"])

def rekomendasi_konten(deskripsi_input, top_n=5):
    input_tfidf = tfidf.transform([deskripsi_input])
    
    # Menghitung cosine similarity
    cosine_sim = cosine_similarity(input_tfidf, tfidf_matrix)
    
    # Mengambil indeks tempat wisata dengan similarity tertinggi
    top_indices = cosine_sim.argsort()[0][-top_n:][::-1]

    return wisata.iloc[top_indices][["NameLocation", "Provinsi", "Kategori", "Rating", "LinkGmaps"]]

# Contoh rekomendasi berdasarkan deskripsi
rekomendasi_konten("Desa wisata yanag menarik", top_n=5)


Unnamed: 0,NameLocation,Provinsi,Kategori,Rating,LinkGmaps
1540,Desa Tanjung Belit,Riau,Desa Wisata,4.5,https://www.google.com/maps/place/Desa+Tanjung...
1196,Desa Wisata Sumberagung,Lampung,Desa Wisata,4.9,https://www.google.com/maps/place/Desa+Wisata+...
596,Desa Wisata Pekunden,Jawa Tengah,Desa Wisata,4.8,https://www.google.com/maps/place/Desa+Wisata+...
1962,Desa Wisata Kampung Warna Burai,Sumatera Selatan,Desa Wisata,4.5,https://www.google.com/maps/place/Desa+Wisata+...
2204,Desa Wisata Rumah Domes,Yogyakarta,Desa Wisata,4.1,https://www.google.com/maps/place/Desa+Wisata+...


# **6. TF-IDF dan Balancing Data**

In [31]:
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X_text = tfidf.fit_transform(wisata['Clean_Penjelasan'])
X_other = wisata.drop(columns=['NameLocation', 'Penjelasan', 'Clean_Penjelasan', 'LinkGmaps', 'Foto', 'Alamat'])
X = np.hstack((X_text.toarray(), X_other.values))

In [32]:
y = wisata['Kategori_Pantai']  # Contoh target (bisa disesuaikan dengan kategori lainnya)


KeyError: 'Kategori_Pantai'

In [16]:
print(wisata.columns)


Index(['LinkGmaps', 'NameLocation', 'Rating', 'Foto', 'Provinsi', 'Alamat',
       'Penjelasan', 'Kategori', 'Clean_Penjelasan'],
      dtype='object')
