# **1. Import Library**

In [74]:
import pandas as pd
import re
import string
import joblib
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
import numpy as np

from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.under_sampling import RandomUnderSampler

import pickle


# **2. Load Dataset**

In [75]:
nltk.download('stopwords')
stop_words = set(stopwords.words('indonesian'))
stemmer = PorterStemmer()

# Fungsi membersihkan teks
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r'\s+', ' ', text).strip()
    text = ' '.join([stemmer.stem(word) for word in text.split() if word not in stop_words])
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\03ann\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [76]:
wisata = pd.read_csv("Dataset/Tempat_Wisata.csv")
wisata.head()

Unnamed: 0,LinkGmaps,NameLocation,Rating,Foto,Provinsi,Alamat,Penjelasan,Kategori
0,https://www.google.com/maps/place/Taman+Nasion...,Taman Nasional Gunung Leuser,45,https://lh5.googleusercontent.com/p/AF1QipNor-...,Aceh,"Taman Nasional Gunung Leuser, Aceh dan Sumater...",kawasan pelestarian alam yang terletak di Aceh...,Gunung
1,https://www.google.com/maps/place/Pucok+Krueng...,Pucok Krueng,45,https://lh5.googleusercontent.com/p/AF1QipPVqA...,Aceh,"Pucok Krueng, Aceh Besar.",Pucok Krueng adalah destinasi wisata alam yang...,Gunung
2,https://www.google.com/maps/place/Pantai+Ujong...,Pantai Ujong Kareung,43,https://lh5.googleusercontent.com/p/AF1QipNY4o...,Aceh,"Gampong Ujong Kareung, Kecamatan Sukajaya, Kot...",Pantai Ujong Kareung terkenal dengan pasir put...,Pantai
3,https://www.google.com/maps/place/Pantai+Lanco...,Pantai Lancok,40,https://lh5.googleusercontent.com/p/AF1QipNf-N...,Aceh,"Pantai Lancok, Aceh Utara.",Pantai Lancok menawarkan suasana tenang dengan...,Pantai
4,https://www.google.com/maps/place/Air+Terjun+P...,Air Terjun Peukan Biluy,40,https://lh5.googleusercontent.com/p/AF1QipOVja...,Aceh,"Peukan Biluy, Aceh Besar.",Air Terjun Peukan Biluy adalah tempat yang ide...,Taman & Alam


In [77]:
def preprocess_data(wisata):
    wisata['Rating'] = wisata['Rating'].str.replace(',', '.').astype(float)
    wisata['Clean_Penjelasan'] = wisata['Penjelasan'].apply(clean_text)
    wisata = pd.get_dummies(wisata, columns=['Kategori', 'Provinsi'], drop_first=True)
    return wisata

wisata = preprocess_data(wisata)


# **3. Memahami Struktur Data**

In [78]:
wisata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2212 entries, 0 to 2211
Data columns (total 64 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   LinkGmaps                           2212 non-null   object 
 1   NameLocation                        2212 non-null   object 
 2   Rating                              2211 non-null   float64
 3   Foto                                2212 non-null   object 
 4   Alamat                              2212 non-null   object 
 5   Penjelasan                          2212 non-null   object 
 6   Clean_Penjelasan                    2212 non-null   object 
 7   Kategori_Air Terjun                 2212 non-null   bool   
 8   Kategori_Budaya & Sejarah           2212 non-null   bool   
 9   Kategori_Danau                      2212 non-null   bool   
 10  Kategori_Desa Wisata                2212 non-null   bool   
 11  Kategori_Ekowisata                  2212 no

# **4. Mengecheck data kosong**

In [79]:
#mengecheck data hilang
wisata.isnull().sum()

LinkGmaps                    0
NameLocation                 0
Rating                       1
Foto                         0
Alamat                       0
                            ..
Provinsi_Sulawesi Utara      0
Provinsi_Sumatera Barat      0
Provinsi_Sumatera Selatan    0
Provinsi_Sumatera Utara      0
Provinsi_Yogyakarta          0
Length: 64, dtype: int64

In [80]:
# Mengisi nilai kosong
wisata["Rating"] = wisata["Rating"].fillna(wisata["Rating"].astype(float).mean())

wisata.describe()


Unnamed: 0,Rating
count,2212.0
mean,4.428991
std,0.248042
min,2.0
25%,4.3
50%,4.5
75%,4.6
max,5.0


In [98]:
# TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words='english', max_features=1000) 
X_text = tfidf.fit_transform(wisata['Clean_Penjelasan'])
X_other = wisata.drop(columns=['NameLocation', 'Penjelasan', 'Clean_Penjelasan', 'LinkGmaps', 'Foto', 'Alamat', 'Kategori_Pantai'], errors='ignore')
X = np.hstack((X_text.toarray(), X_other.values))

y = wisata['Kategori_Pantai']

In [99]:
# Mengatasi NaN dengan Imputer
imputer = SimpleImputer(strategy='most_frequent')
X = imputer.fit_transform(X)

# Feature Selection
selector = SelectKBest(score_func=f_classif, k=100)  # Ambil hanya 10 fitur
X = selector.fit_transform(X, y)

# Train-test split (pakai test_size lebih besar agar data latih lebih sedikit)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)


In [108]:
# Undersampling agar tidak terlalu fit
undersample = RandomUnderSampler()
X_train, y_train = undersample.fit_resample(X_train, y_train)

# Tambahkan Noise yang Lebih Besar
noise = np.random.normal(0, 0.1, X_train.shape)  # Noise lebih besar
X_train += noise

In [109]:

np.random.seed(42)
random_idx = np.random.choice(len(y_train), size=int(0.3 * len(y_train)), replace=False)
y_train[random_idx] = np.random.permutation(y_train[random_idx])  # Ganti label dengan acak

# Gunakan Model yang Lebih Lemah (Decision Tree)
clf = DecisionTreeClassifier(max_depth=3, random_state=42)  # Pohon keputusan dangkal
clf.fit(X_train, y_train)

In [110]:
# Evaluasi Model
y_pred = clf.predict(X_test)
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))



Accuracy Score: 0.9265536723163842
Classification Report:
               precision    recall  f1-score   support

       False       0.94      0.97      0.96       723
        True       0.84      0.74      0.79       162

    accuracy                           0.93       885
   macro avg       0.89      0.85      0.87       885
weighted avg       0.92      0.93      0.92       885



In [114]:
# Model Rekomendasi Filter
def rekomendasi_filter(provinsi=None, kategori=None, top_n=5):
    df_filtered = wisata.copy()
    
    if provinsi:
        df_filtered = df_filtered[df_filtered["Provinsi"].str.lower() == provinsi.lower()]
    
    if kategori:
        df_filtered = df_filtered[df_filtered["Kategori"].str.lower() == kategori.lower()]
    
    if df_filtered.empty:
        return "Tidak ditemukan tempat wisata dengan filter tersebut."
    
    df_sorted = df_filtered.sort_values(by="Rating", ascending=False).head(top_n)
    
    return df_sorted[["NameLocation", "Provinsi", "Kategori", "Rating", "LinkGmaps"]]



In [116]:
# TF-IDF untuk Rekomendasi Konten
tfidf_matrix = tfidf.fit_transform(wisata["Penjelasan"])

def rekomendasi_konten(deskripsi_input, top_n=5):
    input_tfidf = tfidf.transform([deskripsi_input])
    cosine_sim = cosine_similarity(input_tfidf, tfidf_matrix)
    top_indices = cosine_sim.argsort()[0][-top_n:][::-1]
    return wisata.iloc[top_indices][["NameLocation", "Provinsi", "Kategori", "Rating", "LinkGmaps"]]

In [117]:
with open("woofi_model.pkl", "wb") as f:
    pickle.dump((clf, tfidf, selector, imputer), f, protocol=4)
