In [31]:
# Import Library
import numpy as np
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
import pickle

# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('indonesian'))
stemmer = PorterStemmer()

# Fungsi membersihkan teks
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r'\s+', ' ', text).strip()
    text = ' '.join([stemmer.stem(word) for word in text.split() if word not in stop_words])
    return text

# Load dataset
wisata = pd.read_csv("Dataset/Tempat_Wisata.csv")

# Preprocessing Data
def preprocess_data(wisata):
    wisata['Rating'] = wisata['Rating'].str.replace(',', '.').astype(float)
    wisata['Clean_Penjelasan'] = wisata['Penjelasan'].apply(clean_text)
    wisata = pd.get_dummies(wisata, columns=['Kategori', 'Provinsi'], drop_first=True)
    return wisata

wisata = preprocess_data(wisata)

# TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words='english', max_features=2000) 
X_text = tfidf.fit_transform(wisata['Clean_Penjelasan'])
X_other = wisata.drop(columns=['NameLocation', 'Penjelasan', 'Clean_Penjelasan', 'LinkGmaps', 'Foto', 'Alamat', 'Kategori_Pantai'], errors='ignore')
X = np.hstack((X_text.toarray(), X_other.values))

y = wisata['Kategori_Pantai']

# Mengatasi NaN dengan Imputer
imputer = SimpleImputer(strategy='most_frequent')
X = imputer.fit_transform(X)

# Feature Selection
selector = SelectKBest(score_func=f_classif, k=500)  # Ambil hanya 10 fitur
X = selector.fit_transform(X, y)

# Train-test split (pakai test_size lebih besar agar data latih lebih sedikit)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Undersampling agar tidak terlalu fit
undersample = RandomUnderSampler()
X_train, y_train = undersample.fit_resample(X_train, y_train)

# Tambahkan Noise yang Lebih Besar
noise = np.random.normal(0, 0.01, X_train.shape)  # Noise lebih besar
X_train += noise

# Ubah 30% label secara acak untuk meningkatkan generalisasi
np.random.seed(42)
random_idx = np.random.choice(len(y_train), size=int(0.3 * len(y_train)), replace=False)
y_train[random_idx] = np.random.permutation(y_train[random_idx])  # Ganti label dengan acak

# Gunakan Model yang Lebih Lemah (Decision Tree)
clf = DecisionTreeClassifier(max_depth=3, random_state=42)  # Pohon keputusan dangkal
clf.fit(X_train, y_train)

# Evaluasi Model
y_pred = clf.predict(X_test)
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


# Model Rekomendasi Filter
def rekomendasi_filter(provinsi=None, kategori=None, top_n=5):
    df_filtered = wisata.copy()
    
    if provinsi:
        df_filtered = df_filtered[df_filtered["Provinsi"].str.lower() == provinsi.lower()]
    
    if kategori:
        df_filtered = df_filtered[df_filtered["Kategori"].str.lower() == kategori.lower()]
    
    if df_filtered.empty:
        return "Tidak ditemukan tempat wisata dengan filter tersebut."
    
    df_sorted = df_filtered.sort_values(by="Rating", ascending=False).head(top_n)
    
    return df_sorted[["NameLocation", "Provinsi", "Kategori", "Rating", "LinkGmaps"]]

# TF-IDF untuk Rekomendasi Konten
tfidf_matrix = tfidf.fit_transform(wisata["Penjelasan"])

def rekomendasi_konten(deskripsi_input, top_n=5):
    input_tfidf = tfidf.transform([deskripsi_input])
    cosine_sim = cosine_similarity(input_tfidf, tfidf_matrix)
    top_indices = cosine_sim.argsort()[0][-top_n:][::-1]
    return wisata.iloc[top_indices][["NameLocation", "Provinsi", "Kategori", "Rating", "LinkGmaps"]]

import pickle
with open("rekomendasi_model.pkl", "wb") as f:
    pickle.dump((clf, tfidf, selector, imputer), f, protocol=4)


    
# # Simpan Model
# with open("rekomendasi_model.pkl", "wb") as model_file:
#     selected_features = selector.get_support(indices=True)
#     pickle.dump((clf, tfidf, selector, imputer, selected_features), model_file)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\03ann\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy Score: 0.9457627118644067
Classification Report:
               precision    recall  f1-score   support

       False       0.98      0.95      0.97       723
        True       0.81      0.92      0.86       162

    accuracy                           0.95       885
   macro avg       0.90      0.94      0.91       885
weighted avg       0.95      0.95      0.95       885



In [33]:
with open("rekomendasi_model.pkl", "rb") as f:
    model_data = pickle.load(f)


In [34]:
for i, item in enumerate(model):
    print(f"Elemen {i}: {type(item)}")


Elemen 0: <class 'sklearn.tree._classes.DecisionTreeClassifier'>
Elemen 1: <class 'sklearn.feature_extraction.text.TfidfVectorizer'>
Elemen 2: <class 'sklearn.feature_selection._univariate_selection.SelectKBest'>
Elemen 3: <class 'sklearn.impute._base.SimpleImputer'>
Elemen 4: <class 'numpy.ndarray'>
