# **1. Import Library**

In [1]:


import json
import pandas as pd
import joblib
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# **2. Load Model**

In [2]:
with open("Dataset/test.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)

woofi_model = joblib.load("woofi_model.pkl")
wisata_df = woofi_model[2]  # DataFrame tempat wisata

kategori_columns = [col for col in wisata_df.columns if col.startswith("Kategori_")]
kategori_series = wisata_df[kategori_columns].idxmax(axis=1).str.replace("Kategori_", "", regex=False).str.strip()
kategori_series.name = "Kategori"
wifi_df = pd.concat([wisata_df[["NameLocation"]], kategori_series], axis=1)


# **3. Menghitung Umur**

In [3]:
def calculate_age(birth_date_str):
    birth_date = datetime.fromisoformat(birth_date_str.replace('Z', '+00:00'))
    today = datetime.today()
    return today.year - birth_date.year - ((today.month, today.day) < (birth_date.month, birth_date.day))

# **4. Data User**

In [4]:
def preprocess_user_data(test_data, wisata_df):
    user_rows = []

    for user in test_data:
        if not user.get("gender") or not user.get("birth_date"):
            continue

        gender = 1 if user["gender"].lower() == "female" else 0
        try:
            age = calculate_age(user["birth_date"])
        except:
            continue

        interests = user.get("interest", [])
        searches = user.get("searchs", [])

        if not isinstance(searches, list):
            continue

        for s in searches:
            place_name = s.get("name")
            count = s.get("count", 1)
            if not place_name:
                continue

            match = wisata_df[wisata_df["NameLocation"] == place_name]
            if not match.empty:
                kategori = match.iloc[0]["Kategori"]
                label = match.iloc[0]["NameLocation"]

                user_rows.append({
                    "gender": gender,
                    "age": age,
                    "interest": interests,
                    "kategori": kategori,
                    "count": count,
                    "label": label
                })

    return pd.DataFrame(user_rows)

In [5]:
user_df = preprocess_user_data(test_data, wifi_df)

# **5. Pelatihan**

In [6]:
# One-hot untuk interest multi-label
mlb = MultiLabelBinarizer()
interest_encoded = pd.DataFrame(mlb.fit_transform(user_df["interest"]), columns=[f"interest_{c}" for c in mlb.classes_])

# One-hot untuk kategori
ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
kategori_encoded = ohe.fit_transform(user_df[["kategori"]])
kategori_encoded = pd.DataFrame(kategori_encoded, columns=ohe.get_feature_names_out(["kategori"]))

# Gabungkan semua fitur
X = pd.concat([
    interest_encoded.reset_index(drop=True),
    kategori_encoded.reset_index(drop=True),
    user_df[["gender", "age", "count"]].reset_index(drop=True)
], axis=1)

y = user_df["label"]


In [7]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)

model = RandomForestClassifier(n_estimators=50, random_state=42)
model.fit(X_train, y_train)

print(f"Validation Accuracy: {model.score(X_val, y_val):.2f}")

Validation Accuracy: 0.95


# **5. Export**

In [8]:
joblib.dump(model, "rekomendasi_wisata_model.pkl")
joblib.dump(ohe, "rekomendasi_encoder_kategori.pkl")
joblib.dump(mlb, "rekomendasi_encoder_interest.pkl")
print("✅ Model dan encoder berhasil disimpan.")

✅ Model dan encoder berhasil disimpan.


# **6. Testing**

In [9]:
# Tambah kategori utama ke wisata_df
kategori_series = wisata_df[kategori_columns].idxmax(axis=1).str.replace("Kategori_", "", regex=False).str.strip()
wisata_df["Kategori"] = kategori_series

# --- Fungsi untuk user baru ---
def recommend_for_new_user(gender, age, interest_list, top_n=5):
    # Load model dan encoder
    model = joblib.load("rekomendasi_wisata_model.pkl")
    ohe = joblib.load("rekomendasi_encoder_kategori.pkl")
    mlb = joblib.load("rekomendasi_encoder_interest.pkl")

    # Encode interest
    interest_data = mlb.transform([interest_list])
    interest_df = pd.DataFrame(interest_data, columns=[f"interest_{c}" for c in mlb.classes_])

    # Dummy kategori (wajib untuk bentuk input, tidak dipakai di prediksi)
    dummy_kategori = pd.DataFrame(ohe.transform([["Taman"]]), columns=ohe.get_feature_names_out(["kategori"]))

    # Gabungkan fitur
    input_df = pd.concat([
        interest_df,
        dummy_kategori,
        pd.DataFrame([[gender, age, 1]], columns=["gender", "age", "count"])
    ], axis=1)

    # Tambah kolom yang mungkin tidak ada
    missing_cols = set(model.feature_names_in_) - set(input_df.columns)
    for col in missing_cols:
        input_df[col] = 0
    input_df = input_df[model.feature_names_in_]

    # Prediksi probabilitas
    proba = model.predict_proba(input_df)[0]
    top_indices = proba.argsort()[::-1][:top_n]
    labels = model.classes_[top_indices]

    # Ambil detail dari wisata_df berdasarkan nama lokasi
    rekomendasi = wisata_df[wisata_df["NameLocation"].isin(labels)]

    # Urutkan berdasarkan urutan label prediksi
    rekomendasi["skor"] = rekomendasi["NameLocation"].apply(lambda x: proba[labels.tolist().index(x)] if x in labels else 0)
    rekomendasi = rekomendasi.sort_values(by="skor", ascending=False)

    return rekomendasi[["NameLocation", "Kategori", "skor"]].head(top_n).values.tolist()


# --- Fungsi untuk user lama ---
def recommend_for_existing_user(user_data, wisata_df, top_n=5):
    user_interest = user_data.get("interest", [])
    searchs = user_data.get("searchs", [])
    searched_places = [s["name"] for s in searchs if isinstance(s, dict) and "name" in s]

    # Rekomendasi berdasarkan interest = kategori dan tempat belum pernah dicari
    rekomendasi = wisata_df[
        wisata_df["Kategori"].isin(user_interest) &
        ~wisata_df["NameLocation"].isin(searched_places)
    ]

    return rekomendasi[["NameLocation", "Kategori"]].drop_duplicates().head(top_n).values.tolist()


In [10]:
# Contoh user baru
print("\n📌 Rekomendasi untuk user baru:")
hasil_baru = recommend_for_new_user(gender=1, age=25, interest_list=["Pantai", "Alam"])

# Karena hasilnya adalah list of tuples yang berisi (Nama Lokasi, Kategori, Skor)
for hasil in hasil_baru:
    nama, kategori, skor = hasil
    print(f"{nama} - Kategori: {kategori}, Skor: {skor:.2f}")


# Contoh user lama
print("\n📌 Rekomendasi untuk user lama:")
user_lama = {
    "interest": ["Sejarah", "Kuliner"],
    "searchs": [{"name": "Candi Borobudur"}, {"name": "Kota Lama Semarang"}]
}
hasil_lama = recommend_for_existing_user(user_lama, wisata_df)
for nama, kategori in hasil_lama:
    print(f"{nama} - Kategori: {kategori}")



📌 Rekomendasi untuk user baru:
Tugu Kupiah Teuku Umar - Kategori: Budaya & Sejarah, Skor: 0.02
Taman Putroe Phang - Kategori: Taman & Alam, Skor: 0.02
Pucok Krueng - Kategori: Gunung, Skor: 0.00
Tapak Tuan Tapa - Kategori: Budaya & Sejarah, Skor: 0.00
Taman Sulthanah Safiatuddin Banda Aceh - Kategori: Taman & Alam, Skor: 0.00

📌 Rekomendasi untuk user lama:
Kolam Renang Mata Ie - Kategori: Kuliner
Natio - Kategori: Kuliner


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rekomendasi["skor"] = rekomendasi["NameLocation"].apply(lambda x: proba[labels.tolist().index(x)] if x in labels else 0)
