In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import  TFBertForSequenceClassification, BertTokenizer
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout, Input

# 1. Data Preparation
**Load tourism_data (data_wisata)**
<br>
**Load traveloka_newyearhotels (data_hotel)**

In [None]:
# Tourism destinations, read with pandas' default CSV settings.
data_wisata = pd.read_csv('tourism_data.csv')

# Hotel listings: the file is semicolon-separated, and rows that cannot be
# parsed are skipped instead of aborting the load.
data_hotel = pd.read_csv(
    'traveloka_newyearhotels.csv',
    sep=';',
    on_bad_lines='skip',
)


**Cleaning Data**
1. Data hanya difokuskan pada kota Jakarta (kolom `City` / `city`) agar sesuai dengan aplikasi yang kami buat
2. Menghapus kolom yang tidak diperlukan dalam pengerjaan
3. Menghapus baris yang memiliki nilai kosong (missing values)




In [None]:
# Columns that are not needed for the rest of the notebook.
_wisata_unused_columns = ['Time_Minutes', 'Lat', 'Long', 'Column1', '_1']
_hotel_unused_columns = [
    'timestamp', 'checkin', 'checkout', 'num_staying_nights', 'id',
    'displayName', 'latitude', 'longitude', 'lowRate', 'highRate',
    'hotelSeoUrl',
    'cheapestRate_allNights_baseFare', 'cheapestRate_allNights_fees',
    'cheapestRate_allNights_taxes', 'cheapestRate_allNights_totalFare',
    'cheapestRate_perNight_baseFare', 'cheapestRate_perNight_fees',
    'cheapestRate_perNight_taxes', 'cheapestRate_perNight_totalFare',
    'originalRate_allNights_baseFare', 'originalRate_allNights_fees',
    'originalRate_allNights_taxes', 'originalRate_allNights_totalFare',
    'originalRate_perNight_baseFare', 'originalRate_perNight_fees',
    'originalRate_perNight_taxes',
]

# For each dataset: keep only the Jakarta rows, remove the unused columns,
# then discard every row that still contains a missing value.
# Note the city column is spelled 'City' in the tourism data and 'city' in
# the hotel data.
data_wisata = (
    data_wisata.loc[data_wisata['City'] == 'Jakarta']
    .drop(columns=_wisata_unused_columns)
    .dropna()
)
data_hotel = (
    data_hotel.loc[data_hotel['city'] == 'Jakarta']
    .drop(columns=_hotel_unused_columns)
    .dropna()
)

**Mengganti nilai 0 pada kolom numerik dengan kuartil ketiga (Q3)**

In [None]:
def _zeros_to_q3(numeric_frame, q3_values):
    """Return ``numeric_frame`` with each 0 replaced by that column's entry in ``q3_values``."""
    return numeric_frame.apply(
        lambda column: column.replace(0, q3_values[column.name])
    )


# Select only the numeric columns of each dataset.
numerical_cols_hotel = data_hotel.select_dtypes(include=['number'])
numerical_cols_wisata = data_wisata.select_dtypes(include=['number'])

# Third quartile (Q3) of every numeric column. It is computed before any
# replacement, so the zeros themselves take part in the quantile.
q3_values_hotel = numerical_cols_hotel.quantile(0.75)
q3_values_wisata = numerical_cols_wisata.quantile(0.75)

# Write the zero-free numeric columns back into the original frames.
data_hotel[numerical_cols_hotel.columns] = _zeros_to_q3(
    numerical_cols_hotel, q3_values_hotel
)
data_wisata[numerical_cols_wisata.columns] = _zeros_to_q3(
    numerical_cols_wisata, q3_values_wisata
)

# Show the modified data.
print(data_hotel.head())
print(data_wisata.head())

# Report any column that still contains a 0, with its count of zeros.
cols_with_zeros_hotel = (data_hotel == 0).sum()
print(cols_with_zeros_hotel[cols_with_zeros_hotel > 0])

cols_with_zeros_wisata = (data_wisata == 0).sum()
print(cols_with_zeros_wisata[cols_with_zeros_wisata > 0])


      city                                  name                   region  \
0  Jakarta  The Sultan Hotel & Residence Jakarta         Senayan, Jakarta   
1  Jakarta               Aryaduta Suite Semanggi  Karet Semanggi, Jakarta   
2  Jakarta                   Aloft South Jakarta  Cilandak Timur, Jakarta   
3  Jakarta                  The Langham, Jakarta         Senayan, Jakarta   
4  Jakarta          Hotel Mulia Senayan, Jakarta         Senayan, Jakarta   

   starRating  userRating  numReviews userRatingInfo  \
0         5.0         8.6      7247.0    Mengesankan   
1         4.0         8.5      4882.0    Mengesankan   
2         4.0         8.9       467.0    Mengesankan   
3         5.0         8.8       340.0    Mengesankan   
4         5.0         8.7      1540.0    Mengesankan   

                                       hotelFeatures  \
0  Bayar saat Check-in, Restoran show cooking, Wi...   
1  Squash, Area main anak, Tennis, Layanan pijat,...   
2  Bar, Sewa mobil, Pusat kebuga

**Membuat file baru dari hasil cleaning data**

In [None]:
# Persist both cleaned frames as CSV files, without the index column.
for _cleaned_frame, _csv_name in (
    (data_wisata, 'cleaned_dataset_wisata.csv'),
    (data_hotel, 'cleaned_dataset_hotel.csv'),
):
    _cleaned_frame.to_csv(_csv_name, index=False)

**View New Dataset**

In [None]:
# Read the cleaned CSV files back in (tourism first, then hotel).
cleaned_data_wisata, cleaned_data_hotel = (
    pd.read_csv(_csv_name)
    for _csv_name in ('cleaned_dataset_wisata.csv', 'cleaned_dataset_hotel.csv')
)

# Preview the first 5 rows of each dataset.
for _preview_frame in (cleaned_data_wisata, cleaned_data_hotel):
    print(_preview_frame.head())

# Report the row and column counts of each dataset.
print(f"The dataset wisata has {cleaned_data_wisata.shape[0]} rows and {cleaned_data_wisata.shape[1]} columns.")
print(f"The dataset hotel has {cleaned_data_hotel.shape[0]} rows and {cleaned_data_hotel.shape[1]} columns.")

   Place_Id                         Place_Name  \
0         1                   Monumen Nasional   
1         2                           Kota Tua   
2         3                      Dunia Fantasi   
3         4  Taman Mini Indonesia Indah (TMII)   
4         5           Atlantis Water Adventure   

                                         Description       Category     City  \
0  Monumen Nasional atau yang populer disingkat d...         Budaya  Jakarta   
1  Kota tua di Jakarta, yang juga bernama Kota Tu...         Budaya  Jakarta   
2  Dunia Fantasi atau disebut juga Dufan adalah t...  Taman Hiburan  Jakarta   
3  Taman Mini Indonesia Indah merupakan suatu kaw...  Taman Hiburan  Jakarta   
4  Atlantis Water Adventure atau dikenal dengan A...  Taman Hiburan  Jakarta   

    Price  Rating                                       Coordinate  \
0   20000      46          {'lat': -6.1753924, 'lng': 106.8271528}   
1   25000      46  {'lat': -6.137644799999999, 'lng': 106.8171245}   
2  27000

Split data into train, validation, and test sets

# 2. Pembagian dataset


In [None]:
# Re-read the cleaned CSV files so this section starts from the saved data.
cleaned_data_wisata, cleaned_data_hotel = (
    pd.read_csv(_csv_name)
    for _csv_name in ('cleaned_dataset_wisata.csv', 'cleaned_dataset_hotel.csv')
)

# Function to split dataset into train, validation, and test sets
def split_dataset(data, train_size=0.7, val_size=0.15, test_size=0.15, random_state=42):
    """Split ``data`` into train, validation and test subsets.

    The split is done in two passes with ``train_test_split``: first the
    combined validation+test share is held out from ``data``, then that
    hold-out is divided between validation and test.

    Args:
        data: Collection accepted by ``train_test_split``.
        train_size: Fraction of ``data`` kept for training.
        val_size: Fraction of ``data`` used for validation.
        test_size: Fraction of ``data`` used for testing.
        random_state: Seed passed to both ``train_test_split`` calls.

    Returns:
        Tuple ``(train_data, val_data, test_data)``.

    Raises:
        AssertionError: If the three fractions do not sum to 1.0
            (within a small floating-point tolerance).
    """
    total = train_size + val_size + test_size
    # Compare with a tolerance instead of `==`: valid fractions such as
    # 0.6 + 0.3 + 0.1 evaluate to 0.9999999999999999 in binary floating
    # point and would be rejected by an exact equality check.
    assert abs(total - 1.0) < 1e-9, "Train, Validation, and Test sizes must sum to 1.0"

    # Share of the data that does not go to training.
    holdout_fraction = val_size + test_size
    train_data, temp_data = train_test_split(
        data, test_size=holdout_fraction, random_state=random_state)
    # Within the hold-out, the test share is relative to the hold-out size.
    val_data, test_data = train_test_split(
        temp_data, test_size=test_size / holdout_fraction, random_state=random_state)
    return train_data, val_data, test_data

# Both datasets use the same two-pass scheme: hold out 30% of the rows,
# then cut that hold-out in half (70% train / 15% validation / 15% test).
_first_split = {'test_size': 0.3, 'random_state': 42}
_second_split = {'test_size': 0.5, 'random_state': 42}

# --- Wisata: every column except the last is a feature; the last is the target ---
X_wisata = cleaned_data_wisata.iloc[:, :-1]
y_wisata = cleaned_data_wisata.iloc[:, -1]

X_train_wisata, X_temp_wisata, y_train_wisata, y_temp_wisata = train_test_split(
    X_wisata, y_wisata, **_first_split)
X_val_wisata, X_test_wisata, y_val_wisata, y_test_wisata = train_test_split(
    X_temp_wisata, y_temp_wisata, **_second_split)

# Write each wisata split to its own CSV file (index column omitted).
for _csv_name, _split_part in {
    'train_features_wisata.csv': X_train_wisata,
    'train_labels_wisata.csv': y_train_wisata,
    'val_features_wisata.csv': X_val_wisata,
    'val_labels_wisata.csv': y_val_wisata,
    'test_features_wisata.csv': X_test_wisata,
    'test_labels_wisata.csv': y_test_wisata,
}.items():
    _split_part.to_csv(_csv_name, index=False)

# --- Hotel: same feature/target convention as above ---
X_hotel = cleaned_data_hotel.iloc[:, :-1]
y_hotel = cleaned_data_hotel.iloc[:, -1]

X_train_hotel, X_temp_hotel, y_train_hotel, y_temp_hotel = train_test_split(
    X_hotel, y_hotel, **_first_split)
X_val_hotel, X_test_hotel, y_val_hotel, y_test_hotel = train_test_split(
    X_temp_hotel, y_temp_hotel, **_second_split)

# Write each hotel split to its own CSV file (index column omitted).
for _csv_name, _split_part in {
    'train_features_hotel.csv': X_train_hotel,
    'train_labels_hotel.csv': y_train_hotel,
    'val_features_hotel.csv': X_val_hotel,
    'val_labels_hotel.csv': y_val_hotel,
    'test_features_hotel.csv': X_test_hotel,
    'test_labels_hotel.csv': y_test_hotel,
}.items():
    _split_part.to_csv(_csv_name, index=False)

# Report how many samples ended up in each split.
print(f"Training set (Wisata): {len(X_train_wisata)} samples")
print(f"Validation set (Wisata): {len(X_val_wisata)} samples")
print(f"Testing set (Wisata): {len(X_test_wisata)} samples")

print(f"Training set (Hotel): {len(X_train_hotel)} samples")
print(f"Validation set (Hotel): {len(X_val_hotel)} samples")
print(f"Testing set (Hotel): {len(X_test_hotel)} samples")


Training set (Wisata): 58 samples
Validation set (Wisata): 13 samples
Testing set (Wisata): 13 samples
Training set (Hotel): 81 samples
Validation set (Hotel): 18 samples
Testing set (Hotel): 18 samples
