# Import Libraries

In [76]:
import numpy as np
import pandas as pd
import json

import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import seaborn as sns

import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings("ignore")

# Preprocessing

In [77]:
def split_reviews_per_rating(df):
    """Expand the JSON ``reviews_per_rating`` column into one count column per star.

    Each cell of ``reviews_per_rating`` is expected to hold a JSON object keyed
    by the star rating as a string, e.g. ``'{"1": 3, "5": 40}'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``reviews_per_rating`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` without ``reviews_per_rating`` and with the columns
        ``reviews_one_star`` ... ``reviews_five_star`` appended. A rating that
        is absent from the JSON object, or whose value is ``null``, counts as 0.
        A cell that is missing altogether (None/NaN/JSON ``null``) yields 0 for
        every rating.

    Raises
    ------
    json.JSONDecodeError
        If a string cell is not valid JSON.
    TypeError
        If a cell is neither a JSON string, a dict, nor a missing value.
    """
    star_columns = {
        '1': 'reviews_one_star',
        '2': 'reviews_two_star',
        '3': 'reviews_three_star',
        '4': 'reviews_four_star',
        '5': 'reviews_five_star',
    }

    def _parse_counts(raw):
        # Decode the JSON first and deal with nulls afterwards; rewriting the
        # raw text ('null' -> '0') would also touch unrelated substrings and
        # fails outright on cells that are not strings.
        if isinstance(raw, str):
            raw = json.loads(raw)
        if raw is None or (isinstance(raw, float) and raw != raw):
            return {}
        if isinstance(raw, dict):
            return raw
        raise TypeError(
            f"reviews_per_rating must hold JSON objects, got {type(raw).__name__}"
        )

    # Work on a copy so the caller's DataFrame is left untouched.
    df_copy = df.copy()

    parsed = [_parse_counts(raw) for raw in df_copy['reviews_per_rating']]

    # One new column per star rating; null or absent counts become 0.
    for star, column in star_columns.items():
        df_copy[column] = [counts.get(star) or 0 for counts in parsed]

    return df_copy.drop(columns=['reviews_per_rating'])

In [78]:
def impute_and_pca(df, n_components=3):
    """Impute missing values, min-max scale the feature columns and reduce them with PCA.

    Steps, in order:
      1. Replace every ``'Not Present'`` cell in ``df`` with NaN.
      2. Cast ``average_hour``, ``std_hour`` and ``avg_popularity`` to float and
         fill their NaNs with ``KNNImputer(n_neighbors=5)``.
      3. Min-max scale the eleven feature columns listed in ``eda_cols``.
      4. Fit a PCA with ``n_components`` components on the scaled features.

    NOTE: steps 1-3 modify ``df`` in place (the replaced/imputed/scaled values
    are written back into the caller's frame). Pass ``df.copy()`` if the
    original values are still needed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``place_id``, ``name`` and every column in ``eda_cols``.
    n_components : int, default 3
        Number of principal components to keep.

    Returns
    -------
    pandas.DataFrame
        ``place_id``, ``name`` and ``PC1`` ... ``PC<n_components>``, carrying
        the same index as ``df``.
    """
    # Columns whose missing values are imputed.
    cols_to_impute = ['average_hour', 'std_hour', 'avg_popularity']

    # Treat the 'Not Present' placeholder as a missing value.
    df.replace('Not Present', np.nan, inplace=True)

    # The imputer needs numeric input.
    df[cols_to_impute] = df[cols_to_impute].astype(float)

    # Fill the gaps from the 5 nearest neighbours, using only these columns.
    imputer = KNNImputer(n_neighbors=5)
    df[cols_to_impute] = imputer.fit_transform(df[cols_to_impute])

    # Columns fed into the PCA.
    eda_cols = [
        'rating', 'reviews_one_star', 'reviews_two_star',
        'reviews_three_star', 'reviews_four_star',
        'reviews_five_star', 'average_hour', 'std_hour',
        'avg_popularity', 'nearest_competitor_distance', 'nearest_competitor_rating',
    ]

    scaler = MinMaxScaler()
    pca = PCA(n_components=n_components)

    # Scale, write the scaled values back to df, then project onto the components.
    normalized_matrix = scaler.fit_transform(df[eda_cols].values)
    df[eda_cols] = normalized_matrix
    principal_components = pca.fit_transform(df[eda_cols])

    # Reuse df's index: concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex would misalign rows (and introduce NaNs) whenever df has been
    # filtered, sampled or otherwise carries a non-default index.
    pca_columns = [f'PC{i+1}' for i in range(n_components)]
    matrix_df = pd.DataFrame(
        data=principal_components, columns=pca_columns, index=df.index
    )

    # Attach the identifying columns to the component scores.
    pca_df = pd.concat([df[['place_id', 'name']], matrix_df], axis=1)

    return pca_df


# K-means Clustering

In [79]:
def perform_clustering(df, clusters_n=3, iteration_n=100, feature_cols=None, seed=None):
    """Cluster the rows of ``df`` with K-means (Lloyd's algorithm) on TensorFlow ops.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns. A ``cluster`` column is added to it
        in place.
    clusters_n : int, default 3
        Number of clusters; must be between 1 and the number of rows.
    iteration_n : int, default 100
        Maximum number of assign/update rounds. The loop stops earlier once an
        update leaves every centroid unchanged.
    feature_cols : list of str, optional
        Columns used as coordinates. Defaults to ``['PC1', 'PC2', 'PC3']``.
    seed : int, optional
        Seed for choosing the initial centroids. ``None`` gives a different
        initialisation on every call.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the integer ``cluster`` column set.

    Raises
    ------
    ValueError
        If ``clusters_n`` is smaller than 1 or larger than the number of rows.
    """
    # Columns used for clustering.
    if feature_cols is None:
        feature_cols = ['PC1', 'PC2', 'PC3']

    # Data points, one row per observation.
    points = df[feature_cols].to_numpy(dtype='float64')
    n_points = points.shape[0]
    if clusters_n < 1 or clusters_n > n_points:
        raise ValueError(
            f"clusters_n must be between 1 and the number of rows ({n_points}), "
            f"got {clusters_n}"
        )

    def assign_to_nearest(centroids):
        # Broadcast to (clusters, points, dims) and pick the closest centroid
        # for each point by squared Euclidean distance.
        points_expanded = tf.expand_dims(points, 0)
        centroids_expanded = tf.expand_dims(centroids, 1)
        distances = tf.reduce_sum(tf.square(points_expanded - centroids_expanded), axis=2)
        return tf.argmin(distances, axis=0)

    def update_centroids(assignments, centroids):
        means = []
        for c in range(clusters_n):
            member_idx = tf.reshape(tf.where(tf.equal(assignments, c)), [-1])
            if int(tf.size(member_idx)) == 0:
                # The mean of an empty set is NaN, which would poison every
                # later distance computation; keep the previous centroid.
                means.append(centroids[c])
            else:
                means.append(tf.reduce_mean(tf.gather(points, member_idx), axis=0))
        return tf.stack(means)

    # Initial centroids: clusters_n distinct rows drawn at random from the data.
    rng = np.random.default_rng(seed)
    init_idx = rng.choice(n_points, size=clusters_n, replace=False)
    centroids = tf.Variable(points[init_idx])

    # K-means loop.
    assignments = None
    for _ in range(iteration_n):
        assignments = assign_to_nearest(centroids)
        new_centroids = update_centroids(assignments, centroids)

        # Unchanged centroids mean further rounds would repeat this assignment.
        converged = np.array_equal(new_centroids.numpy(), centroids.numpy())
        centroids.assign(new_centroids)
        if converged:
            break

    if assignments is None:
        # iteration_n < 1: label the points against the initial centroids.
        assignments = assign_to_nearest(centroids)

    # Store the cluster labels on the input frame.
    df['cluster'] = assignments.numpy()

    return df


In [80]:
# df = pd.read_csv('./full_data.csv', nrows=100)
# df.shape

(100, 21)

In [81]:
# df_copy = split_reviews_per_rating(df)
# df_pca = impute_and_pca(df_copy)
# perform_clustering(df_pca)

Unnamed: 0,place_id,name,PC1,PC2,PC3,cluster
0,ChIJd0EvQ4_zaS4ROHLtujX-hmE,Bebek Kaleyo Tebet,1.603264,-0.152479,-0.036636,2
1,ChIJEwwrrXH3aS4RcBo0XDRfOnc,McDonald's Puri Kembangan,1.240783,0.791657,0.306676,2
2,ChIJC_lABffzaS4RZGzb2lD-iAw,Setiabudi One,0.826325,0.262895,-0.113414,2
3,ChIJwZMcGHXxaS4RKdlMwHZXitw,KFC Gunawarman,0.794821,0.613140,0.045217,2
4,ChIJp5VZTgT0aS4R26oBEPT4nUw,Warung MJS,0.628640,0.230313,-0.244146,2
...,...,...,...,...,...,...
95,ChIJqx7pL-L1aS4R95uqPW7IjcQ,NORU Rooftop Lounge Jakarta,-0.249412,-0.052334,-0.285920,0
96,ChIJmSM4FHH3aS4Rlx6b8sEdYvw,KFC,-0.375181,0.123148,-0.235661,0
97,ChIJcS82X8LxaS4RwYBPv9h5RU8,Bamsae bamsae,-0.253617,-0.175538,-0.045787,0
98,ChIJvVPyAonxaS4RO_IwkdRSeaY,Tuang Coffee,-0.432757,-0.167622,0.256594,1
