In [None]:
! pip install haversine

# K-Means를 위한 데이터 생성

In [None]:
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore

# Initialise the default Firebase app only when no app is registered yet, so
# re-running this cell does not call initialize_app() a second time.
if not firebase_admin._apps:
    # Service-account key file, given as a bare file name (resolved relative to the working directory).
    cred = credentials.Certificate('petbang-9b8f3-firebase-adminsdk-a7i2o-3cfeaa8688.json') 
    default_app = firebase_admin.initialize_app(cred)

# Firestore client used by the cells below.
db = firestore.client()

In [None]:
from haversine import haversine

# 위경도 입력

def extract_xy(docs):
  """Split documents into parallel latitude, longitude and address lists.

  Each item of ``docs`` must expose ``to_dict()`` returning a mapping with the
  keys '위도' and '경도' (both converted with ``float``) and '주소' (kept as-is).

  Returns:
    A tuple ``(x, y, address)`` of three lists in document order, where ``x``
    holds the '위도' values and ``y`` holds the '경도' values.
  """
  rows = []
  for snapshot in docs:
    fields = snapshot.to_dict()
    rows.append((float(fields['위도']), float(fields['경도']), fields['주소']))

  x = [row[0] for row in rows]
  y = [row[1] for row in rows]
  address = [row[2] for row in rows]
  return x, y, address

In [None]:
def cal_dist(place, dt_anm, apt_x, apt_y, radius_km=1):
  """Summarise, per apartment, the facilities of one kind within a radius.

  Args:
    place: Value matched against the `종류` field of the facility collection.
    dt_anm: Collection-like object supporting ``where(...).stream()``; the
      streamed documents are handed to ``extract_xy``.
    apt_x: Apartment '위도' values (first component of each haversine point).
    apt_y: Apartment '경도' values, parallel to ``apt_x``.
    radius_km: Inclusive search radius in kilometres. Defaults to 1, the
      value that used to be hard-coded.

  Returns:
    ``(aver_dist, anm_num)``: two lists with one entry per apartment.
    ``aver_dist[i]`` is the mean haversine distance (km) from apartment ``i``
    to the facilities within the radius and ``anm_num[i]`` is how many there
    are. An apartment with no facility in range gets ``0`` in both lists.
  """
  data_anm = dt_anm.where(u"`종류`", u'==', f'{place}').stream()
  anm_x, anm_y, _ = extract_xy(data_anm)
  # Build the facility points once instead of once per apartment.
  anm_locations = list(zip(anm_x, anm_y))

  aver_dist = []
  anm_num = []
  for apt_location in zip(apt_x, apt_y):
    nearby = []
    for anm_location in anm_locations:
      distance = haversine(apt_location, anm_location, unit='km')
      if distance <= radius_km:
        nearby.append(distance)

    anm_num.append(len(nearby))
    # 0 marks "nothing in range"; cal_score treats a 0 distance as no contribution.
    aver_dist.append(sum(nearby) / len(nearby) if nearby else 0)

  return aver_dist, anm_num

In [None]:
def cal_score(first, second, third):
  """Combine three parallel distance lists into one weighted score list.

  For every index of ``first`` the reciprocal of each value is taken, where a
  value of 0 contributes 0 instead of being inverted. The reciprocals are
  weighted 0.5 / 0.3 / 0.2 for ``first`` / ``second`` / ``third`` and summed.

  Returns:
    A list with one score per element of ``first``.
  """
  def reciprocal_or_zero(value):
    # A 0 entry contributes nothing rather than dividing by zero.
    return 0 if value == 0 else 1/value

  return [
      reciprocal_or_zero(first[idx]) * 0.5
      + reciprocal_or_zero(second[idx]) * 0.3
      + reciprocal_or_zero(third[idx]) * 0.2
      for idx in range(len(first))
  ]

In [None]:
# pandas builds the DataFrame at the end of this cell. It is imported here
# because the notebook's main import cell comes later, so on a fresh
# top-to-bottom run `pd` would otherwise be undefined at this point.
import pandas as pd

data_apt = db.collection(u'data_apt').stream()

apt_x, apt_y, apt_address = extract_xy(data_apt)
data_anm = db.collection(u'data_anm')
user_data = db.collection(u'user').stream()

# Read the three ranked facility preferences. Every user document overwrites
# the previous one, so the values of the last streamed user are used below.
docs = None
for doc in user_data:
  docs = doc.to_dict()
  first_place = docs['first']
  second_place = docs['second']
  third_place = docs['third']

if docs is None:
  # Fail with a clear message instead of a NameError on first_place below.
  raise ValueError("The 'user' collection returned no documents; no preferences to score.")

first_aver_dist, first_anm_num = cal_dist(first_place, data_anm, apt_x, apt_y)
second_aver_dist, second_anm_num = cal_dist(second_place, data_anm, apt_x, apt_y)
third_aver_dist, third_anm_num = cal_dist(third_place, data_anm, apt_x, apt_y)

# A higher score means a better listing.
final_score = cal_score(first_aver_dist, second_aver_dist, third_aver_dist)

final_df = pd.DataFrame({
    '주소': apt_address,
    '첫 번째 요소 평균 거리(km)' : first_aver_dist,
    '첫 번째 요소 개수' : first_anm_num,
    '두 번째 요소 평균 거리(km)' : second_aver_dist,
    '두 번째 요소 개수' : second_anm_num,
    '세 번째 요소 평균 거리(km)' : third_aver_dist,
    '세 번째 요소 개수' : third_anm_num,
    '최종 점수' : final_score,
})

# K-means 점수 계산

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
from skimage import io
import time
import seaborn as sns
from sklearn.cluster import KMeans as KMeans_

# from k_means import KMeans

%matplotlib inline

import warnings

warnings.filterwarnings('ignore')

In [None]:
# Feature matrix for clustering: the '최종 점수' column selected twice,
# i.e. two identical features.
X = final_df[['최종 점수', '최종 점수']]

plt.close()
# The bare 'seaborn' style name was deprecated in matplotlib 3.6 and later
# removed; use whichever spelling this matplotlib installation provides.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')
plt.scatter(X, X)
plt.xlabel('최종 점수')
plt.ylabel('최종 점수')

In [None]:
# 최적의 K를 찾기 위한 방법 1 : Elbow Method

def visualize_elbowmethod(data, param_init='random', param_n_init=10, param_max_iter=300):
    """Plot K-Means inertia for k = 1..9 so the elbow can be read off the curve.

    Args:
        data: Samples passed unchanged to ``KMeans_.fit``.
        param_init: Forwarded as ``init``.
        param_n_init: Forwarded as ``n_init``.
        param_max_iter: Forwarded as ``max_iter``.

    Every model is fitted with ``random_state=0``. The function shows the
    figure with ``plt.show()`` and returns nothing.
    """
    cluster_counts = range(1, 10)

    def inertia_for(k):
        # Fit one model and report its within-cluster sum of squares.
        model = KMeans_(n_clusters=k, init=param_init, n_init=param_n_init, max_iter=param_max_iter, random_state=0)
        model.fit(data)
        return model.inertia_

    distortions = [inertia_for(k) for k in cluster_counts]

    plt.plot(cluster_counts, distortions, marker='o')
    plt.xlabel('Number of Cluster')
    plt.ylabel('Distortion')
    plt.show()

# Draw the elbow curve for the feature matrix X built in an earlier cell.
visualize_elbowmethod(X)

In [None]:
# 최적의 K를 찾기 위한 방법 2-1 : Silhouette Score

from sklearn.metrics import silhouette_score

def visualize_silhouette_layer(data, param_init='random', param_n_init=10, param_max_iter=300):
    """Show the mean silhouette score of K-Means for k = 2..14 as a heatmap.

    Args:
        data: Samples passed unchanged to ``KMeans_.fit_predict`` and
            ``silhouette_score``.
        param_init: Forwarded as ``init``.
        param_n_init: Forwarded as ``n_init``.
        param_max_iter: Forwarded as ``max_iter``.

    Every model is fitted with ``random_state=0``. The function shows the
    figure with ``plt.show()`` and returns nothing.
    """
    clusters_range = range(2, 15)
    results = []

    for n_clusters in clusters_range:
        clusterer = KMeans_(n_clusters=n_clusters, init=param_init, n_init=param_n_init, max_iter=param_max_iter, random_state=0)
        cluster_labels = clusterer.fit_predict(data)
        results.append([n_clusters, silhouette_score(data, cluster_labels)])

    result = pd.DataFrame(results, columns=["n_clusters", "silhouette_score"])
    # There is exactly one row per k, so indexing by n_clusters already gives
    # the single-column table the heatmap needs (no aggregation required).
    scores_by_k = result.set_index("n_clusters")

    plt.figure()
    # "rocket" is the public name of the colormap built from seaborn's private
    # `_rocket_lut` table, so the colours are unchanged.
    sns.heatmap(scores_by_k, annot=True, linewidths=.5, fmt='.3f', cmap="rocket")
    plt.tight_layout()
    plt.show()

# Draw the silhouette-score heatmap for the feature matrix X built in an earlier cell.
visualize_silhouette_layer(X)

In [None]:
# 최적의 K를 찾기 위한 방법 2-2 : Silhouette Score 변형

from sklearn.metrics import silhouette_score

def calculate_silhouette_layer(data, param_init='random', param_n_init=10, param_max_iter=300):
    """Return the k in 2..14 with the best mean silhouette score, and that score.

    Args:
        data: Samples passed unchanged to ``KMeans_.fit_predict`` and
            ``silhouette_score``.
        param_init: Forwarded as ``init``.
        param_n_init: Forwarded as ``n_init``.
        param_max_iter: Forwarded as ``max_iter``.

    Returns:
        ``(n, score)`` where ``n`` is the winning number of clusters as an
        ``int`` and ``score`` is its silhouette score. On a tie the smallest
        ``n`` wins.
    """
    clusters_range = range(2, 15)
    results = []

    for n_clusters in clusters_range:
        clusterer = KMeans_(n_clusters=n_clusters, init=param_init, n_init=param_n_init, max_iter=param_max_iter, random_state=0)
        cluster_labels = clusterer.fit_predict(data)
        results.append((n_clusters, silhouette_score(data, cluster_labels)))

    # max() keeps the first maximal entry, so ties resolve to the smallest k.
    # Taking the pair directly also keeps n an int; reading it back from a
    # mixed int/float DataFrame row would upcast it to a float.
    n, score = max(results, key=lambda pair: pair[1])
    return n, score


check_list = ['동물병원', '동물약국', '산책로', '동물미용', '동물호텔', '동물카페']

from itertools import permutations
from collections import Counter

# The distances depend only on the facility type, so query and compute each of
# the six types once instead of three times for each of the 120 permutations.
dist_by_place = {place: cal_dist(place, data_anm, apt_x, apt_y)[0] for place in check_list}

check = list(permutations(check_list, 3))
counting = []  # best number of clusters for every permutation
dist = []      # matching best silhouette score for every permutation
for first_check, second_check, third_check in check:
  score = cal_score(dist_by_place[first_check], dist_by_place[second_check], dist_by_place[third_check])
  # Loop-local names on purpose: assigning to final_df / X here would overwrite
  # the user-preference table that the later cells cluster and plot.
  perm_df = pd.DataFrame({'최종 점수' : score})
  perm_X = perm_df[['최종 점수', '최종 점수']]
  n, result = calculate_silhouette_layer(perm_X)
  counting.append(n)
  dist.append(result)

# How often each cluster count was the best one across all permutations.
cnt = Counter(counting)
cnt

In [None]:
# Fit the final model; n_clusters is where k is chosen. random_state=0 matches
# the other K-Means fits in this notebook and makes the labels reproducible.
kmeans = KMeans_(n_clusters = 3, max_iter = 500, tol = 0.001, n_init = 100, random_state = 0).fit(X)

# NOTE(review): 'Rank' stores the raw cluster labels; confirm whether they are
# meant to be ordered by score before treating them as a ranking.
final_df['Rank'] = kmeans.labels_

# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so the positional form raises a TypeError there.
sns.lmplot(x='최종 점수', y='최종 점수', data=final_df, fit_reg = False, scatter_kws = {"s" : 15}, hue = "Rank")
plt.title('Ranking')