# KNN

In [25]:
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.neighbors import NearestNeighbors

from config import cfg

## Data Preprocessing

In [26]:
df = pd.read_csv(cfg.TRAINING_DATA_PATH['KNN'])
df = df[:10000]
df.head()

Unnamed: 0,id,name,latitude,longitude,address,city,state,zip,country,url,phone,categories,point_of_interest
0,E_000001272c6c5d,Café Stad Oudenaarde,50.859975,3.634196,Abdijstraat,Nederename,Oost-Vlaanderen,9700.0,BE,,,Bars,P_677e840bb6fc7e
1,E_000002eae2a589,Carioca Manero,-22.907225,-43.178244,,,,,BR,,,Brazilian Restaurants,P_d82910d8382a83
2,E_000007f24ebc95,ร้านตัดผมการาเกด,13.780813,100.4849,,,,,TH,,,Salons / Barbershops,P_b1066599e78477
3,E_000008a8ba4f48,Turkcell,37.84451,27.844202,Adnan Menderes Bulvarı,,,,TR,,,Mobile Phone Shops,P_b2ed86905a4cd3
4,E_00001d92066153,Restaurante Casa Cofiño,43.338196,-4.326821,,Caviedes,Cantabria,,ES,,,Spanish Restaurants,P_809a884d4407fb


In [27]:
df = df[['id', 'country', 'latitude', 'longitude']]
print(df.head())
print(len(df))

                 id country   latitude   longitude
0  E_000001272c6c5d      BE  50.859975    3.634196
1  E_000002eae2a589      BR -22.907225  -43.178244
2  E_000007f24ebc95      TH  13.780813  100.484900
3  E_000008a8ba4f48      TR  37.844510   27.844202
4  E_00001d92066153      ES  43.338196   -4.326821
10000


## Define KNN Model
- country 的正確率很高，因此將相同 POI 的必要條件視為相同 country
- haversine: 測地線距離

In [28]:
def country_match_knn(df, num_neighbors=10):
    print('Start KNN grouped by country')
    train_df_country = []
    for country, country_df in tqdm(df.groupby('country')):
        country_df = country_df.reset_index(drop=True)

        num_neighbors = min(len(country_df), num_neighbors)
        knn = NearestNeighbors(n_neighbors=num_neighbors,
                               metric='haversine',
                               n_jobs=-1)
        knn.fit(country_df[['latitude', 'longitude']], country_df.index)
        dists, nears = knn.kneighbors(country_df[['latitude', 'longitude']], return_distance=True)

        for k in range(num_neighbors):
            cur_df = country_df[['id']]
            cur_df = cur_df.assign(match_id=country_df['id'].values[nears[:, k]],
                          kdist_country=dists[:, k],
                          kneighbors_country=k)

            train_df_country.append(cur_df)
    train_df_country = pd.concat(train_df_country)
    print(train_df_country[train_df_country['kneighbors_country']==1])
    return train_df_country

In [29]:
knn_df = country_match_knn(df)

Start KNN grouped by country


100%|██████████| 125/125 [00:00<00:00, 142.27it/s]

                  id          match_id  kdist_country  kneighbors_country
0   E_0013149ad8ed18  E_00c1216c65c1a5       0.065759                   1
1   E_0019ce8998b3a8  E_013fdbb05c8107       0.020376                   1
2   E_002e4c4793fbc7  E_01b35767eb9bc5       0.110124                   1
3   E_0030de94b528e7  E_0013149ad8ed18       0.079380                   1
4   E_0031c764c706c6  E_0176cc3cf6c966       0.016267                   1
5   E_005661be0c9b52  E_012d7bb6a3a554       0.001173                   1
6   E_0069f71ef3f0e7  E_0117b75aff7fcb       0.611016                   1
7   E_006cc2b2a1ce9a  E_01de2a08b62351       0.028557                   1
8   E_006d916aa271d3  E_02451350cead62       0.158503                   1
9   E_0076e00a9cf084  E_01729db6614527       0.014613                   1
10  E_0091956d4df68f  E_013fdbb05c8107       0.002776                   1
11  E_00a55713b33416  E_01b7aec94b03ef       0.002494                   1
12  E_00c1216c65c1a5  E_0237f1ca3ed586




## Build Decision Tree features

In [30]:
knn_df.head()

Unnamed: 0,id,match_id,kdist_country,kneighbors_country
0,E_0013149ad8ed18,E_0013149ad8ed18,0.0,0
1,E_0019ce8998b3a8,E_0019ce8998b3a8,0.0,0
2,E_002e4c4793fbc7,E_002e4c4793fbc7,0.0,0
3,E_0030de94b528e7,E_0030de94b528e7,0.0,0
4,E_0031c764c706c6,E_0031c764c706c6,0.0,0


In [34]:
df = pd.read_csv(cfg.TRAINING_DATA_PATH['KNN'])
df = df[:10000]
df = df[['id', 'name', 'categories', 'city', 'state', 'zip']]
df.head()

Unnamed: 0,id,name,categories,city,state,zip
0,E_000001272c6c5d,Café Stad Oudenaarde,Bars,Nederename,Oost-Vlaanderen,9700.0
1,E_000002eae2a589,Carioca Manero,Brazilian Restaurants,,,
2,E_000007f24ebc95,ร้านตัดผมการาเกด,Salons / Barbershops,,,
3,E_000008a8ba4f48,Turkcell,Mobile Phone Shops,,,
4,E_00001d92066153,Restaurante Casa Cofiño,Spanish Restaurants,Caviedes,Cantabria,


In [35]:
df['matches'] = df['id'].apply(lambda x: list(knn_df[knn_df['id'] == x]['match_id']))
df['dists'] = df['id'].apply(lambda x: list(knn_df[knn_df['id'] == x]['kdist_country']))
df[df['matches'].map(len) > 1]

Unnamed: 0,id,name,categories,city,state,zip,matches,dists
320,E_0013149ad8ed18,جامع الكرامة,Mosques,Shakhbout,,,"[E_0013149ad8ed18, E_00c1216c65c1a5, E_0030de9...","[0.0, 0.06575899037071478, 0.07938048081969103..."
408,E_0019ce8998b3a8,Cinnabon سينابون,"Bakeries, Donut Shops",Dubai,,,"[E_0019ce8998b3a8, E_013fdbb05c8107, E_0091956...","[0.0, 0.02037580903714799, 0.02149892883556881..."
762,E_002e4c4793fbc7,S.P. Warehousing,"Real Estate Offices, Warehouse Stores",Dubai,,,"[E_002e4c4793fbc7, E_01b35767eb9bc5, E_00e4afe...","[0.0, 0.1101243284578086, 0.11118940202623383,..."
794,E_0030de94b528e7,Al Derwaza,Middle Eastern Restaurants,Abu Dhabi,Abu Dhabi,,"[E_0030de94b528e7, E_0013149ad8ed18, E_01de2a0...","[0.0, 0.07938048081969103, 0.10366591128340158..."
808,E_0031c764c706c6,Bowling Marina Mall,Bowling Alleys,,,,"[E_0031c764c706c6, E_0176cc3cf6c966, E_022f45f...","[0.0, 0.016267134943035843, 0.0288063339731320..."
1450,E_005661be0c9b52,Costa festival city,,Dubai,Dubai,,"[E_005661be0c9b52, E_012d7bb6a3a554, E_01d948f...","[0.0, 0.0011728014637258227, 0.002539126390733..."
1757,E_0069f71ef3f0e7,Costa Coffee,Coffee Shops,Ras al-Khaimah,Ra’s al Khaymah,,"[E_0069f71ef3f0e7, E_0117b75aff7fcb, E_02383bd...","[0.0, 0.6110163016564931, 0.6644024436819523, ..."
1803,E_006cc2b2a1ce9a,Anantara Hotel,Hotels,,,,"[E_006cc2b2a1ce9a, E_01de2a08b62351, E_0031c76...","[0.0, 0.028557175210048108, 0.0726438672189792..."
1823,E_006d916aa271d3,بالشارع,Buildings,Sharjah,,,"[E_006d916aa271d3, E_02451350cead62, E_0102160...","[0.0, 0.1585029171825895, 0.17328351943459508,..."
2001,E_0076e00a9cf084,SEGA Republic,"Arcades, Toy / Game Stores, General Entertainment",Downtown Dubai,Dubai,,"[E_0076e00a9cf084, E_01729db6614527, E_01e41bc...","[0.0, 0.014613009494782, 0.026275538078677923,..."
