In [1]:
## Imports
import warnings
warnings.filterwarnings('ignore')

import os
import gc
import time
import random
import Levenshtein
import difflib
import multiprocessing
import pandas as pd
import numpy as np
import lightgbm as lgb
import pickle
from unidecode import unidecode

from tqdm.auto import tqdm
from sklearn.model_selection import GroupKFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [2]:
## Parameters
is_debug = False
SEED = 2626
num_neighbors = 10
num_split = 5
feat_columns = ['name', 'address', 'city', 
            'state', 'zip', 'url', 
           'phone', 'categories', 'country']
vec_columns = ['name', 'categories', 'address', 
               'state', 'url', 'country']

def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    
seed_everything(SEED)

In [3]:
%load_ext Cython

In [4]:
%%cython
def LCS(str S, str T):
    cdef int i, j
    cdef list dp = [[0] * (len(T) + 1) for _ in range(len(S) + 1)]
    for i in range(len(S)):
        for j in range(len(T)):
            dp[i + 1][j + 1] = max(dp[i][j] + (S[i] == T[j]), dp[i + 1][j], dp[i][j + 1], dp[i + 1][j + 1])
    return dp[len(S)][len(T)]

In [5]:
def recall_knn(df, Neighbors = 10):
    print('Start knn grouped by country')
    train_df_country = []
    for country, country_df in tqdm(df.groupby('country')):
        country_df = country_df.reset_index(drop = True)

        neighbors = min(len(country_df), Neighbors)
        knn = KNeighborsRegressor(n_neighbors = neighbors,
                                    metric = 'haversine',
                                    n_jobs = -1)
        knn.fit(country_df[['latitude','longitude']], country_df.index)
        dists, nears = knn.kneighbors(country_df[['latitude', 'longitude']], 
                                        return_distance = True)

        for k in range(1, neighbors):            
            cur_df = country_df[['id']]
            cur_df['match_id'] = country_df['id'].values[nears[:, k]]
            cur_df['kdist_country'] = dists[:, k]
            cur_df['kneighbors_country'] = k
            
            train_df_country.append(cur_df)
    train_df_country = pd.concat(train_df_country)
    
    print('Start knn')
    train_df = []
    knn = NearestNeighbors(n_neighbors = Neighbors)
    knn.fit(df[['latitude','longitude']], df.index)
    dists, nears = knn.kneighbors(df[['latitude','longitude']])
    
    for k in range(1, Neighbors):            
        cur_df = df[['id']]
        cur_df['match_id'] = df['id'].values[nears[:, k]]
        cur_df['kdist'] = dists[:, k]
        cur_df['kneighbors'] = k
        train_df.append(cur_df)
    
    train_df = pd.concat(train_df)
    train_df = train_df.merge(train_df_country,
                                 on = ['id', 'match_id'],
                                 how = 'outer')
    del train_df_country
    
    return train_df

In [6]:
def unidecode_w_sort(s):
    if s == NAN_STR:
        return NAN_STR
    s = unidecode(s)
    s = ' '.join(sorted(s.strip().split(' '))).lower()
    return s


# def process_name_address(data, col):
#     data['name'] = data['name'].astype(str).apply(unidecode_w_sort)
#     return data


def add_features(df):
    for col in tqdm(feat_columns):   
        if col in vec_columns:
            tv_fit = tfidf_d[col]
            indexs = [id2index_d[i] for i in df['id']]
            match_indexs = [id2index_d[i] for i in df['match_id']]                    
            df[f'{col}_sim'] = tv_fit[indexs].multiply(tv_fit[match_indexs]).sum(axis = 1).A.ravel()
        
        col_values = data.loc[df['id']][col].values.astype(str)
        matcol_values = data.loc[df['match_id']][col].values.astype(str)
        
        geshs = []
        levens = []
        jaros = []
        lcss = []
        for s, match_s in zip(col_values, matcol_values):
            if s != NAN_STR and match_s != NAN_STR:                    
                geshs.append(difflib.SequenceMatcher(None, s, match_s).ratio())
                levens.append(Levenshtein.distance(s, match_s))
                jaros.append(Levenshtein.jaro_winkler(s, match_s))
                lcss.append(LCS(str(s), str(match_s)))
            else:
                geshs.append(np.nan)
                levens.append(np.nan)
                jaros.append(np.nan)
                lcss.append(np.nan)
        
        df[f'{col}_gesh'] = geshs
        df[f'{col}_leven'] = levens
        df[f'{col}_jaro'] = jaros
        df[f'{col}_lcs'] = lcss
        
        if col not in ['phone', 'zip']:
            df[f'{col}_len'] = list(map(len, col_values))
            df[f'match_{col}_len'] = list(map(len, matcol_values)) 
            df[f'{col}_len_diff'] = np.abs(df[f'{col}_len'] - df[f'match_{col}_len'])
            df[f'{col}_nleven'] = df[f'{col}_leven'] / \
                                    df[[f'{col}_len', f'match_{col}_len']].max(axis = 1)
            
            df[f'{col}_nlcsk'] = df[f'{col}_lcs'] / df[f'match_{col}_len']
            df[f'{col}_nlcs'] = df[f'{col}_lcs'] / df[f'{col}_len']
            
            df = df.drop(f'{col}_len', axis = 1)
            df = df.drop(f'match_{col}_len', axis = 1)
            gc.collect()
    return df

In [7]:
def get_id2poi(input_df: pd.DataFrame) -> dict:
    return dict(zip(input_df['id'], input_df['point_of_interest']))

def get_poi2ids(input_df: pd.DataFrame) -> dict:
    return input_df.groupby('point_of_interest')['id'].apply(set).to_dict()

def get_score(input_df: pd.DataFrame):
    scores = []
    for id_str, matches in zip(input_df['id'].to_numpy(), input_df['matches'].to_numpy()):
        targets = poi2ids[id2poi[id_str]]
        preds = set(matches.split())
        score = len((targets & preds)) / len((targets | preds))
        scores.append(score)
    scores = np.array(scores)
    return scores.mean()

def analysis(df):
    print('Num of data: %s' % len(df))
    print('Num of unique id: %s' % df['id'].nunique())
    print('Num of unique poi: %s' % df['point_of_interest'].nunique())
    
    poi_grouped = df.groupby('point_of_interest')['id'].count().reset_index()
    print('Mean num of unique poi: %s' % poi_grouped['id'].mean())

In [8]:
## Data load
NAN_STR = ''

data_root = '../input/foursquare-location-matching'
data = pd.read_csv(os.path.join(data_root, 'train.csv')).fillna(NAN_STR)

if is_debug:
    data = data.sample(n = 10000, random_state = SEED)
    data = data.reset_index(drop = True)

# normalize string
for col in ['name', 'address', 'city', 'state', 'country', 'categories']:
    data[col] = data[col].astype(str).apply(unidecode_w_sort)

In [9]:
data.head(5)

Unnamed: 0,id,name,latitude,longitude,address,city,state,zip,country,url,phone,categories,point_of_interest
0,E_000001272c6c5d,cafe oudenaarde stad,50.859975,3.634196,abdijstraat,nederename,oost-vlaanderen,9700.0,be,,,bars,P_677e840bb6fc7e
1,E_000002eae2a589,carioca manero,-22.907225,-43.178244,,,,,br,,,brazilian restaurants,P_d82910d8382a83
2,E_000007f24ebc95,raantadphmkaaraaekd,13.780813,100.4849,,,,,th,,,/ barbershops salons,P_b1066599e78477
3,E_000008a8ba4f48,turkcell,37.84451,27.844202,adnan bulvari menderes,,,,tr,,,mobile phone shops,P_b2ed86905a4cd3
4,E_00001d92066153,casa cofino restaurante,43.338196,-4.326821,,caviedes,cantabria,,es,,,restaurants spanish,P_809a884d4407fb


In [10]:
## Data split
kf = GroupKFold(n_splits=2)
for i, (trn_idx, val_idx) in enumerate(kf.split(data, 
                                                data['point_of_interest'], 
                                                data['point_of_interest'])):
    data.loc[val_idx, 'set'] = i

print('Num of train data: %s' % len(data))
print(data['set'].value_counts())

valid_data = data[data['set'] == 0]
train_data = data[data['set'] == 1]

print('Train data: ')
analysis(train_data)
print('Valid data: ')
analysis(valid_data)

train_poi = train_data['point_of_interest'].unique().tolist()
valid_poi = valid_data['point_of_interest'].unique().tolist()

print(set(train_poi) & set(valid_poi))

train_ids = train_data['id'].unique().tolist()
valid_ids = valid_data['id'].unique().tolist()
      
print(set(train_ids) & set(valid_ids))
      
tv_ids_d = {}
tv_ids_d['train_ids'] = train_ids
tv_ids_d['valid_ids'] = valid_ids

np.save('tv_ids_d.npy', tv_ids_d)

del train_data, valid_data
gc.collect()

data = data.set_index('id')
data = data.loc[tv_ids_d['train_ids']]
data = data.reset_index()

Num of train data: 1138812
1.0    569406
0.0    569406
Name: set, dtype: int64
Train data: 
Num of data: 569406
Num of unique id: 569406
Num of unique poi: 369987
Mean num of unique poi: 1.5389892077289202
Valid data: 
Num of data: 569406
Num of unique id: 569406
Num of unique poi: 369985
Mean num of unique poi: 1.5389975269267673
set()
set()


In [11]:
## Train data generated by knn
id2index_d = dict(zip(data['id'].values, data.index))

tfidf_d = {}
for col in vec_columns:
    tfidf = TfidfVectorizer()
    tv_fit = tfidf.fit_transform(data[col].fillna('nan'))
    tfidf_d[col] = tv_fit

train_data = recall_knn(data, num_neighbors)

data = data.set_index('id')
ids = train_data['id'].tolist()
match_ids = train_data['match_id'].tolist()

poi = data.loc[ids]['point_of_interest'].values
match_poi = data.loc[match_ids]['point_of_interest'].values

train_data['label'] = np.array(poi == match_poi, dtype = np.int8)
del poi, match_poi, ids, match_ids
gc.collect()

print('Num of unique id: %s' % train_data['id'].nunique())
print('Num of train data: %s' % len(train_data))
print('Pos rate: %s' % train_data['label'].mean())
print(train_data.sample(5))

Start knn grouped by country


  0%|          | 0/211 [00:00<?, ?it/s]

Start knn
Num of unique id: 569406
Num of train data: 6251522
Pos rate: 0.061014261806964766
                       id          match_id     kdist  kneighbors  \
5802446  E_6a7c23052451ff  E_f7c329aeb49b86       NaN         NaN   
1643275  E_e2a5a8c16dee32  E_b2caf917915008  0.007868         3.0   
347803   E_9c183263310032  E_38e50595775eab  0.000140         1.0   
5749003  E_8709207737efbd  E_4a2869143c6ec8       NaN         NaN   
1282678  E_4093e864d49e37  E_dd9a414be6c470  0.000433         3.0   

         kdist_country  kneighbors_country  label  
5802446       0.003339                 8.0      0  
1643275       0.006031                 3.0      0  
347803        0.000049                 1.0      0  
5749003       0.002248                 4.0      0  
1282678       0.000390                 3.0      0  


In [12]:
## Eval
data = data.reset_index()

id2poi = get_id2poi(data)
poi2ids = get_poi2ids(data)

eval_df = pd.DataFrame()
eval_df['id'] = data['id'].unique().tolist()
eval_df['match_id'] = eval_df['id']
print('Unique id: %s' % len(eval_df))

eval_df_ = train_data[train_data['label'] == 1][['id', 'match_id']]
eval_df = pd.concat([eval_df, eval_df_])

eval_df = eval_df.groupby('id')['match_id'].\
                        apply(list).reset_index()
eval_df['matches'] = eval_df['match_id'].apply(lambda x: ' '.join(set(x)))
print('Unique id: %s' % len(eval_df))

iou_score = get_score(eval_df)
print('IoU score: %s' % iou_score)

Unique id: 569406
Unique id: 569406
IoU score: 0.9000914917865213


In [13]:
def process_with_pickle(df, save_path):
    df.drop(columns=['id', 'match_id'], inplace=True)
    label = df.pop('label')
    df.insert(0, 'label', label)
    df = df.values
    df = np.unique(df, axis=0)
    
    with open(save_path, 'wb') as f:
        pickle.dump(df, f, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
columns = []

## Add features
count = 0
start_row = 0

data = data.set_index('id')
unique_id = train_data['id'].unique().tolist()
num_split_id = len(unique_id) // num_split
for k in range(1, num_split + 1):
    print('Current split: %s' % k)
    end_row = start_row + num_split_id
    if k < num_split:
        cur_id = unique_id[start_row : end_row]
        cur_data = train_data[train_data['id'].isin(cur_id)]
    else:
        cur_id = unique_id[start_row: ]
        cur_data = train_data[train_data['id'].isin(cur_id)]
    
    cur_data = add_features(cur_data)
    cur_data['kdist_diff'] = (cur_data['kdist'] - cur_data['kdist_country']) /\
                                cur_data['kdist_country']
    cur_data['kneighbors_mean'] = cur_data[['kneighbors', 'kneighbors_country']].mean(axis = 1)
    print(cur_data.shape)
    print(cur_data.sample(1))
    
    columns = cur_data.columns
    process_with_pickle(cur_data, f'train_split{k}.pkl')
#     cur_data.to_csv('train_data%s.csv' % k, index = False)    
    start_row = end_row
    count += len(cur_data)
    
    del cur_data
    gc.collect()
    
print(count)

Current split: 1


  0%|          | 0/9 [00:00<?, ?it/s]

(1249379, 79)
                       id          match_id  kdist  kneighbors  kdist_country  \
6121634  E_29aaeefeb850b0  E_44699afd5f6299    NaN         NaN       0.008537   

         kneighbors_country  label  name_sim  name_gesh  name_leven  ...  \
6121634                 8.0      0       0.0   0.173913        10.0  ...   

         country_gesh  country_leven  country_jaro  country_lcs  \
6121634           1.0            0.0           1.0          2.0   

         country_len_diff  country_nleven  country_nlcsk  country_nlcs  \
6121634                 0             0.0            1.0           1.0   

         kdist_diff  kneighbors_mean  
6121634         NaN              8.0  

[1 rows x 79 columns]
Current split: 2


  0%|          | 0/9 [00:00<?, ?it/s]

(1250861, 79)
                       id          match_id     kdist  kneighbors  \
2967758  E_362e1016e7b7a6  E_9a809f1dd19d4a  0.004072         6.0   

         kdist_country  kneighbors_country  label  name_sim  name_gesh  \
2967758            NaN                 NaN      0  0.147526   0.115942   

         name_leven  ...  country_gesh  country_leven  country_jaro  \
2967758        36.0  ...           1.0            0.0           1.0   

         country_lcs  country_len_diff  country_nleven  country_nlcsk  \
2967758          2.0                 0             0.0            1.0   

         country_nlcs  kdist_diff  kneighbors_mean  
2967758           1.0         NaN              6.0  

[1 rows x 79 columns]
Current split: 3


  0%|          | 0/9 [00:00<?, ?it/s]

(1252660, 79)
                       id          match_id     kdist  kneighbors  \
1938125  E_672b8ae139c60b  E_4799ba4c05f986  0.004094         4.0   

         kdist_country  kneighbors_country  label  name_sim  name_gesh  \
1938125       0.001556                 9.0      0       0.0   0.235294   

         name_leven  ...  country_gesh  country_leven  country_jaro  \
1938125        19.0  ...           1.0            0.0           1.0   

         country_lcs  country_len_diff  country_nleven  country_nlcsk  \
1938125          2.0                 0             0.0            1.0   

         country_nlcs  kdist_diff  kneighbors_mean  
1938125           1.0    1.631348              6.5  

[1 rows x 79 columns]
Current split: 4


  0%|          | 0/9 [00:00<?, ?it/s]

(1249559, 79)
                       id          match_id  kdist  kneighbors  kdist_country  \
5388087  E_9c03f3f0f3da0d  E_05312c166da5f6    NaN         NaN       0.000266   

         kneighbors_country  label  name_sim  name_gesh  name_leven  ...  \
5388087                 7.0      0       0.0   0.275862        14.0  ...   

         country_gesh  country_leven  country_jaro  country_lcs  \
5388087           1.0              0           1.0            2   

         country_len_diff  country_nleven  country_nlcsk  country_nlcs  \
5388087                 0             0.0            1.0           1.0   

         kdist_diff  kneighbors_mean  
5388087         NaN              7.0  

[1 rows x 79 columns]
Current split: 5


  0%|          | 0/9 [00:00<?, ?it/s]

(1249063, 79)
                       id          match_id  kdist  kneighbors  kdist_country  \
5125310  E_fe1a31b5664ed4  E_dd78429f630d58    NaN         NaN       2.432267   

         kneighbors_country  label  name_sim  name_gesh  name_leven  ...  \
5125310                 9.0      0       0.0       0.16        15.0  ...   

         country_gesh  country_leven  country_jaro  country_lcs  \
5125310           1.0            0.0           1.0          2.0   

         country_len_diff  country_nleven  country_nlcsk  country_nlcs  \
5125310                 0             0.0            1.0           1.0   

         kdist_diff  kneighbors_mean  
5125310         NaN              9.0  

[1 rows x 79 columns]
6251522


In [15]:
list(columns)

['id',
 'match_id',
 'kdist',
 'kneighbors',
 'kdist_country',
 'kneighbors_country',
 'label',
 'name_sim',
 'name_gesh',
 'name_leven',
 'name_jaro',
 'name_lcs',
 'name_len_diff',
 'name_nleven',
 'name_nlcsk',
 'name_nlcs',
 'address_sim',
 'address_gesh',
 'address_leven',
 'address_jaro',
 'address_lcs',
 'address_len_diff',
 'address_nleven',
 'address_nlcsk',
 'address_nlcs',
 'city_gesh',
 'city_leven',
 'city_jaro',
 'city_lcs',
 'city_len_diff',
 'city_nleven',
 'city_nlcsk',
 'city_nlcs',
 'state_sim',
 'state_gesh',
 'state_leven',
 'state_jaro',
 'state_lcs',
 'state_len_diff',
 'state_nleven',
 'state_nlcsk',
 'state_nlcs',
 'zip_gesh',
 'zip_leven',
 'zip_jaro',
 'zip_lcs',
 'url_sim',
 'url_gesh',
 'url_leven',
 'url_jaro',
 'url_lcs',
 'url_len_diff',
 'url_nleven',
 'url_nlcsk',
 'url_nlcs',
 'phone_gesh',
 'phone_leven',
 'phone_jaro',
 'phone_lcs',
 'categories_sim',
 'categories_gesh',
 'categories_leven',
 'categories_jaro',
 'categories_lcs',
 'categories_len_di