In [1]:
## Imports
import warnings
warnings.filterwarnings('ignore')

import os
import gc
import time
import random
import Levenshtein
import difflib
import multiprocessing
import pandas as pd
import numpy as np
import lightgbm as lgb
from tqdm import tqdm
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

import pickle

In [2]:
# Columns selected (in this order) from the feature frame for the `models`
# ensemble. Entries are grouped by the source column they are derived from.
TRAIN_FEATURES = [
    # nearest-neighbour recall features
    'kdist', 'kneighbors', 'kdist_country', 'kneighbors_country',
    # name
    'name_sim', 'name_gesh', 'name_leven', 'name_jaro', 'name_lcs',
    'name_len_diff', 'name_nleven', 'name_nlcsk', 'name_nlcs',
    # address
    'address_sim', 'address_gesh', 'address_leven', 'address_jaro', 'address_lcs',
    'address_len_diff', 'address_nleven', 'address_nlcsk', 'address_nlcs',
    # city (no *_sim feature)
    'city_gesh', 'city_leven', 'city_jaro', 'city_lcs',
    'city_len_diff', 'city_nleven', 'city_nlcsk', 'city_nlcs',
    # state
    'state_sim', 'state_gesh', 'state_leven', 'state_jaro', 'state_lcs',
    'state_len_diff', 'state_nleven', 'state_nlcsk', 'state_nlcs',
    # zip (no *_sim and no length-normalised features)
    'zip_gesh', 'zip_leven', 'zip_jaro', 'zip_lcs',
    # url
    'url_sim', 'url_gesh', 'url_leven', 'url_jaro', 'url_lcs',
    'url_len_diff', 'url_nleven', 'url_nlcsk', 'url_nlcs',
    # phone (no *_sim and no length-normalised features)
    'phone_gesh', 'phone_leven', 'phone_jaro', 'phone_lcs',
    # categories
    'categories_sim', 'categories_gesh', 'categories_leven', 'categories_jaro', 'categories_lcs',
    'categories_len_diff', 'categories_nleven', 'categories_nlcsk', 'categories_nlcs',
    # country
    'country_sim', 'country_gesh', 'country_leven', 'country_jaro', 'country_lcs',
    'country_len_diff', 'country_nleven', 'country_nlcsk', 'country_nlcs',
]

# Columns selected for the `new_models` ensemble: everything above plus two
# features that combine the global and the per-country neighbour searches.
TRAIN_FEATURES2 = TRAIN_FEATURES + ['kdist_diff', 'kneighbors_mean']

In [3]:
## Parameters
NUM_FOLD = 3          # not referenced in the visible cells
NUM_NEIGHBOR = 21     # neighbours requested per point in recall_knn
SEED = 2626
THRESHOLD = 0.5       # pairs whose averaged prediction exceeds this are kept
NUM_SPLIT = 6         # number of id batches used in the prediction loop

# Text columns that add_features compares pair by pair.
feat_columns = [
    'name', 'address', 'city',
    'state', 'zip', 'url',
    'phone', 'categories', 'country',
]
# Subset of the columns above that also gets a TF-IDF `<col>_sim` feature.
vec_columns = [
    'name', 'categories', 'address',
    'state', 'url', 'country',
]


def seed_everything(seed):
    """Seed Python's and NumPy's global RNGs and export PYTHONHASHSEED.

    NOTE: the environment variable is only exported here; the hash seed of
    the already-running interpreter is not changed by this assignment.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)


seed_everything(SEED)

In [4]:
%load_ext Cython

In [5]:
%%cython
def LCS(str S, str T):
    """Return the length of the longest common subsequence of ``S`` and ``T``.

    Fills a (len(S) + 1) x (len(T) + 1) table whose first row and first
    column stay zero; the answer is the bottom-right cell.
    """
    cdef int row, col
    cdef int n_rows = len(S)
    cdef int n_cols = len(T)
    cdef list table = [[0] * (n_cols + 1) for _ in range(n_rows + 1)]
    for row in range(n_rows):
        for col in range(n_cols):
            # Either extend the diagonal (adding 1 on a character match) or
            # inherit the better result of skipping one character.
            table[row + 1][col + 1] = max(table[row][col] + (S[row] == T[col]),
                                          table[row + 1][col],
                                          table[row][col + 1])
    return table[n_rows][n_cols]

In [6]:
def post_process(df):
    """Make the match lists symmetric.

    Whenever row ``a`` lists ``b`` among several matches, ``a`` is appended to
    the match list of ``b`` unless it is already there. Rows whose match list
    has exactly one entry do not propagate anything.

    ``df`` needs an 'id' column and a space-separated 'matches' column; its
    'matches' column is overwritten in place and the same frame is returned.
    """
    matches_by_id = dict(zip(df['id'].values, df['matches'].str.split()))

    # Iterate over the original strings, not over the lists being extended.
    for source_id, raw_matches in zip(df['id'].values, df['matches'].values):
        candidates = raw_matches.split()
        if len(candidates) != 1:
            for other_id in candidates:
                partner_matches = matches_by_id[other_id]
                if source_id not in partner_matches:
                    partner_matches.append(source_id)

    df['matches'] = df['id'].map(matches_by_id).map(' '.join)
    return df

In [7]:
def recall_knn(df, Neighbors = 10):
    """Generate candidate (id, match_id) pairs from coordinate neighbours.

    Two searches are run on the 'latitude' / 'longitude' columns and their
    results are outer-merged on (id, match_id):

    * per 'country' group, with the haversine metric
      -> 'kdist_country', 'kneighbors_country'
    * over the whole frame, with scikit-learn's default metric
      -> 'kdist', 'kneighbors'

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'id', 'country', 'latitude' and 'longitude'.
    Neighbors : int
        Neighbours requested per point. Both searches clamp it to the number
        of rows actually available.

    Returns
    -------
    pandas.DataFrame
        Columns 'id', 'match_id', 'kdist', 'kneighbors', 'kdist_country',
        'kneighbors_country'. A pair found by only one of the two searches
        has NaN in the other search's columns.
    """
    coords = ['latitude', 'longitude']

    print('Start knn grouped by country')
    train_df_country = []
    for country, country_df in tqdm(df.groupby('country')):
        country_df = country_df.reset_index(drop = True)

        # A group smaller than `Neighbors` can only supply len(group) neighbours.
        neighbors = min(len(country_df), Neighbors)
        # NOTE(review): scikit-learn's haversine metric expects [lat, lon] in
        # radians; the columns are passed as stored (presumably degrees).
        # Left unchanged because the pickled models consume this feature as
        # computed here -- confirm against the training notebook.
        knn = KNeighborsRegressor(n_neighbors = neighbors,
                                    metric = 'haversine',
                                    n_jobs = -1)
        knn.fit(country_df[coords], country_df.index)
        dists, nears = knn.kneighbors(country_df[coords],
                                        return_distance = True)

        country_ids = country_df['id'].values
        for k in range(neighbors):
            # assign() returns a new frame, so nothing is written onto a slice
            # of `country_df`.
            train_df_country.append(country_df[['id']].assign(
                match_id = country_ids[nears[:, k]],
                kdist_country = dists[:, k],
                kneighbors_country = k))
    train_df_country = pd.concat(train_df_country)

    print('Start knn')
    train_df = []
    # Clamp like the per-country search does: kneighbors() raises when asked
    # for more neighbours than there are fitted samples.
    neighbors = min(len(df), Neighbors)
    knn = NearestNeighbors(n_neighbors = neighbors)
    knn.fit(df[coords], df.index)
    dists, nears = knn.kneighbors(df[coords])

    all_ids = df['id'].values
    for k in range(neighbors):
        # `nears` holds row positions, hence the lookup through `.values`.
        train_df.append(df[['id']].assign(
            match_id = all_ids[nears[:, k]],
            kdist = dists[:, k],
            kneighbors = k))

    train_df = pd.concat(train_df)
    train_df = train_df.merge(train_df_country,
                                 on = ['id', 'match_id'],
                                 how = 'outer')
    del train_df_country

    return train_df

In [8]:
def unidecode_w_sort(s):
    """Normalise a string: ``unidecode`` it, sort its space-separated tokens, lower-case.

    The missing-value marker ``NAN_STR`` is returned unchanged.
    """
    # NOTE(review): `unidecode` is not imported in the visible cells
    # (presumably the `unidecode` package); calling this function as-is would
    # raise NameError. It is not called in the visible cells either.
    if s != NAN_STR:
        tokens = unidecode(s).strip().split(' ')
        tokens.sort()
        # Sorting happens before lower-casing, as in the original expression.
        return ' '.join(tokens).lower()
    return NAN_STR

def add_features(df):
    """Attach pairwise text-similarity features to candidate pairs.

    For every column ``col`` in ``feat_columns`` the value of ``df['id']`` is
    compared with the value of ``df['match_id']`` (both looked up in the
    global ``data`` frame, which must be indexed by id):

    * ``{col}_sim``  - dot product of the two rows of ``tfidf_d[col]``
      (only for columns listed in ``vec_columns``)
    * ``{col}_gesh`` - ``difflib.SequenceMatcher`` ratio
    * ``{col}_leven`` / ``{col}_jaro`` - Levenshtein distance / Jaro-Winkler
    * ``{col}_lcs``  - result of ``LCS``
    * ``{col}_len_diff``, ``{col}_nleven``, ``{col}_nlcsk``, ``{col}_nlcs`` -
      length difference and length-normalised variants (skipped for
      'phone' and 'zip')

    The four string metrics are NaN when either value equals ``NAN_STR``.

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate pairs with 'id' and 'match_id' columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the feature columns appended; the frame passed
        in is left untouched.
    """
    # Callers pass a filtered slice of a larger frame; work on a private copy
    # so no column is written onto that slice.
    df = df.copy()

    ids = df['id'].values
    match_ids = df['match_id'].values
    # The TF-IDF row positions are the same for every column: look them up
    # once instead of once per vectorised column.
    rows = [id2index_d[i] for i in ids]
    match_rows = [id2index_d[i] for i in match_ids]

    for col in tqdm(feat_columns):
        if col in vec_columns:
            tv_fit = tfidf_d[col]
            df[f'{col}_sim'] = np.array(
                tv_fit[rows].multiply(tv_fit[match_rows]).sum(axis = 1)).ravel()

        # Select the single column directly rather than materialising every
        # column of the looked-up rows first.
        col_values = data.loc[ids, col].values.astype(str)
        matcol_values = data.loc[match_ids, col].values.astype(str)

        geshs = []
        levens = []
        jaros = []
        lcss = []
        for s, match_s in zip(col_values, matcol_values):
            if s != NAN_STR and match_s != NAN_STR:
                geshs.append(difflib.SequenceMatcher(None, s, match_s).ratio())
                levens.append(Levenshtein.distance(s, match_s))
                jaros.append(Levenshtein.jaro_winkler(s, match_s))
                lcss.append(LCS(str(s), str(match_s)))
            else:
                geshs.append(np.nan)
                levens.append(np.nan)
                jaros.append(np.nan)
                lcss.append(np.nan)

        df[f'{col}_gesh'] = geshs
        df[f'{col}_leven'] = levens
        df[f'{col}_jaro'] = jaros
        df[f'{col}_lcs'] = lcss

        if col not in ['phone', 'zip']:
            # Keep the string lengths in local Series: adding them to `df` and
            # dropping them again copies the whole frame twice per column.
            own_len = pd.Series(list(map(len, col_values)), index = df.index)
            match_len = pd.Series(list(map(len, matcol_values)), index = df.index)

            df[f'{col}_len_diff'] = np.abs(own_len - match_len)
            # A zero length only occurs for NAN_STR values, whose numerators
            # are NaN, so these divisions yield NaN rather than inf.
            df[f'{col}_nleven'] = df[f'{col}_leven'] / np.maximum(own_len, match_len)
            df[f'{col}_nlcsk'] = df[f'{col}_lcs'] / match_len
            df[f'{col}_nlcs'] = df[f'{col}_lcs'] / own_len

    return df

In [9]:
## Data process
# Marker used for missing text values throughout the notebook (empty string).
NAN_STR = ''

data = pd.read_csv('../input/foursquare-location-matching/test.csv').fillna(NAN_STR)

# When the test file has fewer than 20 rows, fall back to the first 100
# training rows with the label column dropped -- presumably so the notebook can
# be exercised end-to-end on a placeholder test set (confirm).
if len(data) < 20:
    data = pd.read_csv('../input/foursquare-location-matching/train.csv',
                      nrows = 100).fillna(NAN_STR)
    data = data.drop('point_of_interest', axis = 1)
    
# id -> index label of `data` at this point (built before set_index below);
# add_features uses it to pick rows of the TF-IDF matrices.
id2index_d = dict(zip(data['id'].values, data.index))

# One TF-IDF matrix per vectorised column, fitted on the data being scored.
tfidf_d = {}
for col in vec_columns:
    tfidf = TfidfVectorizer()
    # NOTE(review): `data` was already filled with NAN_STR above, so this
    # fillna('nan') appears to have nothing left to fill.
    tv_fit = tfidf.fit_transform(data[col].fillna('nan'))
    tfidf_d[col] = tv_fit

# Seed the output with every id matching itself.
out_df = pd.DataFrame()
out_df['id'] = data['id'].unique().tolist()
out_df['match_id'] = out_df['id']

# recall_knn reads the 'id' column, so it has to run before `data` is
# re-indexed by id (add_features later looks rows up with data.loc[...]).
test_data = recall_knn(data, NUM_NEIGHBOR)
data = data.set_index('id')

print('Num of unique id: %s' % test_data['id'].nunique())
print('Num of test data: %s' % len(test_data))
print(test_data.sample(5))

Start knn grouped by country


100%|██████████| 32/32 [00:03<00:00,  8.80it/s]

Start knn
Num of unique id: 100
Num of test data: 2260
                    id          match_id      kdist  kneighbors  \
540   E_000292dce833cc  E_00014f107dc217   5.024913         5.0   
135   E_000288f2046ce3  E_000260dd24cecf   0.750322         1.0   
1378  E_0004ea391c9404  E_00053bda88d7d8  10.136039        13.0   
1736  E_00028abacfbca0  E_00015cd7e0227f  12.982930        17.0   
1225  E_0001a397f67ad5  E_00045931e0bb56  10.238297        12.0   

      kdist_country  kneighbors_country  
540        1.448241                 5.0  
135        0.581366                 1.0  
1378       1.711920                17.0  
1736       1.484000                 7.0  
1225            NaN                 NaN  





In [10]:
# Pickle files of the first ensemble, with one weight per file.
paths = (
    [f'../input/lgbmversion2/lgb{i}.pkl' for i in range(3)]
    + [f'../input/my-lgb-model/lgb{i}.pkl' for i in range(5)]
    + [f'../input/lgbgetall1all008/lgb{i}.pkl' for i in range(5)]
    + [f'../input/lgbmnestimators1600/lgb{i}.pkl' for i in range(5)]
)
weights = [0.5] * 3 + [1] * 5 + [0.7] * 5 + [0.7] * 5


def _load_pickles(pickle_paths):
    """Unpickle every file in ``pickle_paths`` and return the objects in order."""
    loaded = []
    for pickle_path in pickle_paths:
        # NOTE: pickle.load can execute arbitrary code contained in the file;
        # only point this at trusted model files.
        with open(pickle_path, 'rb') as f:
            loaded.append(pickle.load(f))
    return loaded


## Model load
models = _load_pickles(paths)
print(len(models))


# Second ensemble. Its weights are appended to the same list, so `weights`
# is ordered: weights of `models` first, weights of `new_models` last.
new_paths = [f'../input/lgb-new-train/lgb{i}.pkl' for i in range(5)]
weights += [2] * 5

new_models = _load_pickles(new_paths)
print(len(new_models))
print(weights)

18
5
[0.5, 0.5, 0.5, 1, 1, 1, 1, 1, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 2, 2, 2, 2, 2]


In [11]:
## Prediction
# Score the candidate pairs in NUM_SPLIT batches of ids and keep the pairs
# whose weighted-average prediction exceeds THRESHOLD.
count = 0
start_row = 0
pred_parts = []   # kept pairs of every batch, concatenated once after the loop
unique_id = test_data['id'].unique().tolist()
num_split_id = len(unique_id) // NUM_SPLIT
for k in range(1, NUM_SPLIT + 1):
    print('Current split: %s' % k)
    end_row = start_row + num_split_id
    # The last batch also takes the ids left over by the integer division.
    if k < NUM_SPLIT:
        cur_id = unique_id[start_row : end_row]
    else:
        cur_id = unique_id[start_row : ]
    cur_data = test_data[test_data['id'].isin(cur_id)]

    # add features & model prediction
    cur_data = add_features(cur_data)
    cur_data['kdist_diff'] = (cur_data['kdist'] - cur_data['kdist_country']) /\
                                cur_data['kdist_country']
    cur_data['kneighbors_mean'] = cur_data[['kneighbors', 'kneighbors_country']].mean(axis = 1)

    # `new_models` are scored on the features as computed, i.e. before the
    # NaN / inf replacement below.
    new_preds = [model.predict_proba(cur_data[TRAIN_FEATURES2])[:, [1]]
                 for model in new_models]

    # `models` are scored with NaN and +/-inf replaced by -1.
    cur_data = cur_data.replace(float('-inf'), -1)
    cur_data = cur_data.replace(float('inf'), -1)
    cur_data = cur_data.fillna(-1)
    old_preds = [model.predict_proba(cur_data[TRAIN_FEATURES])[:, [1]]
                 for model in models]

    # `weights` holds the weights of `models` first and those of `new_models`
    # last, so the prediction columns are stacked in that same order.
    # Stacking `new_models` first would pair every weight with the wrong model.
    stacked = np.concatenate(old_preds + new_preds, axis = 1)
    preds = np.average(stacked, axis = 1, weights = weights).reshape(-1)
    cur_data['pred'] = preds

    cur_pred_df = cur_data[cur_data['pred'] > THRESHOLD][['id', 'match_id']]
    pred_parts.append(cur_pred_df)

    start_row = end_row
    count += len(cur_data)

    del cur_data, cur_pred_df
    gc.collect()

# One concat at the end instead of growing the frame inside the loop.
pred_df = pd.concat(pred_parts) if pred_parts else pd.DataFrame()
print(count)

Current split: 1


100%|██████████| 9/9 [00:01<00:00,  7.66it/s]


Current split: 2


100%|██████████| 9/9 [00:01<00:00,  7.72it/s]


Current split: 3


100%|██████████| 9/9 [00:01<00:00,  7.74it/s]


Current split: 4


100%|██████████| 9/9 [00:01<00:00,  7.98it/s]


Current split: 5


100%|██████████| 9/9 [00:01<00:00,  7.64it/s]


Current split: 6


100%|██████████| 9/9 [00:01<00:00,  7.54it/s]


2260


In [12]:
## Submission
# `out_df` holds every id paired with itself; add the predicted pairs and
# collect all match ids per id.
out_df = pd.concat([out_df, pred_df])
out_df = out_df.groupby('id')['match_id'].\
                        apply(list).reset_index()
# De-duplicate while keeping first-seen order. Joining a set() would order the
# ids by string hash, which varies between interpreter runs (PYTHONHASHSEED
# set inside a running kernel does not change it), so the file would not be
# byte-reproducible.
out_df['matches'] = out_df['match_id'].apply(lambda x: ' '.join(dict.fromkeys(x)))
# Make the matches symmetric before writing them out.
out_df = post_process(out_df)
print('Unique id: %s' % len(out_df))
print(out_df.head())

out_df[['id', 'matches']].to_csv('submission.csv', index = False)

Unique id: 100
                 id                              match_id           matches
0  E_000001272c6c5d  [E_000001272c6c5d, E_000001272c6c5d]  E_000001272c6c5d
1  E_000002eae2a589  [E_000002eae2a589, E_000002eae2a589]  E_000002eae2a589
2  E_000007f24ebc95  [E_000007f24ebc95, E_000007f24ebc95]  E_000007f24ebc95
3  E_000008a8ba4f48  [E_000008a8ba4f48, E_000008a8ba4f48]  E_000008a8ba4f48
4  E_00001d92066153  [E_00001d92066153, E_00001d92066153]  E_00001d92066153
