In [1]:
## Imports
import warnings
warnings.filterwarnings('ignore')

import os
import gc
import time
import random
import Levenshtein
import difflib
import multiprocessing
import pandas as pd
import numpy as np
import lightgbm as lgb
from unidecode import unidecode

from tqdm import tqdm
from sklearn.model_selection import GroupKFold, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

from sklearn.preprocessing import QuantileTransformer
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

import torch
import torch.nn as nn
import torch.nn.functional as F

import pickle

In [2]:
# Feature columns selected (in this order) by run_inference before the
# quantile transform and the CNN. The order presumably has to match the one
# used when the transformer/model were fitted -- do not reorder.
TRAIN_FEATURES = [
    # nearest-neighbour recall features (global and per-country search)
    'kdist', 'kneighbors', 'kdist_country', 'kneighbors_country',
    # name
    'name_sim', 'name_gesh', 'name_leven', 'name_jaro', 'name_lcs',
    'name_len_diff', 'name_nleven', 'name_nlcsk', 'name_nlcs',
    # address
    'address_sim', 'address_gesh', 'address_leven', 'address_jaro', 'address_lcs',
    'address_len_diff', 'address_nleven', 'address_nlcsk', 'address_nlcs',
    # city (no TF-IDF similarity)
    'city_gesh', 'city_leven', 'city_jaro', 'city_lcs',
    'city_len_diff', 'city_nleven', 'city_nlcsk', 'city_nlcs',
    # state
    'state_sim', 'state_gesh', 'state_leven', 'state_jaro', 'state_lcs',
    'state_len_diff', 'state_nleven', 'state_nlcsk', 'state_nlcs',
    # zip (no TF-IDF similarity, no length-normalised features)
    'zip_gesh', 'zip_leven', 'zip_jaro', 'zip_lcs',
    # url
    'url_sim', 'url_gesh', 'url_leven', 'url_jaro', 'url_lcs',
    'url_len_diff', 'url_nleven', 'url_nlcsk', 'url_nlcs',
    # phone (no TF-IDF similarity, no length-normalised features)
    'phone_gesh', 'phone_leven', 'phone_jaro', 'phone_lcs',
    # categories
    'categories_sim', 'categories_gesh', 'categories_leven', 'categories_jaro', 'categories_lcs',
    'categories_len_diff', 'categories_nleven', 'categories_nlcsk', 'categories_nlcs',
    # country
    'country_sim', 'country_gesh', 'country_leven', 'country_jaro', 'country_lcs',
    'country_len_diff', 'country_nleven', 'country_nlcsk', 'country_nlcs',
]

In [3]:
## Parameters
NUM_NEIGHBOR = 20   # neighbour count handed to recall_knn
SEED = 2022         # value passed to seed_everything
THRESHOLD = 0.77    # a candidate pair is kept when its predicted score exceeds this
NUM_SPLIT = 5       # number of id chunks processed one after another at prediction time

# Text fields for which pairwise string features are computed.
feat_columns = [
    'name', 'address', 'city',
    'state', 'zip', 'url',
    'phone', 'categories', 'country',
]

# Subset of fields that additionally get a TF-IDF similarity feature.
vec_columns = [
    'name', 'categories', 'address',
    'state', 'url', 'country',
]

def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch,
    and exports ``PYTHONHASHSEED``.

    Parameters
    ----------
    seed : int
        Seed value applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Hash randomisation is fixed when the interpreter starts, so this export
    # only influences processes launched afterwards, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The notebook runs a torch model; seed torch as well so that any
    # torch-side randomness is reproducible.
    torch.manual_seed(seed)
    
# Apply the notebook-wide seed defined in the parameters above.
seed_everything(SEED)

In [4]:
%load_ext Cython

In [5]:
%%cython
def LCS(str S, str T):
    """Return the length of the longest common subsequence of ``S`` and ``T``.

    Classic dynamic programme over a (len(S) + 1) x (len(T) + 1) table where
    cell [r + 1][c + 1] holds the answer for the prefixes S[:r + 1], T[:c + 1].
    """
    cdef int row, col
    cdef int n_s = len(S)
    cdef int n_t = len(T)
    cdef list table = [[0] * (n_t + 1) for _ in range(n_s + 1)]
    for row in range(n_s):
        for col in range(n_t):
            # Either extend the diagonal on a character match, or carry over
            # the best value from the left / upper cell.
            table[row + 1][col + 1] = max(
                table[row][col] + (S[row] == T[col]),
                table[row + 1][col],
                table[row][col + 1],
                table[row + 1][col + 1]
            )
    return table[n_s][n_t]

In [6]:
def recall_knn(df, Neighbors = 10):
    """Build candidate (id, match_id) pairs from geographic nearest neighbours.

    Two searches are run on the ``latitude``/``longitude`` columns:

    * one per ``country`` group, using the ``haversine`` metric, producing
      ``kdist_country`` / ``kneighbors_country``;
    * one over the whole frame with the default ``NearestNeighbors`` metric,
      producing ``kdist`` / ``kneighbors``.

    Neighbour rank 0 (presumably the query point itself) is skipped in both.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``id``, ``country``, ``latitude`` and ``longitude``.
    Neighbors : int, default 10
        Requested neighbour count (rank 0 included). It is clamped to the
        number of available rows in both searches.

    Returns
    -------
    pandas.DataFrame
        Outer join of both candidate sets on ``['id', 'match_id']``; the
        columns of a search that did not produce a given pair are NaN.
    """
    print('Start knn grouped by country')
    country_frames = []
    for country, country_df in tqdm(df.groupby('country')):
        country_df = country_df.reset_index(drop = True)

        neighbors = min(len(country_df), Neighbors)
        if neighbors < 2:
            # Only rank 0 would be available, so there is no candidate to emit.
            continue

        # NOTE(review): scikit-learn documents the haversine metric for
        # [lat, lon] given in radians, while these columns look like degrees.
        # Left unchanged on purpose: the downstream model was presumably
        # trained on features computed exactly this way -- confirm before
        # "fixing" it.
        knn = KNeighborsRegressor(n_neighbors = neighbors,
                                    metric = 'haversine',
                                    n_jobs = -1)
        knn.fit(country_df[['latitude','longitude']], country_df.index)
        dists, nears = knn.kneighbors(country_df[['latitude', 'longitude']],
                                        return_distance = True)

        for k in range(1, neighbors):
            # Explicit copy: columns are added to an independent frame, not
            # to a slice of country_df.
            cur_df = country_df[['id']].copy()
            cur_df['match_id'] = country_df['id'].values[nears[:, k]]
            cur_df['kdist_country'] = dists[:, k]
            cur_df['kneighbors_country'] = k

            country_frames.append(cur_df)

    if country_frames:
        train_df_country = pd.concat(country_frames)
    else:
        # pd.concat([]) raises; fall back to an empty frame with the schema.
        train_df_country = pd.DataFrame(columns = ['id', 'match_id',
                                                   'kdist_country',
                                                   'kneighbors_country'])

    print('Start knn')
    global_frames = []
    # Same clamp as the per-country search: asking for more neighbours than
    # there are rows makes kneighbors() raise.
    neighbors = min(len(df), Neighbors)
    if neighbors > 1:
        knn = NearestNeighbors(n_neighbors = neighbors)
        knn.fit(df[['latitude','longitude']], df.index)
        dists, nears = knn.kneighbors(df[['latitude','longitude']])

        for k in range(1, neighbors):
            cur_df = df[['id']].copy()
            # `nears` holds row positions, hence the positional .values lookup.
            cur_df['match_id'] = df['id'].values[nears[:, k]]
            cur_df['kdist'] = dists[:, k]
            cur_df['kneighbors'] = k
            global_frames.append(cur_df)

    if global_frames:
        train_df = pd.concat(global_frames)
    else:
        train_df = pd.DataFrame(columns = ['id', 'match_id',
                                           'kdist', 'kneighbors'])

    train_df = train_df.merge(train_df_country,
                                 on = ['id', 'match_id'],
                                 how = 'outer')
    del train_df_country

    return train_df

In [7]:
def unidecode_w_sort(s):
    """Normalise a text field: transliterate, sort its tokens, lower-case.

    The missing-value marker ``NAN_STR`` is passed through untouched. Any other
    string is transliterated with ``unidecode``, stripped, split on single
    spaces, its tokens sorted (before lower-casing) and re-joined with spaces.
    """
    if s != NAN_STR:
        tokens = unidecode(s).strip().split(' ')
        tokens.sort()
        return ' '.join(tokens).lower()
    return NAN_STR


def add_features(df):
    """Add pairwise text-similarity features to a frame of candidate pairs.

    For every field in ``feat_columns`` the values of ``id`` and ``match_id``
    are looked up in the global ``data`` frame (indexed by id) and compared:

    * ``{col}_sim``   -- row-wise sum of the element-wise product of the two
      TF-IDF rows from ``tfidf_d`` (only for fields in ``vec_columns``);
    * ``{col}_gesh``  -- ``difflib.SequenceMatcher`` ratio;
    * ``{col}_leven`` -- Levenshtein distance;
    * ``{col}_jaro``  -- Jaro-Winkler similarity;
    * ``{col}_lcs``   -- longest-common-subsequence length;
    * ``{col}_len_diff``, ``{col}_nleven``, ``{col}_nlcsk``, ``{col}_nlcs`` --
      length difference and length-normalised variants (skipped for
      ``phone`` and ``zip``).

    The string metrics are NaN when either side equals ``NAN_STR``; at the end
    all +/-inf and NaN values in the frame are replaced by -1.

    Relies on the notebook globals ``feat_columns``, ``vec_columns``,
    ``tfidf_d``, ``id2index_d``, ``data``, ``NAN_STR`` and ``LCS``.

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate pairs with at least the columns ``id`` and ``match_id``.
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the input columns plus the features above.
    """
    # Work on a private copy: callers pass slices of a larger frame, and the
    # column assignments below must not leak into (or warn about) the source.
    df = df.copy()

    ids = df['id'].values
    match_ids = df['match_id'].values

    # Row positions inside the TF-IDF matrices. They are identical for every
    # field, so they are resolved once instead of once per column.
    indexs = [id2index_d[i] for i in ids]
    match_indexs = [id2index_d[i] for i in match_ids]

    for col in tqdm(feat_columns):
        if col in vec_columns:
            tv_fit = tfidf_d[col]
            sim = tv_fit[indexs].multiply(tv_fit[match_indexs]).sum(axis = 1)
            # np.asarray handles both a matrix and a plain ndarray result.
            df[f'{col}_sim'] = np.asarray(sim).ravel()

        # Select the single column directly instead of materialising all
        # columns of the looked-up rows first.
        col_values = data.loc[ids, col].values.astype(str)
        matcol_values = data.loc[match_ids, col].values.astype(str)

        geshs = []
        levens = []
        jaros = []
        lcss = []
        for s, match_s in zip(col_values, matcol_values):
            if s != NAN_STR and match_s != NAN_STR:
                geshs.append(difflib.SequenceMatcher(None, s, match_s).ratio())
                levens.append(Levenshtein.distance(s, match_s))
                jaros.append(Levenshtein.jaro_winkler(s, match_s))
                lcss.append(LCS(str(s), str(match_s)))
            else:
                # Missing on either side: the metrics are undefined.
                geshs.append(np.nan)
                levens.append(np.nan)
                jaros.append(np.nan)
                lcss.append(np.nan)

        df[f'{col}_gesh'] = geshs
        df[f'{col}_leven'] = levens
        df[f'{col}_jaro'] = jaros
        df[f'{col}_lcs'] = lcss

        if col not in ['phone', 'zip']:
            # String lengths are kept as plain arrays, so no temporary
            # columns have to be added and dropped again.
            lens = np.fromiter(map(len, col_values),
                               dtype = np.int64, count = len(col_values))
            match_lens = np.fromiter(map(len, matcol_values),
                                     dtype = np.int64, count = len(matcol_values))

            df[f'{col}_len_diff'] = np.abs(lens - match_lens)
            df[f'{col}_nleven'] = df[f'{col}_leven'] / np.maximum(lens, match_lens)
            df[f'{col}_nlcsk'] = df[f'{col}_lcs'] / match_lens
            df[f'{col}_nlcs'] = df[f'{col}_lcs'] / lens

    # Sentinel -1 for everything that is infinite or undefined.
    df = df.replace([float('-inf'), float('inf')], -1)
    df = df.fillna(-1)
    return df

In [8]:
def post_process(df):
    """Make the ``matches`` column symmetric.

    ``matches`` holds space-separated ids. For every row whose match list has
    more than one entry, the row's own id is appended to the match list of each
    listed id that does not contain it yet. The ``matches`` column of ``df`` is
    then rewritten in place and ``df`` is returned.
    """
    match_lists = {
        place_id: tokens
        for place_id, tokens in zip(df['id'].values, df['matches'].str.split())
    }

    for base, raw_matches in df[['id', 'matches']].values:
        partners = raw_matches.split()
        if len(partners) > 1:
            for partner in partners:
                partner_matches = match_lists[partner]
                if base not in partner_matches:
                    partner_matches.append(base)

    df['matches'] = df['id'].map(match_lists).map(' '.join)
    return df

In [9]:
## Data load
# Marker used for missing values; the feature code treats fields equal to it
# as "not available".
NAN_STR = ''

data_root = '../input/foursquare-location-matching'
data = pd.read_csv(os.path.join(data_root, 'test.csv')).fillna(NAN_STR)

# When test.csv holds fewer than 20 rows (presumably the placeholder file that
# is visible outside of a scoring run), fall back to the first 100 training
# rows so that the rest of the notebook has something to run on.
if len(data) < 20:
    # Built from data_root like the test path, so there is a single place to
    # change the input location.
    data = pd.read_csv(os.path.join(data_root, 'train.csv'),
                      nrows = 100).fillna(NAN_STR)
    data = data.drop('point_of_interest', axis = 1)

# normalize string
for col in ['name', 'address', 'city', 'state', 'country', 'categories']:
    data[col] = data[col].astype(str).apply(unidecode_w_sort)

In [10]:
# Preview the first rows of the loaded, string-normalised frame.
data.head(5)

Unnamed: 0,id,name,latitude,longitude,address,city,state,zip,country,url,phone,categories
0,E_000001272c6c5d,cafe oudenaarde stad,50.859975,3.634196,abdijstraat,nederename,oost-vlaanderen,9700.0,be,,,bars
1,E_000002eae2a589,carioca manero,-22.907225,-43.178244,,,,,br,,,brazilian restaurants
2,E_000007f24ebc95,raantadphmkaaraaekd,13.780813,100.4849,,,,,th,,,/ barbershops salons
3,E_000008a8ba4f48,turkcell,37.84451,27.844202,adnan bulvari menderes,,,,tr,,,mobile phone shops
4,E_00001d92066153,casa cofino restaurante,43.338196,-4.326821,,caviedes,cantabria,,es,,,restaurants spanish


In [11]:
class CnnEncoder(nn.Module):
    """1D-CNN binary scorer over a flat feature vector.

    src: https://github.com/baosenguo/Kaggle-MoA-2nd-Place-Solution/blob/main/training/1d-cnn-train.ipynb

    The input is projected to ``hidden_size`` values, reshaped to
    ``(batch, 64, hidden_size // 64)`` and passed through a small stack of
    1D convolutions; the flattened result is projected to ``num_targets``
    values and finally to a single sigmoid output per sample.

    Parameters
    ----------
    num_features : int
        Width of the input feature vector.
    num_targets : int, default 128
        Width of the embedding fed to the final classifier.
    hidden_size : int, default 512
        Width of the first dense layer; it has to be a multiple of 64 for the
        reshape in ``forward`` to succeed.
    dropout : float, default 0.3
        Base dropout rate; the convolutional stages use scaled-down fractions.

    Notes
    -----
    Attribute names and the ``nn.utils.weight_norm`` wrappers are kept exactly
    as they are: both presumably determine the parameter names stored in the
    checkpoint this notebook loads -- verify before renaming or migrating.
    """
    def __init__(self, num_features, num_targets=128, hidden_size=512, dropout=0.3):
        super().__init__()
        cha_1 = 64
        cha_2 = 128
        cha_3 = 128

        # Sequence length after the reshape, after the adaptive average pool,
        # and the flattened width after the final max pool.
        cha_1_reshape = hidden_size // cha_1
        cha_po_1 = hidden_size // cha_1 // 2
        cha_po_2 = (hidden_size // cha_1 // 2 // 2) * cha_3

        self.cha_1 = cha_1
        self.cha_2 = cha_2
        self.cha_3 = cha_3
        self.cha_1_reshape = cha_1_reshape
        self.cha_po_1 = cha_po_1
        self.cha_po_2 = cha_po_2

        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dropout1 = nn.Dropout(dropout)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_features, hidden_size))

        self.batch_norm_c1 = nn.BatchNorm1d(cha_1)
        self.dropout_c1 = nn.Dropout(dropout*0.9)
        self.conv1 = nn.utils.weight_norm(nn.Conv1d(cha_1,cha_2, kernel_size = 5, stride = 1, padding=2,  bias=False),dim=None)

        self.ave_po_c1 = nn.AdaptiveAvgPool1d(output_size = cha_po_1)

        self.batch_norm_c2 = nn.BatchNorm1d(cha_2)
        self.dropout_c2 = nn.Dropout(dropout*0.8)
        self.conv2 = nn.utils.weight_norm(nn.Conv1d(cha_2,cha_2, kernel_size = 3, stride = 1, padding=1, bias=True),dim=None)

        self.batch_norm_c2_1 = nn.BatchNorm1d(cha_2)
        self.dropout_c2_1 = nn.Dropout(dropout*0.6)
        self.conv2_1 = nn.utils.weight_norm(nn.Conv1d(cha_2,cha_2, kernel_size = 3, stride = 1, padding=1, bias=True),dim=None)

        self.batch_norm_c2_2 = nn.BatchNorm1d(cha_2)
        self.dropout_c2_2 = nn.Dropout(dropout*0.5)
        self.conv2_2 = nn.utils.weight_norm(nn.Conv1d(cha_2,cha_3, kernel_size = 5, stride = 1, padding=2, bias=True),dim=None)

        self.max_po_c2 = nn.MaxPool1d(kernel_size=4, stride=2, padding=1)

        self.flt = nn.Flatten()

        self.batch_norm3 = nn.BatchNorm1d(cha_po_2)
        self.dropout3 = nn.Dropout(dropout)
        self.dense3 = nn.utils.weight_norm(nn.Linear(cha_po_2, num_targets))

        self.classifier = nn.Sequential(
            nn.Linear(num_targets, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Score a batch.

        Parameters
        ----------
        x : torch.Tensor
            Float tensor of shape ``(batch, num_features)``.

        Returns
        -------
        torch.Tensor
            Sigmoid scores of shape ``(batch,)`` -- also for a batch of one.
        """
        # Dense projection of the raw features.
        x = self.batch_norm1(x)
        x = self.dropout1(x)
        x = F.celu(self.dense1(x), alpha=0.06)

        # Treat the hidden vector as cha_1 channels of length cha_1_reshape.
        x = x.reshape(x.shape[0],self.cha_1,
                        self.cha_1_reshape)

        x = self.batch_norm_c1(x)
        x = self.dropout_c1(x)
        x = F.relu(self.conv1(x))

        x = self.ave_po_c1(x)

        x = self.batch_norm_c2(x)
        x = self.dropout_c2(x)
        x = F.relu(self.conv2(x))
        x_s = x  # kept for the multiplicative skip connection below

        x = self.batch_norm_c2_1(x)
        x = self.dropout_c2_1(x)
        x = F.relu(self.conv2_1(x))

        x = self.batch_norm_c2_2(x)
        x = self.dropout_c2_2(x)
        x = F.relu(self.conv2_2(x))
        x =  x * x_s

        x = self.max_po_c2(x)

        x = self.flt(x)

        x = self.batch_norm3(x)
        x = self.dropout3(x)
        x = self.dense3(x)

        x = self.classifier(x)

        # Drop only the trailing size-1 dimension. A bare squeeze() would also
        # remove the batch dimension for a single-row batch and hand a 0-d
        # tensor to callers that expect one score per row.
        return x.squeeze(-1)

In [12]:
# id -> value of data.index for that row; add_features later uses these
# values as row positions into the TF-IDF matrices built below.
id2index_d = {place_id: row for place_id, row in zip(data['id'].values, data.index)}

# One TF-IDF matrix per text field, fitted on the whole frame.
tfidf_d = {}
for col in vec_columns:
    tfidf_d[col] = TfidfVectorizer().fit_transform(data[col].fillna('nan'))

# Start the output with one self-match row per id.
self_ids = data['id'].unique().tolist()
out_df = pd.DataFrame({'id': self_ids, 'match_id': self_ids})

# Candidate generation reads 'id' as a column, so it has to run before the
# frame is re-indexed by id (which the feature code relies on for lookups).
test_data = recall_knn(data, NUM_NEIGHBOR)
data = data.set_index('id')

print('Num of unique id: %s' % test_data['id'].nunique())
print('Num of test data: %s' % len(test_data))
print(test_data.sample(5))

Start knn grouped by country


100%|██████████| 32/32 [00:03<00:00,  9.00it/s]

Start knn
Num of unique id: 100
Num of test data: 2067
                    id          match_id      kdist  kneighbors  \
1160  E_0003ffef645c7f  E_00063a791601cc  66.007715        12.0   
546   E_0002efff139ea9  E_00011cca3f0bd6  12.668632         6.0   
650   E_00035714ebb9bd  E_00045931e0bb56  19.699045         7.0   
1581  E_00052849d90a2e  E_0000c362229d93  26.493093        16.0   
1137  E_00028aea37a912  E_00047231d77294  24.293792        12.0   

      kdist_country  kneighbors_country  
1160            NaN                 NaN  
546        1.269391                10.0  
650             NaN                 NaN  
1581            NaN                 NaN  
1137            NaN                 NaN  





In [13]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# The input width follows the feature list instead of a hard-coded 74, so the
# two cannot drift apart silently.
model = CnnEncoder(num_features=len(TRAIN_FEATURES)).to(device)
# map_location remaps the stored tensors onto the selected device; without it
# a checkpoint saved from a GPU cannot be loaded on a CPU-only kernel.
model.load_state_dict(torch.load('../input/base-cnn-model/best_model.pth',
                                 map_location=device))

<All keys matched successfully>

In [14]:
# Default inference batch size; when the frame has fewer rows than that, use
# a NUM_SPLIT-th of the row count instead.
BATCH_SIZE = 512
BATCH_SIZE = BATCH_SIZE if len(data) >= BATCH_SIZE else len(data) // NUM_SPLIT
BATCH_SIZE

20

In [15]:
def run_inference(model, data, qt=None, bs=BATCH_SIZE):
    """Score candidate pairs with the CNN.

    Parameters
    ----------
    model : torch.nn.Module
        Model returning one score per input row; it is switched to eval mode.
    data : pandas.DataFrame
        Frame containing every column listed in ``TRAIN_FEATURES``.
    qt : object with a ``transform`` method, optional
        Fitted feature transformer. When omitted it is unpickled from
        ``../input/base-qt/qt.pkl``.
    bs : int, default BATCH_SIZE
        Maximum number of rows per forward pass (values below 1 are treated
        as 1).

    Returns
    -------
    list of float
        One prediction per row of ``data``, in row order.
    """
    if qt is None:
        # NOTE: unpickling executes arbitrary code -- only load this file
        # from a source you trust.
        with open('../input/base-qt/qt.pkl', 'rb') as handle:
            qt = pickle.load(handle)

    data = data[TRAIN_FEATURES]
    if len(data) == 0:
        # Nothing to score.
        return []
    data = qt.transform(data)

    # Guard against a zero batch size (BATCH_SIZE can become 0 for tiny
    # inputs), which would otherwise divide by zero / never advance.
    bs = max(int(bs), 1)

    outs = []
    model.eval()
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        # Step through the rows in chunks of `bs`. The range covers the tail
        # as well, so inputs shorter than one batch are still scored, and the
        # `bs` argument (not the global BATCH_SIZE) controls the slicing.
        for start_idx in tqdm(range(0, len(data), bs)):
            batch_data = data[start_idx:start_idx + bs]
            batch_data = torch.tensor(batch_data).float().to(device)

            # predict
            pred = model(batch_data)

            # reshape(-1) keeps a single-row batch a 1-element list even if
            # the model returns a 0-d tensor for it.
            outs += pred.reshape(-1).cpu().numpy().tolist()
    return outs

In [16]:
## Prediction
# Ids are processed in NUM_SPLIT consecutive chunks so that the feature frame
# of only one chunk is alive at a time.
count = 0
start_row = 0
pred_parts = []
unique_id = test_data['id'].unique().tolist()
num_split_id = len(unique_id) // NUM_SPLIT
for k in range(1, NUM_SPLIT + 1):
    print('Current split: %s' % k)
    end_row = start_row + num_split_id
    # Every chunk takes num_split_id ids; the last one also takes the rest.
    stop_row = end_row if k < NUM_SPLIT else None
    cur_id = unique_id[start_row:stop_row]

    # add features & model prediction
    cur_data = add_features(test_data[test_data['id'].isin(cur_id)])
    cur_data['pred'] = run_inference(model, cur_data)

    # keep the pairs scored above the decision threshold
    accepted = cur_data['pred'] > THRESHOLD
    pred_parts.append(cur_data.loc[accepted, ['id', 'match_id']])

    start_row = end_row
    count += len(cur_data)

    del cur_data
    gc.collect()

pred_df = pd.concat(pred_parts)
print(count)

Current split: 1


100%|██████████| 9/9 [00:01<00:00,  7.20it/s]
100%|██████████| 20/20 [00:00<00:00, 159.41it/s]


Current split: 2


100%|██████████| 9/9 [00:01<00:00,  7.58it/s]
100%|██████████| 20/20 [00:00<00:00, 301.48it/s]


Current split: 3


100%|██████████| 9/9 [00:01<00:00,  7.54it/s]
100%|██████████| 20/20 [00:00<00:00, 339.89it/s]


Current split: 4


100%|██████████| 9/9 [00:01<00:00,  7.52it/s]
100%|██████████| 21/21 [00:00<00:00, 302.44it/s]


Current split: 5


100%|██████████| 9/9 [00:01<00:00,  7.43it/s]
100%|██████████| 20/20 [00:00<00:00, 330.94it/s]


2067


In [17]:
## Submission
# Stack the self-match rows (already in out_df) on top of the predicted pairs
# and collect all match ids per id.
out_df = pd.concat([out_df, pred_df])
out_df = out_df.groupby('id')['match_id'].\
                        apply(list).reset_index()
# De-duplicate while keeping first-seen order. Joining a set() would emit the
# ids in hash order, which differs from run to run; dict.fromkeys keeps the
# output reproducible.
out_df['matches'] = out_df['match_id'].apply(lambda x: ' '.join(dict.fromkeys(x)))
# Make the match lists symmetric.
out_df = post_process(out_df)


print('Unique id: %s' % len(out_df))
print(out_df.head())

out_df[['id', 'matches']].to_csv('submission.csv', index = False)

Unique id: 100
                 id            match_id           matches
0  E_000001272c6c5d  [E_000001272c6c5d]  E_000001272c6c5d
1  E_000002eae2a589  [E_000002eae2a589]  E_000002eae2a589
2  E_000007f24ebc95  [E_000007f24ebc95]  E_000007f24ebc95
3  E_000008a8ba4f48  [E_000008a8ba4f48]  E_000008a8ba4f48
4  E_00001d92066153  [E_00001d92066153]  E_00001d92066153
