In [1]:
## Imports
import warnings
warnings.filterwarnings('ignore')

import os
import gc
import time
import random
import Levenshtein
import difflib
import multiprocessing
import pandas as pd
import numpy as np
import lightgbm as lgb
from tqdm.auto import tqdm
from sklearn.model_selection import GroupKFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from unidecode import unidecode
import multiprocessing
from itertools import repeat



import torch
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

In [2]:
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    

is_debug = False
SEED = 2626
num_neighbors = 20
num_split = 5


data_root = '../input/foursquare-location-matching'
train_df = pd.read_csv(os.path.join(data_root, 'train.csv'))
test_df = pd.read_csv(os.path.join(data_root, 'test.csv'))


NAN_STR = ''

if is_debug:
    train_df = train_df.sample(n = 10000, random_state = SEED)
    train_df = train_df.reset_index(drop = True)
    
seed_everything(SEED)

In [3]:
def recall_knn(df, Neighbors = 10):
    print('Start knn grouped by country')
    train_df_country = []
    for country, country_df in tqdm(df.groupby('country')):
        country_df = country_df.reset_index(drop = True)

        neighbors = min(len(country_df), Neighbors)
        knn = KNeighborsRegressor(n_neighbors = neighbors,
                                    metric = 'haversine',
                                    n_jobs = -1)
        knn.fit(country_df[['latitude','longitude']], country_df.index)
        dists, nears = knn.kneighbors(country_df[['latitude', 'longitude']], 
                                        return_distance = True)
        

        for k in range(1, neighbors):            
            cur_df = country_df[['id']]
            cur_df['match_id'] = country_df['id'].values[nears[:, k]]
            cur_df['dist'] = dists[:, k]
            train_df_country.append(cur_df)
    train_df_country = pd.concat(train_df_country)
    return train_df_country

In [4]:
train_pair = recall_knn(train_df, num_neighbors)
train_pair.sample(5)

Start knn grouped by country


  0%|          | 0/221 [00:00<?, ?it/s]

Unnamed: 0,id,match_id,dist
121451,E_7eb63f5b5a593c,E_deac86e9040cfd,0.014827
9957,E_63bf15c42a9fb8,E_3341f3e6e95427,0.008466
327,E_00cdba783a1e66,E_85762772e83e15,0.000857
77191,E_50ab99ffa6e9c9,E_d1543291b06585,0.007898
22300,E_7c075976ac7450,E_f9b1aa57ef9868,0.000536


In [5]:
test_pair = recall_knn(test_df, num_neighbors)
test_pair.sample(min(len(test_pair), 5))

Start knn grouped by country


  0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0,id,match_id,dist
1,E_0283d9f61e569d,E_001b6bad66eb98,0.00907
0,E_001b6bad66eb98,E_0283d9f61e569d,0.00907


In [6]:
%load_ext Cython

In [7]:
%%cython
def LCS(str S, str T):
    cdef int i, j
    cdef list dp = [[0] * (len(T) + 1) for _ in range(len(S) + 1)]
    for i in range(len(S)):
        for j in range(len(T)):
            dp[i + 1][j + 1] = max(dp[i][j] + (S[i] == T[j]), dp[i + 1][j], dp[i][j + 1], dp[i + 1][j + 1])
    return dp[len(S)][len(T)]

In [8]:
STR_COLS = ['name', 'address', 'url', 'zip', 'phone']
CAT_COLS = ['state', 'country', 'city', 'categories']

# CAT_IDX_MAP = {
#     cat: {} for cat in CAT_COLS
# }

def unidecode_w_sort(s):
    if s == NAN_STR:
        return NAN_STR
    s = unidecode(s)
    s = ' '.join(sorted(s.strip().split(' '))).lower()
    return s


def process_name_address(data):
    data['name'] = data['name'].apply(unidecode_w_sort)
    data['address'] = data['address'].apply(unidecode_w_sort)
    return data


def create_cat_idx_map(data):
    cat_idx_map = {}
    for cat in CAT_COLS:
        values = data[cat].astype(str).unique()
        unique_cat = set()
        for c in values:
            unique_cat |= set([_c.strip() for _c in c.split(',')])
        cat_idx_map[cat] = {
            v:i for i,v in enumerate(unique_cat)
        }
    return cat_idx_map



def convert_cat_features(data, cat_idx_map):
    """
    df: pair_df that has "id, match_id" columns
    data: data_df
    """
    
    # get cat_cols_idx_map
    for col in CAT_COLS:
        if col == 'country':
            data[col] = data[col].astype(str).map(cat_idx_map[col])
        else :
            output = []
            out_num = []
            for val in data[col].astype(str):
                ss = str(val).split(',')
                output.append([cat_idx_map[col][s.strip()] for s in ss])
                out_num.append(len(ss))
            data[col] = output
            data[f'{col}_num'] = out_num
#         else:
            
    return data
        

def add_cat_features(df, data):
    for col in CAT_COLS + [f'{col}_num' for col in CAT_COLS if col != 'country']:
        col_values = data.loc[df['id']][col].values
        matcol_values = data.loc[df['match_id']][col].values
        
        df[f'{col}1'] = col_values
        df[f'{col}2'] = matcol_values
    return df


def add_str_features(df, data):
    """
    df: pair_df that has "id, match_id" columns
    data: data_df
    """
    for col in STR_COLS:
        col_values = data.loc[df['id']][col].values.astype(str)
        matcol_values = data.loc[df['match_id']][col].values.astype(str)

        geshs = []
        levens = []
        jaros = []
        lcss = []
        for s, match_s in zip(col_values, matcol_values):
            if s != NAN_STR and match_s != NAN_STR:                    
                geshs.append(difflib.SequenceMatcher(None, s, match_s).ratio())
                levens.append(Levenshtein.distance(s, match_s))
                jaros.append(Levenshtein.jaro_winkler(s, match_s))
                lcss.append(LCS(str(s), str(match_s)))
            else:
                geshs.append(-1)
                levens.append(-1)
                jaros.append(-1)
                lcss.append(-1)

        df[f'{col}_gesh'] = geshs
        df[f'{col}_leven'] = levens
        df[f'{col}_jaro'] = jaros
        df[f'{col}_lcs'] = lcss

        if col not in ['phone', 'zip']:
            df[f'{col}_len'] = list(map(len, col_values))
            df[f'match_{col}_len'] = list(map(len, matcol_values)) 
            df[f'{col}_len_diff'] = np.abs(df[f'{col}_len'] - df[f'match_{col}_len'])
            df[f'{col}_nleven'] = df[f'{col}_leven'] / \
                                    df[[f'{col}_len', f'match_{col}_len']].max(axis = 1)

            df[f'{col}_nlcsk'] = df[f'{col}_lcs'] / df[f'match_{col}_len']
            df[f'{col}_nlcs'] = df[f'{col}_lcs'] / df[f'{col}_len']

            df = df.drop(f'{col}_len', axis = 1)
            df = df.drop(f'match_{col}_len', axis = 1)
            gc.collect()

    df = df.replace(float('-inf'), -1)
    return df


def add_label(pair_df, data_df):
    ids = pair_df['id'].tolist()
    match_ids = pair_df['match_id'].tolist()

    poi = data_df.loc[ids]['point_of_interest'].values
    match_poi = data_df.loc[match_ids]['point_of_interest'].values

    pair_df['label'] = np.array(poi == match_poi, dtype = np.int8)
    return pair_df


def preprocess(main_pair_df, data_df, cat_idx_map, training=True):
    main_pair_df = main_pair_df.reset_index(drop=True)
    data_df = data_df.reset_index(drop=True)
    
    
    data_df = process_name_address(data_df)
    data_df = convert_cat_features(data_df, cat_idx_map)
    data_df = data_df.set_index('id')
    main_pair_df = add_cat_features(main_pair_df, data_df)
    
    if training:
        main_pair_df = add_label(main_pair_df, data_df)
    
    
    num_split = 10
    split_data_len = len(main_pair_df) // num_split
    for i in tqdm(range(num_split)):
        start_idx = i * split_data_len
        if i == num_split - 1:
            end_idx = len(main_pair_df)
        else:
            end_idx = (i+1) * split_data_len
        
        pair_df = main_pair_df.iloc[start_idx: end_idx]
        pair_df = add_str_features(pair_df, data_df)
        
        pair_df.to_csv(f'train_split{i}.csv', index=None)


In [9]:
train_pair = train_pair.copy()
train_df = train_df.copy()
train_df = train_df.fillna(NAN_STR)

In [10]:
cat_idx_map = create_cat_idx_map(train_df)
preprocess(train_pair, train_df, cat_idx_map)

# train_pair2



  0%|          | 0/10 [00:00<?, ?it/s]

In [11]:
# # 'E_41daed7a544612'
# df = pd.read_csv('./train_split0.csv')
# df

In [12]:
# train_pair2.label.describe()

In [13]:
# FEAT_START_COLS = ['dist', 'state', 'city', 'country', 'categories', 'name', 'address', 'url', 'zip', 'phone']

# feats_df_cols_map = {}
# for feat in FEAT_START_COLS:
#     feats_df_cols_map[feat] = [col for col in train_pair2.columns if col.startswith(feat)]
# feats_df_cols_map

In [14]:
# def get_weight_sampler(labels):
#     labels = train_pair2.label
#     class_count = labels.value_counts()
#     class_weight = [len(labels) / (len(class_count) * c)  for c in class_count]
#     samples_weight = np.array([class_weight[int(l)] for l in labels])
#     samples_weight = torch.from_numpy(samples_weight)
#     samples_weigth = samples_weight.double()
#     sampler = WeightedRandomSampler(samples_weight, len(samples_weight))
#     return sampler

In [15]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# def process_offset(off):
#     off.pop()
#     off.insert(0, 0)
#     return np.cumsum(off)

# def batch_collate(data):
#     name = []
#     address = []
#     url = []
#     _zip = []
#     phone = []
#     dist = []
#     categories = {
#         'categories_ids1': [],
#         'categories_ids2': [],
#         'categories_offset1': [],
#         'categories_offset2': [],
#     }
#     place = {
#         'country_ids1':[], 'country_ids2':[],
#         'state_ids1':[], 'state_ids2':[], 'state_offset1':[], 'state_offset2':[],       
#         'city_ids1':[], 'city_ids2':[], 'city_offset1':[], 'city_offset2':[]
#     }
    
#     label = []

#     for d in data:
#         name.append(d['name'])
#         address.append(d['address'])
#         url.append(d['url'])
#         _zip.append(d['zip'])
#         phone.append(d['phone'])
#         dist.append(d['dist'])
        
#         categories['categories_ids1'].extend(d['categories']['categories1'])
#         categories['categories_ids2'].extend(d['categories']['categories2'])
#         categories['categories_offset1'].append(d['categories']['categories_num1'])
#         categories['categories_offset2'].append(d['categories']['categories_num2'])
        
#         place['country_ids1'].append(d['country']['country1'])
#         place['country_ids2'].append(d['country']['country2'])
#         place['state_ids1'].extend(d['state']['state1'])
#         place['state_ids2'].extend(d['state']['state2'])
#         place['state_offset1'].append(d['state']['state_num1'])
#         place['state_offset2'].append(d['state']['state_num2'])
#         place['city_ids1'].extend(d['city']['city1'])
#         place['city_ids2'].extend(d['city']['city2'])
#         place['city_offset1'].append(d['city']['city_num1'])
#         place['city_offset2'].append(d['city']['city_num2'])
        
#         label.append(d.get('label', -1))
    
#     categories['categories_offset1'] = process_offset(categories['categories_offset1'])
#     categories['categories_offset2'] = process_offset(categories['categories_offset2'])
#     place['state_offset1'] = process_offset(place['state_offset1'])
#     place['state_offset2'] = process_offset(place['state_offset2'])
#     place['city_offset1'] = process_offset(place['city_offset1'])
#     place['city_offset2'] = process_offset(place['city_offset2'])
            
#     x = {
#         'name': torch.tensor(name).float().to(device),
#         'address': torch.tensor(address).float().to(device),
#         'url': torch.tensor(url).float().to(device),
#         'zip': torch.tensor(_zip).float().to(device),
#         'phone': torch.tensor(phone).float().to(device),
#         'dist': torch.tensor(dist).float().to(device),
#         'categories': {k: torch.tensor(v).long().to(device) for k, v in categories.items()},
#         'place': {k: torch.tensor(v).long().to(device) for k, v in place.items()}
#     }
#     y = torch.tensor(label).float().to(device)
#     return x, y


# class MyDataset(Dataset):
#     def __init__(self, df, feats_df_cols_map, mode='train'):
#         super().__init__()
#         self.df = df.reset_index(drop=True)
#         self.feats_df_cols_map = feats_df_cols_map
#         self.mode = mode
        
#     def __getitem__(self, idx):
#         row = self.df.loc[idx]
#         x = {
#             k: row[v]
#             for k, v in self.feats_df_cols_map.items()
#         }
#         y = None
#         if self.mode == 'train':
#             x['label'] = row['label']
#         return x


# sampler = get_weight_sampler(train_pair2.label)
# train_dataset = MyDataset(train_pair2, feats_df_cols_map, mode='train')    
# train_loader = DataLoader(train_dataset, batch_size=2, sampler=sampler, collate_fn=batch_collate)

In [16]:
# class BaseMLP(nn.Module):
#     def __init__(self, input_dim=32, output_dim=32, dropout=0.2):
#         super().__init__()
#         self.mlp = nn.Sequential(
#             nn.BatchNorm1d(input_dim),
#             nn.utils.weight_norm(nn.Linear(input_dim, input_dim*2)),
#             nn.LeakyReLU(),
#             nn.Dropout(dropout),
            
#             nn.BatchNorm1d(input_dim*2),
#             nn.utils.weight_norm(nn.Linear(input_dim*2, output_dim*2)),
#             nn.LeakyReLU(),
#             nn.Dropout(dropout),
            
#             nn.BatchNorm1d(output_dim*2),
#             nn.utils.weight_norm(nn.Linear(output_dim*2, output_dim)),
#             nn.LeakyReLU(),
#         )
    
#     def forward(self, x):
#         return self.mlp(x)
        


# class CategoriesModule(nn.Module):
#     def __init__(self, num_categories=911, emb_dim=32, out_dim=32):
#         super().__init__()
#         self.embedding_bag = nn.EmbeddingBag(num_categories, emb_dim, mode='mean')
#         self.mlp = BaseMLP(emb_dim*2, out_dim)
        
#     def forward(self, categories_ids1, categories_offset1, categories_ids2, categories_offset2):
#         emb1 = self.embedding_bag(categories_ids1, categories_offset1)
#         emb2 = self.embedding_bag(categories_ids2, categories_offset2)
        
#         emb = torch.cat([emb1, emb2], dim=-1)
#         emb = self.mlp(emb)
#         return emb
    

# class PlaceModule(nn.Module):
#     def __init__(self, emb_dim=32, out_dim=32, num_country=221, num_state=17596, num_city=68105):
#         super().__init__()
#         self.country_embedding = nn.Embedding(num_country, emb_dim)
#         self.state_embedding = nn.EmbeddingBag(num_state, emb_dim, mode='mean')
#         self.city_embedding = nn.EmbeddingBag(num_city, emb_dim, mode='mean')
#         self.mlp = BaseMLP(emb_dim*6, out_dim)
        
    
#     def forward(self, country_ids1, country_ids2,
#                 state_ids1, state_ids2, state_offset1, state_offset2, 
#                 city_ids1, city_ids2, city_offset1, city_offset2):
        
#         country_embs1 = self.country_embedding(country_ids1)
#         country_embs2 = self.country_embedding(country_ids2)
#         state_embs1 = self.state_embedding(state_ids1, state_offset1)
#         state_embs2 = self.state_embedding(state_ids2, state_offset2)
#         city_embs1 = self.city_embedding(city_ids1, city_offset1)
#         city_embs2 = self.city_embedding(city_ids2, city_offset2)

#         embs = torch.cat([country_embs1, country_embs2, state_embs1, state_embs2, city_embs1, city_embs2], dim=-1)
#         embs = self.mlp(embs)
#         return embs
        
    
    
# class MainModel(nn.Module):
#     def __init__(self, feats_df_cols_map, cat_idx_map, module_emb_dim=64):
#         super().__init__()
#         self.name_module = BaseMLP(input_dim=len(feats_df_cols_map['name']), output_dim=module_emb_dim)
#         self.address_module = BaseMLP(input_dim=len(feats_df_cols_map['address']), output_dim=module_emb_dim)
#         self.url_module = BaseMLP(input_dim=len(feats_df_cols_map['url']), output_dim=module_emb_dim)
#         self.zip_module = BaseMLP(input_dim=len(feats_df_cols_map['zip']), output_dim=module_emb_dim)
#         self.phone_module = BaseMLP(input_dim=len(feats_df_cols_map['phone']), output_dim=module_emb_dim)
#         self.distance_module = BaseMLP(input_dim=1, output_dim=module_emb_dim)
        
#         self.categories_module = CategoriesModule(
#             emb_dim=module_emb_dim, out_dim=module_emb_dim,
#             num_categories=len(cat_idx_map['categories'])
#         )
        
#         self.place_module = PlaceModule(
#             emb_dim=module_emb_dim, out_dim=module_emb_dim,
#             num_country=len(cat_idx_map['country']), num_state=len(cat_idx_map['state']), num_city=len(cat_idx_map['city'])
#         )
        
#         self.mlp = BaseMLP(input_dim=module_emb_dim*8, output_dim=module_emb_dim)
#         self.classifier = nn.Sequential(
#             nn.utils.weight_norm(nn.Linear(module_emb_dim, 1)),
#             nn.Sigmoid()
#         )

    
#     def forward(self, x):
#         name_emb = self.name_module(x['name'])
#         address_emb = self.address_module(x['address'])
#         url_emb = self.url_module(x['url'])
#         zip_emb = self.zip_module(x['zip'])
#         phone_emb = self.phone_module(x['phone'])
#         distance_emb = self.distance_module(x['dist'])
#         categories_emb = self.categories_module(**x['categories'])
#         place_emb = self.place_module(**x['place'])
        
#         embs = torch.cat([
#             name_emb,
#             address_emb,
#             url_emb,
#             zip_emb,
#             phone_emb,
#             distance_emb,
#             categories_emb,
#             place_emb,
#         ], dim=-1)
        
#         embs = self.mlp(embs)
#         preds = self.classifier(embs)
#         return preds.squeeze()

# model = MainModel(feats_df_cols_map, cat_idx_map, module_emb_dim=64).to(device)

In [17]:
# for x, y in train_loader:
#     out = model(x)
#     print(out)
#     break