In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the mounted input tree and print the full path of every file found in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/foursquare-location-matching/sample_submission.csv
/kaggle/input/foursquare-location-matching/pairs.csv
/kaggle/input/foursquare-location-matching/train.csv
/kaggle/input/foursquare-location-matching/test.csv
/kaggle/input/fstraindataset/train_data5.csv
/kaggle/input/fstraindataset/train_data7.csv
/kaggle/input/fstraindataset/train_data3.csv
/kaggle/input/fstraindataset/train_data2.csv
/kaggle/input/fstraindataset/train_data1.csv
/kaggle/input/fstraindataset/train_data4.csv
/kaggle/input/fstraindataset/train_data6.csv


In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
import gc

import Levenshtein
import difflib

import sys
from pathlib import Path
import os
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Label column; it is dropped from the debug sample taken from train.csv.
target = 'point_of_interest'
# Number of train_data{i}.csv shards (one model each) and of inference chunks.
num_split = 7
# Neighbours requested per record when generating candidate pairs.
# This name is read by the inference cell but was not assigned anywhere in the
# notebook, so a fresh "Restart & Run All" failed with NameError.
# 10 mirrors recall_knn's default - TODO confirm it matches the value used to
# build the train_data*.csv shards.
n_neighbors = 10
# Columns compared pairwise with string-distance metrics.
feat_columns = ['name', 'address', 'city', 'state', 'zip',
               'url', 'phone', 'categories', 'country']
# Subset of columns that additionally get a TF-IDF similarity feature.
vec_columns = ['name', 'categories', 'country',
              'address', 'state', 'url']
# Paths are resolved relative to the parent of the current working directory,
# so the notebook must be started from a sibling of the 'input' directory.
ROOT = Path.cwd().parent
INPUT = ROOT/'input'
DATA = INPUT/'foursquare-location-matching'
TDATA = INPUT/'fstraindataset'
WORK = ROOT/'working'

In [4]:
def recall_knn(df, Neighbors = 10):
    """Build candidate (id, match_id) pairs from spatial nearest neighbours.

    Two neighbour searches are run on the ``latitude``/``longitude`` columns
    and their results are outer-merged on ``['id', 'match_id']``:

    * per ``country`` group, with ``metric='haversine'`` -> columns
      ``kdist_country`` (distance) and ``kneighbors_country`` (rank);
    * over the whole frame, with the estimator's default metric -> columns
      ``kdist`` and ``kneighbors``.

    A pair found by only one of the two searches has NaN in the other
    search's columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``id``, ``country``, ``latitude`` and ``longitude``.
    Neighbors : int, default 10
        Neighbours requested per record. Both searches clamp it to the number
        of available rows (previously only the per-country search did, so the
        global search raised when ``len(df) < Neighbors``).

    Returns
    -------
    pandas.DataFrame
        Columns ``id, match_id, kdist, kneighbors, kdist_country,
        kneighbors_country``.

    Notes
    -----
    NOTE(review): scikit-learn defines the haversine metric on radians, while
    the coordinates are passed here exactly as stored. If they are degrees,
    ``kdist_country`` is not a true great-circle distance. Left unchanged on
    purpose: models trained on features from this routine would otherwise see
    a shifted feature - confirm before converting.
    """
    coords = ['latitude', 'longitude']

    #Start knn grouped by country
    print('group')
    train_df_country = []
    for country, country_df in tqdm(df.groupby('country')):
        # Positional index so neighbour positions map straight onto row order.
        country_df = country_df.reset_index(drop=True)
        neighbors = min(len(country_df), Neighbors)
        # The regressor is used only for its kneighbors() lookup; the fitted
        # target (the row index) is never predicted.
        knn = KNeighborsRegressor(n_neighbors=neighbors,
                                  metric='haversine',
                                  n_jobs=-1)
        knn.fit(country_df[coords], country_df.index)

        dists, nears = knn.kneighbors(country_df[coords], return_distance=True)
        country_ids = country_df['id'].values
        for k in range(neighbors):
            # .assign builds a fresh frame instead of writing into a column slice.
            train_df_country.append(country_df[['id']].assign(
                match_id=country_ids[nears[:, k]],
                kdist_country=dists[:, k],
                kneighbors_country=k))
    train_df_country = pd.concat(train_df_country)

    #start knn
    print('knn')
    # Same clamp as the per-country search: never ask for more neighbours than rows.
    neighbors = min(len(df), Neighbors)
    train_df = []
    knn = NearestNeighbors(n_neighbors=neighbors)
    knn.fit(df[coords], df.index)
    dists, nears = knn.kneighbors(df[coords])

    all_ids = df['id'].values
    for k in range(neighbors):
        train_df.append(df[['id']].assign(
            match_id=all_ids[nears[:, k]],
            kdist=dists[:, k],
            kneighbors=k))
    train_df = pd.concat(train_df)
    # Outer join keeps pairs proposed by either search.
    train_df = train_df.merge(train_df_country,
                              on=['id', 'match_id'],
                              how='outer')
    del train_df_country
    return train_df

In [5]:
def add_features(df):
    """Append pairwise text-comparison features to a candidate-pair frame.

    For every column in ``feat_columns`` the value of the ``id`` record is
    compared with the value of the ``match_id`` record, adding in this order:

    * ``{col}_sim``   - row-wise sum of the element-wise product of the two
      TF-IDF rows (only for columns in ``vec_columns``);
    * ``{col}_gesh``  - ``difflib.SequenceMatcher(None, a, b).ratio()``;
    * ``{col}_leven`` - ``Levenshtein.distance(a, b)``;
    * ``{col}_jaro``  - ``Levenshtein.jaro_winkler(a, b)``;
    * ``{col}_len_diff`` and ``{col}_nleven`` (edit distance divided by the
      longer string length) for every column except ``phone`` and ``zip``.

    The three string metrics are NaN when either value is the string
    ``'nan'`` after ``astype(str)``, i.e. when it is missing.

    Reads module-level state: ``feat_columns``, ``vec_columns``, ``tfidf_d``
    (column -> TF-IDF matrix), ``id2index_d`` (id -> TF-IDF row) and ``data``
    (looked up by id through ``.loc``).

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate pairs with ``id`` and ``match_id`` columns. Feature columns
        are written onto it until the first temporary-column drop rebinds the
        local name, so always use the returned frame.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the feature columns appended. The insertion order is kept
        unchanged because the inference loop selects model inputs by position.
    """
    ids = df['id']
    match_ids = df['match_id']

    # The TF-IDF row positions depend only on the ids, so resolve them once
    # instead of once per vectorised column.
    indexs = match_indexs = None
    if any(col in vec_columns for col in feat_columns):
        indexs = [id2index_d[i] for i in ids]
        match_indexs = [id2index_d[i] for i in match_ids]

    for col in tqdm(feat_columns):
        if col in vec_columns:
            tv_fit = tfidf_d[col]
            df[f'{col}_sim'] = np.array(tv_fit[indexs].multiply(tv_fit[match_indexs]).sum(axis=1)).ravel()

        # Select the single column inside .loc so the full rows of `data`
        # are not materialised for every pair.
        col_values = data.loc[ids, col].values.astype(str)
        match_values = data.loc[match_ids, col].values.astype(str)
        geshs = []
        levens = []
        jaros = []

        for s, match_s in zip(col_values, match_values):
            if s != 'nan' and match_s != 'nan':
                geshs.append(difflib.SequenceMatcher(None, s, match_s).ratio())
                levens.append(Levenshtein.distance(s, match_s))
                jaros.append(Levenshtein.jaro_winkler(s, match_s))
            else:
                geshs.append(np.nan)
                levens.append(np.nan)
                jaros.append(np.nan)

        df[f'{col}_gesh'] = geshs
        df[f'{col}_leven'] = levens
        df[f'{col}_jaro'] = jaros

        if col not in ['phone', 'zip']:
            # Lengths are taken from the stringified values, so a missing
            # value counts as the 3-character string 'nan'.
            df[f'{col}_len'] = list(map(len, col_values))
            df[f'match_{col}_len'] = list(map(len, match_values))
            df[f'{col}_len_diff'] = np.abs(df[f'{col}_len'] - df[f'match_{col}_len'])
            df[f'{col}_nleven'] = df[f'{col}_leven'] / df[[f'{col}_len', f'match_{col}_len']].max(axis=1)

            # The raw lengths are only intermediates; drop them again.
            df = df.drop(f'{col}_len', axis=1)
            df = df.drop(f'match_{col}_len', axis=1)
            gc.collect()
    return df

In [6]:
def post_process(df):
    """Make the ``matches`` column symmetric in a single pass.

    ``matches`` holds space-separated ids. For every row whose ``matches``
    lists more than one id, the row's own ``id`` is appended to the match
    list of each listed id that does not contain it yet. Existing order is
    kept and additions go to the end.

    Match ids that do not occur in the ``id`` column are left in place and
    skipped (previously they raised ``KeyError``), since there is no row to
    add the reverse link to.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs string columns ``id`` and ``matches``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``matches`` overwritten in place.
    """
    id2match = dict(zip(df['id'].values, df['matches'].str.split()))
    for base, match in df[['id', 'matches']].values:
        match = match.split()
        # A row with a single entry has nothing to propagate.
        if len(match) == 1:
            continue
        for m in match:
            partners = id2match.get(m)
            if partners is None:
                # `m` has no row of its own; nothing to symmetrise.
                continue
            if base not in partners:
                partners.append(base)
    df['matches'] = df['id'].map(id2match).map(' '.join)
    return df

In [10]:
import lightgbm as lgbm

In [11]:
# Fit one LightGBM classifier per pre-built training shard; the fitted models
# are collected in gb_clf and combined later by pred_clf.
gb_clf = []
for i in tqdm(range(1, num_split + 1)):
    train = pd.read_csv(TDATA / f'train_data{i}.csv', index_col='id')
    # Separate the label, then keep only the feature columns, zero-filling gaps.
    y_train = train['label']
    train = train.drop(columns=['match_id', 'label']).fillna(0)
    gb_clf_ = lgbm.LGBMClassifier(
        n_estimators=1000,      # author's note: 10000
        learning_rate=0.12,     # author's note: 0.2
        num_leaves=15,          # author's note: 35
        max_depth=5,            # author's note: 7
        colsample_bytree=0.9,
        reg_alpha=0.1,
        reg_lambda=0.1,
        random_state=42,
        n_jobs=-1,
    )
    gb_clf_.fit(train, y_train)
    gb_clf.append(gb_clf_)
    # Free the shard before the next one is read.
    del train, y_train

100%|██████████| 7/7 [15:00<00:00, 128.65s/it]


In [12]:
def pred_clf(models, df):
    """Average the ``predict`` output of several models, element by element.

    Parameters
    ----------
    models : iterable
        Objects exposing ``predict(df)``.
    df : array-like
        Passed unchanged to every model.

    Returns
    -------
    numpy.ndarray
        Mean over models of the stacked predictions (axis 0).

    Notes
    -----
    Whatever ``predict`` returns is averaged as-is; if the models emit hard
    0/1 labels, the result is the fraction of models voting 1.
    """
    per_model = [estimator.predict(df) for estimator in models]
    stacked = np.array(per_model)
    return np.mean(stacked, axis=0)

In [13]:
# Load the test set. When it has fewer than 20 rows, fall back to the first
# 100 rows of train.csv with the target column removed, so the cells below
# have enough records to run on.
data = pd.read_csv(DATA/'test.csv')
if len(data) < 20:
    data = pd.read_csv(DATA/'train.csv', nrows=100)
    data = data.drop(target, axis=1)
# id -> row label of `data` as it is right now. add_features uses this to pick
# rows of the TF-IDF matrices, so it must be built before set_index('id') below.
id2index_d = dict(zip(data['id'].values, data.index))
# One TF-IDF matrix per text column, fitted on this frame only; missing values
# are replaced by the literal token 'nan' before vectorising.
tfidf_d = {}
for col in vec_columns:
    tfidf = TfidfVectorizer()
    tv_fit = tfidf.fit_transform(data[col].fillna('nan'))
    tfidf_d[col] = tv_fit
# Seed the output with every id matched to itself.
out_df = pd.DataFrame()
out_df['id'] = data['id'].unique().tolist()
out_df['match_id'] = out_df['id']
# n_neighbors is not assigned in this cell; it must already exist in the
# kernel namespace (e.g. from a configuration cell) when this line runs.
test_data = recall_knn(data, n_neighbors)
# Re-index by id only after candidate generation (recall_knn reads the 'id'
# column); add_features then looks records up with data.loc[<id>].
data = data.set_index('id')

group


100%|██████████| 32/32 [00:00<00:00, 97.39it/s] 

knn





In [14]:
# Score the candidate pairs chunk by chunk (num_split chunks of unique ids)
# and keep the pairs whose averaged model output exceeds 0.55.
count = 0
start_row = 0
pred_df = pd.DataFrame()
unique_id = test_data['id'].unique().tolist()
num_split_id = len(unique_id) // num_split
for k in range(1, num_split + 1):
    print('curent split %s' % k)
    end_row = start_row + num_split_id
    # The last chunk is open-ended so it also takes the remainder of the
    # integer division above.
    cur_id = unique_id[start_row:end_row] if k < num_split else unique_id[start_row:]
    cur_data = add_features(test_data[test_data['id'].isin(cur_id)])
    # Derived distance / rank features, then zero-fill every remaining NaN.
    cur_data = cur_data.assign(
        kdist_diff=(cur_data['kdist'] - cur_data['kdist_country']) / cur_data['kdist_country'],
        kneighbors_mean=cur_data[['kneighbors', 'kneighbors_country']].mean(axis=1),
    ).fillna(0)
    # Everything after the first two columns is fed to the models, by position.
    feature = list(cur_data.columns)[2:]
    cur_data['pred'] = pred_clf(gb_clf, cur_data[feature])
    cur_pred_df = cur_data.loc[cur_data['pred'] > 0.55, ['id', 'match_id']]
    pred_df = pd.concat([pred_df, cur_pred_df])
    start_row = end_row
    count += len(cur_data)
    del cur_data, cur_pred_df
    gc.collect()
print(count)

curent split 1


100%|██████████| 9/9 [00:01<00:00,  7.25it/s]


curent split 2


100%|██████████| 9/9 [00:01<00:00,  7.94it/s]


curent split 3


100%|██████████| 9/9 [00:01<00:00,  8.28it/s]


curent split 4


100%|██████████| 9/9 [00:01<00:00,  8.14it/s]


curent split 5


100%|██████████| 9/9 [00:01<00:00,  7.80it/s]


curent split 6


100%|██████████| 9/9 [00:01<00:00,  8.07it/s]


curent split 7


100%|██████████| 9/9 [00:01<00:00,  8.21it/s]


1171


In [15]:
# Combine the self-matches seeded earlier with the predicted pairs and collect
# all match ids per id.
out_df = pd.concat([out_df, pred_df])
out_df = out_df.groupby('id')['match_id'].apply(list).reset_index()
# dict.fromkeys de-duplicates while keeping first-seen order. The previous
# set() gave an order that could change between kernel sessions (string hash
# randomisation), so submission.csv was not reproducible byte for byte.
out_df['matches'] = out_df['match_id'].apply(lambda x: ' '.join(dict.fromkeys(x)))
# Add the reverse link for every predicted pair.
out_df = post_process(out_df)
print(out_df.head())
out_df[['id', 'matches']].to_csv('submission.csv', index=False)

                 id                              match_id           matches
0  E_000001272c6c5d  [E_000001272c6c5d, E_000001272c6c5d]  E_000001272c6c5d
1  E_000002eae2a589  [E_000002eae2a589, E_000002eae2a589]  E_000002eae2a589
2  E_000007f24ebc95  [E_000007f24ebc95, E_000007f24ebc95]  E_000007f24ebc95
3  E_000008a8ba4f48  [E_000008a8ba4f48, E_000008a8ba4f48]  E_000008a8ba4f48
4  E_00001d92066153  [E_00001d92066153, E_00001d92066153]  E_00001d92066153
