In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
import gc
import Levenshtein
import difflib

import sys
from pathlib import Path
import os
import warnings
warnings.filterwarnings('ignore')

In [2]:
target = 'point_of_interest'
num_split = 7
feat_columns = ['name', 'address', 'city', 'state', 'zip',
               'url', 'phone', 'categories', 'country']
vec_columns = ['name', 'categories', 'country',
              'address', 'state', 'url']
ROOT = Path.cwd().parent
INPUT = ROOT/'input'
DATA = INPUT/'foursquare-location-matching'
TDATA = INPUT/'fsfulltrain'
WORK = ROOT/'working'

In [3]:
def recall_knn(df, Neighbors = 10):
    #Start knn grouped by country
    print('group')
    train_df_country = []
    for country, country_df in tqdm(df.groupby('country')):
        country_df = country_df.reset_index(drop=True)
        neighbors = min(len(country_df), Neighbors)
        knn = KNeighborsRegressor(n_neighbors=neighbors,
                                  metric='haversine',
                                  n_jobs=-1)
        knn.fit(country_df[['latitude', 'longitude']], country_df.index)
        
        dists, nears = knn.kneighbors(country_df[['latitude', 'longitude']], return_distance=True)
        for k in range(neighbors):
            cur_df = country_df[['id']]
            cur_df['match_id'] = country_df['id'].values[nears[:,k]]
            cur_df['kdist_country'] = dists[:,k]
            cur_df['kneighbors_country'] = k
            train_df_country.append(cur_df)
    train_df_country = pd.concat(train_df_country)
    
    #start knn
    print('knn')
    train_df = []
    knn = NearestNeighbors(n_neighbors=Neighbors)
    knn.fit(df[['latitude', 'longitude']], df.index)
    dists, nears = knn.kneighbors(df[['latitude', 'longitude']])
    
    for k in range(Neighbors):
        cur_df = df[['id']]
        cur_df['match_id'] = df['id'].values[nears[:,k]]
        cur_df['kdist'] = dists[:, k]
        cur_df['kneighbors'] = k
        train_df.append(cur_df)
    train_df = pd.concat(train_df)
    train_df = train_df.merge(train_df_country,
                              on=['id', 'match_id'],
                              how='outer')
    del train_df_country
    return train_df

In [4]:
def add_features(df):
    for col in tqdm(feat_columns):
        if col in vec_columns:
            tv_fit = tfidf_d[col]
            indexs = [id2index_d[i] for i in df['id']]
            match_indexs = [id2index_d[i] for i in df['match_id']]
            df[f'{col}_sim'] = np.array(tv_fit[indexs].multiply(tv_fit[match_indexs]).sum(axis=1)).ravel()
        col_values = data.loc[df['id']][col].values.astype(str)
        match_values = data.loc[df['match_id']][col].values.astype(str)
        geshs = []
        levens = []
        jaros = []
        
        for s, match_s in zip(col_values, match_values):
            if s != 'nan' and match_s != 'nan':
                geshs.append(difflib.SequenceMatcher(None, s, match_s).ratio())
                levens.append(Levenshtein.distance(s, match_s))
                jaros.append(Levenshtein.jaro_winkler(s, match_s))
                
            else:
                geshs.append(np.nan)
                levens.append(np.nan)
                jaros.append(np.nan)
                
        df[f'{col}_gesh'] = geshs
        df[f'{col}_leven'] = levens
        df[f'{col}_jaro'] = jaros
        
        if col not in ['phone', 'zip']:
            df[f'{col}_len'] = list(map(len, col_values))
            df[f'match_{col}_len'] = list(map(len, match_values))
            df[f'{col}_len_diff'] = np.abs(df[f'{col}_len'] - df[f'match_{col}_len'])
            df[f'{col}_nleven'] = df[f'{col}_leven'] / df[[f'{col}_len', f'match_{col}_len']].max(axis=1)
            
            df = df.drop(f'{col}_len', axis=1)
            df = df.drop(f'match_{col}_len', axis=1)
            gc.collect()
    return df

In [7]:
data = pd.read_csv(DATA/'train.csv')

id2index_d = dict(zip(data['id'].values, data.index))
tfidf_d = {}
for col in vec_columns:
    tfidf = TfidfVectorizer()
    tv_fit = tfidf.fit_transform(data[col].fillna('nan'))
    tfidf_d[col] = tv_fit

train_data = recall_knn(data, n_neighbors)
data = data.set_index('id')
ids = train_data['id'].tolist()
match_ids = train_data['match_id'].tolist()
poi = data.loc[ids][target].values
match_poi = data.loc[match_ids][target].values
train_data['label'] = np.array(poi==match_poi, dtype=np.int8)
del poi, match_poi, ids, match_ids
gc.collect


group


100%|██████████| 221/221 [04:24<00:00,  1.20s/it]


knn


<function gc.collect(generation=2)>

In [8]:
count = 0
start_row = 0
#data = data.set_index('id')
unique_id = train_data['id'].unique().tolist()
num_split_id = len(unique_id) // num_split
for k in range(1, num_split+1):
    print('curent split %s' %k)
    end_row = start_row + num_split_id
    if k < num_split:
        cur_id = unique_id[start_row: end_row]
        cur_data = train_data[train_data['id'].isin(cur_id)]
    else:
        cur_id = unique_id[start_row:]
        cur_data = train_data[train_data['id'].isin(cur_id)]
    cur_data = add_features(cur_data)
    cur_data['kdist_diff'] = (cur_data['kdist'] - cur_data['kdist_country']) / cur_data['kdist_country']
    cur_data['kneighbors_mean'] = cur_data[['kneighbors', 'kneighbors_country']].mean(axis=1)
    cur_data.to_csv(WORK/f'train_data{k}.csv', index=False)
    start_row = end_row
    count += len(cur_data)
    del cur_data
    gc.collect()
print(count)

curent split 1


100%|██████████| 9/9 [07:06<00:00, 47.44s/it]


curent split 2


100%|██████████| 9/9 [07:08<00:00, 47.60s/it]


curent split 3


100%|██████████| 9/9 [07:15<00:00, 48.35s/it]


curent split 4


100%|██████████| 9/9 [07:18<00:00, 48.74s/it]


curent split 5


100%|██████████| 9/9 [07:25<00:00, 49.45s/it]


curent split 6


100%|██████████| 9/9 [07:23<00:00, 49.27s/it]


curent split 7


100%|██████████| 9/9 [07:18<00:00, 48.73s/it]


13533862
