In [1]:
%config Completer.use_jedi = False
import numpy as np
import pandas as pd
import os
import multiprocessing

# CPU count reported by multiprocessing; dataset_conversion uses it to size
# the `workers` argument of the KD-tree query.
cores_available = multiprocessing.cpu_count()

In [2]:
import lightgbm as lgb
import networkx as nx
import pyproj
from fuzzywuzzy import fuzz
from scipy import spatial
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
def name_fuzz_ratio_comp(df_obj, name_col, indices_col, scorer=None):
    """Score each row's name against the names of its listed neighbour rows.

    Parameters
    ----------
    df_obj : pd.DataFrame
        Frame holding the name column and the neighbour-indices column.
    name_col : str
        Column containing the names to compare.
    indices_col : str
        Column whose cells are list-likes of *positional* row indices into
        ``df_obj`` (they index the NumPy array built from ``name_col``).
    scorer : callable, optional
        ``scorer(name, other_name) -> score``. Defaults to ``fuzz.ratio``.

    Returns
    -------
    pd.Series
        Indexed like ``df_obj``; each cell is the list of scores between the
        row's own name and each neighbour's name, in neighbour order.
    """
    if scorer is None:
        scorer = fuzz.ratio

    name_array = df_obj[name_col].to_numpy()
    # Honour the column arguments (they were previously ignored in favour of
    # the hard-coded 'name' / 'indices') and avoid a slow row-wise apply.
    scores = [
        [scorer(own_name, other_name) for other_name in name_array[neighbour_idx]]
        for own_name, neighbour_idx in zip(name_array, df_obj[indices_col])
    ]
    return pd.Series(scores, index=df_obj.index, dtype=object)

In [4]:
def dataset_conversion(df: pd.DataFrame, transformer: pyproj.transformer.Transformer):
    """Turn a table of places into (row, neighbour) candidate pairs.

    Every row is projected with ``transformer`` and paired with its
    ``min(len(df), 10)`` nearest rows in the projected space. The query points
    are the tree's own points, so a row normally appears among its own
    neighbours at distance 0.

    Side effects: ``df`` is modified in place. Missing ``name`` values become
    ``''`` and the columns ``distances``, ``indices`` and
    ``name_comp_fuzz_score`` (one list per row) are added.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``name``, ``latitude`` and ``longitude``;
        ``point_of_interest`` is optional and only used for labels.
    transformer : pyproj.transformer.Transformer
        Called as ``transform(longitude, latitude, 0, radians=False)``.

    Returns
    -------
    x_train_set : np.ndarray of int, shape (len(df) * k, 4)
        Columns: row position, neighbour position, distance truncated to an
        integer, name fuzz score.
    y_train_set : np.ndarray of int, or None
        1 where both rows of a pair share ``point_of_interest``, else 0;
        ``None`` when ``df`` has no ``point_of_interest`` column.
    """
    # Assign back instead of a chained in-place fillna: the chained form works
    # on a temporary and is a no-op under pandas Copy-on-Write.
    df['name'] = df['name'].fillna('')

    n_rows = len(df)

    # One vectorised projection call instead of one Python call per row.
    x, y, z = transformer.transform(
        df['longitude'].to_numpy(),
        df['latitude'].to_numpy(),
        np.zeros(n_rows),
        radians=False,
    )
    geocentr_cartesian = np.column_stack((x, y, z))

    tree = spatial.KDTree(geocentr_cartesian)
    distances, indices = tree.query(
        geocentr_cartesian,
        min(n_rows, 10),
        workers=max(cores_available - 1, 1),
    )
    # query() squeezes the neighbour axis when k == 1; restore (n_rows, k).
    distances = np.asarray(distances).reshape(n_rows, -1)
    indices = np.asarray(indices).reshape(n_rows, -1)

    # Build the columns on df's own index so the assignment cannot misalign
    # (a fresh RangeIndex frame would yield NaN on any non-default index).
    df['distances'] = pd.Series(distances.tolist(), index=df.index)
    df['indices'] = pd.Series(indices.tolist(), index=df.index)

    df['name_comp_fuzz_score'] = name_fuzz_ratio_comp(df, 'name', 'indices')

    neighbours_per_row = indices.shape[1]
    fuzz_scores = np.asarray(df['name_comp_fuzz_score'].tolist()).reshape(n_rows, -1)

    # Row ids are positions (0..n_rows-1) because they are used below, and by
    # the KD-tree indices, as positional offsets; with the default RangeIndex
    # they equal the index labels. astype(int) truncates the distances.
    x_train_set = np.column_stack((
        np.repeat(np.arange(n_rows), neighbours_per_row),
        indices.ravel(),
        distances.ravel(),
        fuzz_scores.ravel(),
    )).astype(int)

    # Only a missing label column means "no labels"; any other failure should
    # surface instead of being hidden by a bare except.
    if 'point_of_interest' in df.columns:
        poi = df['point_of_interest'].to_numpy()
        y_train_set = (poi[x_train_set[:, 0]] == poi[x_train_set[:, 1]]).astype(int)
    else:
        y_train_set = None

    return x_train_set, y_train_set

In [5]:
# Projection used by dataset_conversion: 'latlong' coordinates in,
# 'geocent' coordinates out, both on the WGS84 ellipsoid/datum.
transformer = pyproj.Transformer.from_crs(
    crs_from=dict(proj='latlong', ellps='WGS84', datum='WGS84'),
    crs_to=dict(proj='geocent', ellps='WGS84', datum='WGS84'),
)

In [6]:
# Load train.csv from the Kaggle competition input directory.
dataframe_train_set = pd.read_csv('/kaggle/input/foursquare-location-matching/train.csv')

In [7]:
# Load test.csv from the Kaggle competition input directory.
dataframe_test_set = pd.read_csv('/kaggle/input/foursquare-location-matching/test.csv')

In [8]:
# Build the candidate-pair array and labels for the training data.
# Note: dataset_conversion also adds columns to dataframe_train_set in place.
x_train_set, y_train = dataset_conversion(dataframe_train_set, transformer)

In [9]:
# Build the candidate-pair array for the test data; the second return value
# (labels) is ignored. dataframe_test_set gains extra columns in place.
x_test_set, _ = dataset_conversion(dataframe_test_set, transformer)

In [10]:
# Columns 0-1 hold the (row, neighbour) ids; everything from column 2 on is
# passed to the model as features.
X_train = x_train_set[:, 2:]
X_test = x_test_set[:, 2:]

In [11]:
# Fit the scaler on the training features only, then apply the same
# transformation to both the training and the test features.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [12]:
# pd.read_csv('/kaggle/input/foursquare-location-matching/pairs.csv')
# pd.read_csv('/kaggle/input/foursquare-location-matching/test.csv')
# pd.read_csv('/kaggle/input/foursquare-location-matching/sample_submission.csv')
# pd.read_csv('/kaggle/input/foursquare-location-matching/train.csv')


In [13]:
# Wrap the scaled training features and pair labels for LightGBM.
d_train = lgb.Dataset(X_train, label=y_train)

In [14]:
# LightGBM training parameters for the binary match / no-match classifier.
params = {
    'learning_rate': 0.003,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'sub_feature': 0.5,
    'num_leaves': 40,
    'min_data': 50,
    'max_depth': 10,
}

In [15]:
# Train the booster for 1000 boosting rounds on the training dataset.
clf = lgb.train(params, d_train, num_boost_round=1000)

[LightGBM] [Info] Number of positive: 1802314, number of negative: 9585806
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 355
[LightGBM] [Info] Number of data points in the train set: 11388120, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.158263 -> initscore=-1.671212
[LightGBM] [Info] Start training from score -1.671212


In [16]:
# y_pred=clf.predict(X_train)
# y_pred[y_pred >= 0.5] = 1
# y_pred[y_pred < 0.5] = 0

In [17]:
# y_pred

array([1., 0., 0., ..., 0., 0., 0.])

In [18]:
# x_train_set[np.concatenate(([y_pred ==1], [x_train_set[:,0]!=x_train_set[:,1]]),axis=0, ).all(axis=0)]

array([[      1, 1032766,      28,     100],
       [     10,  937422,       7,     100],
       [     10,  498845,      11,     100],
       ...,
       [1138809, 1113074,       0,      97],
       [1138809,  789114,       0,      51],
       [1138809,   69015,       0,      43]])

In [19]:
# df_matches = pd.DataFrame(data=x_train_set[np.concatenate(([y_pred ==1], [x_train_set[:,0]!=x_train_set[:,1]]),axis=0, ).all(axis=0)][:,:2], columns=['indx', 'corresp'])

In [20]:
# if df_matches.empty:
#     pass
# import networkx as nx
# G = nx.Graph()
# G.add_edges_from(df_matches.values)

In [21]:
# def set_to_id_string(node_index):
#     try:
#         id_string = ' '.join([dataframe_train_set.loc[value,'id'] for value in nx.node_connected_component(G, node_index)])
#     except KeyError:
#         id_string = dataframe_train_set.loc[node_index,'id']
#     return id_string

In [22]:
# dataframe_train_set.loc[1,:].name

1

In [23]:
# dataframe_train_set['matches'] = dataframe_train_set.apply(lambda row: set_to_id_string(row.name), axis=1)

In [24]:
# dataframe_train_set[['id','matches']]

Unnamed: 0,id,matches
0,E_000001272c6c5d,E_000001272c6c5d
1,E_000002eae2a589,E_000002eae2a589 E_e80db432029aea
2,E_000007f24ebc95,E_000007f24ebc95
3,E_000008a8ba4f48,E_000008a8ba4f48
4,E_00001d92066153,E_00001d92066153
...,...,...
1138807,E_ffffb80854f713,E_ffffb80854f713
1138808,E_ffffbf9a83e0ba,E_ffffbf9a83e0ba E_37cbd58e31092a
1138809,E_ffffc572b4d35b,E_27bcc6f6dd33ed E_32e1fc89082ba5 E_fa389570ea...
1138810,E_ffffca745329ed,E_ffffca745329ed


In [25]:
# dataframe_train_set[['id','matches']].to_csv('submission.csv', index=False)

In [33]:
# Predict match probabilities for the test pairs and binarise them in place
# at a 0.5 threshold. Both masks are taken before any value is overwritten.
y_pred = clf.predict(X_test)
is_match = y_pred >= 0.5
is_non_match = y_pred < 0.5
y_pred[is_non_match] = 0
y_pred[is_match] = 1

In [34]:
# Pairs predicted as matches, excluding pairs of a row with itself.
x_test_set[(y_pred == 1) & (x_test_set[:, 0] != x_test_set[:, 1])]

array([], shape=(0, 4), dtype=int64)

In [35]:
# Same selection as a two-column frame of (row id, matched row id).
pd.DataFrame(
    data=x_test_set[(y_pred == 1) & (x_test_set[:, 0] != x_test_set[:, 1])][:, :2],
    columns=['indx', 'corresp'],
)

Unnamed: 0,indx,corresp


In [39]:
# Edge list of predicted matches: keep pairs predicted as 1 whose two row ids
# differ, and retain only the two id columns.
df_matches = pd.DataFrame(
    data=x_test_set[(y_pred == 1) & (x_test_set[:, 0] != x_test_set[:, 1])][:, :2],
    columns=['indx', 'corresp'],
)

array([[       0,        0,        0,      100],
       [       0,        3,  1004977,       35],
       [       0,        4,  1005364,       38],
       [       0,        2,  7013604,        0],
       [       0,        1, 11766837,       34],
       [       1,        1,        0,      100],
       [       1,        2,  8001658,        0],
       [       1,        0, 11766837,       34],
       [       1,        3, 12068478,       23],
       [       1,        4, 12068751,       28],
       [       2,        2,        0,      100],
       [       2,        0,  7013604,        0],
       [       2,        3,  7813276,        0],
       [       2,        4,  7813427,        0],
       [       2,        1,  8001658,        0],
       [       3,        3,        0,      100],
       [       3,        4,     1008,       50],
       [       3,        0,  1004977,       35],
       [       3,        2,  7813276,        0],
       [       3,        1, 12068478,       23],
       [       4,   

In [40]:
# Group test entries into match clusters and write the submission file.
# Predicted matches are treated as undirected edges; every connected component
# becomes one cluster, and each entry's `matches` value is the space-separated
# ids of its cluster (itself included).
if df_matches.empty:
    # No predicted matches at all: every entry matches only itself.
    dataframe_test_set['matches'] = dataframe_test_set['id']
else:
    G = nx.Graph()
    G.add_edges_from(df_matches.values)

    # Row label -> entry id, built once for plain dict lookups.
    id_by_row = dataframe_test_set['id'].to_dict()

    # Walk the graph once instead of running a BFS per row. Members are sorted
    # so the id order inside a cluster is deterministic.
    matches_by_row = {}
    for component in nx.connected_components(G):
        members = sorted(component)
        joined_ids = ' '.join(id_by_row[member] for member in members)
        matches_by_row.update(dict.fromkeys(members, joined_ids))

    def set_to_id_string(node_index):
        """Return the space-separated ids of the cluster containing a row.

        Rows that appear in no predicted match fall back to their own id.
        """
        return matches_by_row.get(node_index, id_by_row[node_index])

    dataframe_test_set['matches'] = [
        set_to_id_string(row_label) for row_label in dataframe_test_set.index
    ]

dataframe_test_set[['id','matches']].to_csv('submission.csv', index=False)

Unnamed: 0,id,name,latitude,longitude,address,city,state,zip,country,url,phone,categories,distances,indices,name_comp_fuzz_score,matches
0,E_00001118ad0191,Jamu Petani Bagan Serai,5.012169,100.535805,,,,,MY,,,Cafés,"[0.0, 1004977.7798870908, 1005364.00915764, 70...","[0, 3, 4, 2, 1]","[100, 35, 38, 0, 34]",E_00001118ad0191
1,E_000020eb6fed40,Johnny's Bar,40.434209,-80.56416,497 N 12th St,Weirton,WV,26062.0,US,,,Bars,"[0.0, 8001658.48887562, 11766837.27857592, 120...","[1, 2, 0, 3, 4]","[100, 0, 34, 23, 28]",E_000020eb6fed40
2,E_00002f98667edf,QIWI,47.215134,39.686088,"Межевая улица, 60",Ростов-на-Дону,,,RU,https://qiwi.com,78003010000.0,ATMs,"[0.0, 7013604.005830041, 7813276.19697226, 781...","[2, 0, 3, 4, 1]","[100, 0, 0, 0, 0]",E_00002f98667edf
3,E_001b6bad66eb98,"Gelora Sriwijaya, Jaka Baring Sport City",-3.014675,104.794374,,,,,ID,,,Stadiums,"[0.0, 1008.0736645860571, 1004977.7798870908, ...","[3, 4, 0, 2, 1]","[100, 50, 35, 0, 23]",E_001b6bad66eb98
4,E_0283d9f61e569d,Stadion Gelora Sriwijaya,-3.021727,104.788628,Jalan Gubernur Hasan Bastari,Palembang,South Sumatra,11480.0,ID,,,Soccer Stadiums,"[0.0, 1008.0736645860571, 1005364.00915764, 78...","[4, 3, 0, 2, 1]","[100, 50, 38, 0, 28]",E_0283d9f61e569d
