In [1]:
%config Completer.use_jedi = False
import numpy as np
import pandas as pd
import os
import multiprocessing

cores_available = multiprocessing.cpu_count()

In [2]:
import pyproj
import lightgbm as lgb

from scipy import spatial

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from fuzzywuzzy import fuzz

In [3]:
def name_fuzz_ratio_comp (df_obj, name_col, indices_col):
    name_array = df_obj[name_col].to_numpy()
    return df_obj[['name','indices']].apply(
        lambda row: [fuzz.ratio(row['name'], index_name) 
                     for index_name 
                     in name_array[row['indices']]],
        axis=1)

In [4]:
def dataset_conversion(df: pd.DataFrame, transformer: pyproj.transformer.Transformer):
    df['name'].fillna(value='',inplace=True)
    
    geocentr_cartesian = df[[ 'latitude', 'longitude']].apply(lambda x: transformer.transform( x.longitude, x.latitude, 0, radians=False), axis=1).to_numpy()
    tree = spatial.KDTree(geocentr_cartesian.tolist())
    loc_data = tree.query(geocentr_cartesian.tolist(),min(len(df),10), workers=max(cores_available-1, 1))    
    distances, indices = loc_data
    
    df [["distances", "indices"]]=pd.DataFrame(
        data = {
         "distances": distances.tolist(),
         "indices": indices.tolist()
        }, 
        columns = ["distances", "indices"]
    )
    
    df['name_comp_fuzz_score'] = name_fuzz_ratio_comp(df, 'name', 'indices')
    
    x_train_set = np.apply_along_axis(
        lambda x: list(
            zip(
                np.repeat(
                    np.arange(x[0], x[0] + 1 ), len(x[1])
                ),
                x[1],
                x[2],
                x[3]
            )
        ),
        1, 
        df[['indices', 'distances', 'name_comp_fuzz_score']].reset_index().to_numpy()
    ).reshape((-1,4)).astype(int)
    
    try:
        y_train_set = (df['point_of_interest'].to_numpy()[x_train_set[:,0]]==df['point_of_interest'].to_numpy()[x_train_set[:,1]]).astype(int)
    except:
        y_train_set = None
    
    return x_train_set, y_train_set

In [5]:
transformer = pyproj.Transformer.from_crs(
        {"proj":'latlong', "ellps":'WGS84', "datum":'WGS84'},
        {"proj":'geocent', "ellps":'WGS84', "datum":'WGS84'},
        )

In [6]:
dataframe_train_set = pd.read_csv('/kaggle/input/foursquare-location-matching/train.csv')

In [7]:
dataframe_test_set = pd.read_csv('/kaggle/input/foursquare-location-matching/test.csv')

In [8]:
x_train_set, y_train = dataset_conversion(dataframe_train_set, transformer)

In [9]:
x_test_set, _ = dataset_conversion(dataframe_test_set, transformer)

In [10]:
X_train, X_test = x_train_set[:,2:], x_test_set[:,2:]

In [11]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [12]:
# pd.read_csv('kaggle/input/foursquare-location-matching/pairs.csv')
# pd.read_csv('kaggle/input/foursquare-location-matching/test.csv')
# pd.read_csv('/kaggle/input/foursquare-location-matching/sample_submission.csv')
# pd.read_csv('/kaggle/input/foursquare-location-matching/train.csv')


In [13]:
d_train = lgb.Dataset(X_train, label=y_train)

In [14]:
params = {}
params['learning_rate'] = 0.003
params['boosting_type'] = 'gbdt'
params['objective'] = 'binary'
params['metric'] = 'binary_logloss'
params['sub_feature'] = 0.5
params['num_leaves'] = 40
params['min_data'] = 50
params['max_depth'] = 10

In [15]:
clf = lgb.train(params, d_train, 1000)

[LightGBM] [Info] Number of positive: 1802314, number of negative: 9585806
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 355
[LightGBM] [Info] Number of data points in the train set: 11388120, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.158263 -> initscore=-1.671212
[LightGBM] [Info] Start training from score -1.671212


In [16]:
y_pred=clf.predict(X_train)
y_pred[y_pred >= 0.5] = 1
y_pred[y_pred < 0.5] = 0

In [17]:
y_pred

array([1., 0., 0., ..., 0., 0., 0.])

In [18]:
x_train_set[np.concatenate(([y_pred ==1], [x_train_set[:,0]!=x_train_set[:,1]]),axis=0, ).all(axis=0)]

array([[      1, 1032766,      28,     100],
       [     10,  937422,       7,     100],
       [     10,  498845,      11,     100],
       ...,
       [1138809, 1113074,       0,      97],
       [1138809,  789114,       0,      51],
       [1138809,   69015,       0,      43]])

In [19]:
df_matches = pd.DataFrame(data=x_train_set[np.concatenate(([y_pred ==1], [x_train_set[:,0]!=x_train_set[:,1]]),axis=0, ).all(axis=0)][:,:2], columns=['indx', 'corresp'])

In [20]:
if df_matches.empty:
    pass
import networkx as nx
G = nx.Graph()
G.add_edges_from(df_matches.values)

In [34]:
try:
    ' '.join(nx.node_connected_component(G, 1))
except KeyError:
    ' '.join([])

TypeError: sequence item 0: expected str instance, int found

In [29]:
for node in nx.node_connected_component(G, 1):
    print(node)

1
1032766


In [24]:
for i in nx.connected_components(G):
    print (i)

{1, 1032766}
{10, 388179, 498845, 937422}
{16, 779092}
{641163, 21}
{33, 647849}
{51809, 36}
{444424, 45}
{930440, 46}
{20916, 53}
{881576, 58}
{111418, 68}
{74, 1040278}
{760601, 82}
{1095123, 84}
{85, 135055}
{981250, 86}
{89, 746324}
{184121, 90}
{1097707, 310638, 460786, 690611, 873685, 1024021, 92}
{341661, 119}
{120, 484231}
{755208, 127}
{1041600, 132}
{145, 128991}
{147, 936387}
{881432, 736585, 148}
{152, 675462}
{50488, 156}
{158, 24759}
{160, 384236}
{172, 806149}
{177, 651801}
{184, 604253}
{185, 501597}
{936008, 674384, 189}
{993938, 191}
{194, 1120567}
{1121546, 196}
{1126851, 436708, 593834, 465069, 648882, 167826, 212}
{659593, 214}
{218, 403623}
{820952, 1103952, 223}
{227, 375463}
{156430, 239}
{48385, 561988, 627590, 1079304, 413417, 715657, 127784, 673161, 258091, 820238, 758647, 1110193, 761815, 247, 1046905, 220314, 1120412, 941373}
{249, 361685}
{1065610, 1138507, 44564, 252, 856445}
{822993, 259}
{589830, 622598, 958470, 483347, 1089562, 122918, 65578, 360492, 5

In [None]:
df_matches.values

In [None]:
y_pred=clf.predict(X_test)
y_pred[y_pred >= 0.5] = 1
y_pred[y_pred < 0.5] = 0

In [None]:
x_test_set[np.concatenate(([y_pred ==1], [x_test_set[:,0]!=x_test_set[:,1]]),axis=0, ).all(axis=0)]

In [None]:
pd.DataFrame(data=x_test_set[np.concatenate(([y_pred ==1], [x_test_set[:,0]!=x_test_set[:,1]]),axis=0, ).all(axis=0)][:,:2], columns=['indx', 'corresp'])

In [None]:
import networkx as nx


In [None]:
G = nx.Graph()

In [None]:
x_test_set[y_pred ==1][:,:2]