In [1]:
import json
import pandas as pd
import numpy as np

In [2]:
from scipy.spatial.distance import cdist

In [3]:
with open('offices.json', encoding='utf-8') as f:
    offices = json.load(f)

In [4]:
kremlin_lat = 55.752
kremlin_lon = 37.618
degree_to_meter = 111319
moscow_rad = 70000 / degree_to_meter
office_latitudes = []
office_longitudes = []
max_people = []
opening_times = [[] for _ in range(7)]
closing_times = [[] for _ in range(7)]
business_morning = [[] for _ in range(7)]
business_day = [[] for _ in range(7)]
business_evening = [[] for _ in range(7)]

In [5]:
ord_to_day = {0: 'mn', 1: 'ts', 2: 'wd', 3: 'th', 4: 'fr', 5: 'st', 6: 'sn'}
day_to_ord = {'пн': 0, 'вт': 1, 'ср': 2, 'чт': 3, 'пт': 4, 'сб': 5, 'вс': 6}
ans = []
for obj in offices:
    if  'Не' in obj['openHours'][0]['days']:
        continue

    if len(obj['openHours']) == 7:
        ans.append((obj['openHours'] + [float(obj['latitude']), float(obj['longitude'])]).copy())
        continue

    new_dict_list = list()
    for l in obj['openHours']:
        if l["days"] == 'перерыв':
            for i in range(len(new_dict_list)):
                new_dict_list[i]['hours'] = new_dict_list[i]['hours'][:5] + '-' + l['hours'][:5] + ', ' + l['hours'][6:] + '-' + new_dict_list[i]['hours'][6:]
        elif len(l["days"]) > 2:
            s_front = l["days"][:2]
            s_back = l["days"][3:]

            for i in range(day_to_ord[s_front], day_to_ord[s_back] + 1):
                if i + 1 <= len(new_dict_list):
                    continue
                new_dict_list.append({'days': ord_to_day[i], "hours": l["hours"]})
        else:
            new_dict_list.append({"days": l["days"], "hours": l["hours"]})
    ans.append((new_dict_list + [float(obj['latitude']), float(obj['longitude'])]).copy())

In [6]:
ans

[[{'days': 'пн', 'hours': '09:00-18:00'},
  {'days': 'вт', 'hours': '09:00-18:00'},
  {'days': 'ср', 'hours': '09:00-18:00'},
  {'days': 'чт', 'hours': '09:00-18:00'},
  {'days': 'пт', 'hours': '09:00-17:00'},
  {'days': 'сб', 'hours': 'выходной'},
  {'days': 'вс', 'hours': 'выходной'},
  56.184479,
  36.984314],
 [{'days': 'mn', 'hours': '09:00-18:00'},
  {'days': 'ts', 'hours': '09:00-18:00'},
  {'days': 'wd', 'hours': '09:00-18:00'},
  {'days': 'th', 'hours': '09:00-18:00'},
  {'days': 'пт', 'hours': '09:00-17:00'},
  {'days': 'st', 'hours': 'выходной'},
  {'days': 'sn', 'hours': 'выходной'},
  56.183239,
  36.9757],
 [{'days': 'пн', 'hours': '10:00-19:00'},
  {'days': 'вт', 'hours': '10:00-19:00'},
  {'days': 'ср', 'hours': '10:00-19:00'},
  {'days': 'чт', 'hours': '10:00-19:00'},
  {'days': 'пт', 'hours': '10:00-18:00'},
  {'days': 'сб', 'hours': 'выходной'},
  {'days': 'вс', 'hours': 'выходной'},
  56.012386,
  37.482059],
 [{'days': 'пн', 'hours': '09:00-18:00'},
  {'days': 'вт'

In [7]:
for elem in ans:
    office_latitudes.append(elem[7])
    office_longitudes.append(elem[8])
    max_people.append(np.random.randint(1000, 3000))
    for i in range(7):
        opening_times[i].append(elem[i]['hours'][:5] if elem[i]['hours'] != 'выходной' else None)
        closing_times[i].append(elem[i]['hours'][-5:] if elem[i]['hours'] != 'выходной' else None)
        business_morning[i].append(round(np.random.uniform(0.01, 0.5), 2))
        business_day[i].append(round(np.random.uniform(0.01, 1), 2))
        business_evening[i].append(round(np.random.uniform(0.01, 0.5), 2))

In [8]:
len(office_latitudes)

160

In [9]:
len(office_longitudes)

160

In [10]:
len(max_people)

160

In [11]:
for i in range(7):
    print(len(opening_times[i]))
    print(len(closing_times[i]))
    print(len(business_morning[i]))
    print(len(business_day[i]))
    print(len(business_evening[i]))

160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160


In [12]:
db = pd.DataFrame()
db['Latitude'] = pd.Series(office_latitudes)

In [13]:
db

Unnamed: 0,Latitude
0,56.184479
1,56.183239
2,56.012386
3,56.010849
4,56.008335
...,...
155,55.478329
156,55.438740
157,55.432919
158,55.427883


In [14]:
db['Longitude'] = pd.Series(office_longitudes)
db['Max_People'] = pd.Series(max_people)
for i in range(7):
    db[f'opening_{ord_to_day[i]}'] = pd.Series(opening_times[i]).str[:2].fillna(0).astype(int)
    db[f'closing_{ord_to_day[i]}'] = pd.Series(closing_times[i]).str[:2].fillna(0).astype(int)
    db[f'morning_{ord_to_day[i]}'] = pd.Series(business_morning[i])
    db[f'day_{ord_to_day[i]}'] = pd.Series(business_day[i])
    db[f'evening_{ord_to_day[i]}'] = pd.Series(business_evening[i])

In [15]:
db

Unnamed: 0,Latitude,Longitude,Max_People,opening_mn,closing_mn,morning_mn,day_mn,evening_mn,opening_ts,closing_ts,...,opening_st,closing_st,morning_st,day_st,evening_st,opening_sn,closing_sn,morning_sn,day_sn,evening_sn
0,56.184479,36.984314,1119,9,18,0.50,0.11,0.15,9,18,...,0,0,0.29,0.54,0.38,0,0,0.37,0.63,0.04
1,56.183239,36.975700,1570,9,18,0.27,0.78,0.04,9,18,...,0,0,0.35,0.42,0.37,0,0,0.42,0.40,0.36
2,56.012386,37.482059,1361,10,19,0.42,0.72,0.47,10,19,...,0,0,0.17,0.52,0.16,0,0,0.06,0.19,0.34
3,56.010849,37.854359,1730,9,18,0.11,0.65,0.38,9,18,...,0,0,0.09,0.04,0.04,0,0,0.34,0.03,0.14
4,56.008335,37.851467,2500,9,18,0.26,0.23,0.45,9,18,...,0,0,0.24,0.92,0.20,0,0,0.49,0.13,0.24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,55.478329,37.298706,1416,9,18,0.38,0.67,0.32,9,18,...,0,0,0.48,0.17,0.21,0,0,0.25,0.48,0.07
156,55.438740,37.767536,2273,9,18,0.21,0.80,0.18,9,18,...,0,0,0.34,0.53,0.17,0,0,0.12,0.64,0.16
157,55.432919,37.550838,2847,9,18,0.32,0.86,0.08,9,18,...,0,0,0.33,0.21,0.44,0,0,0.48,0.41,0.12
158,55.427883,37.550110,1081,9,18,0.07,0.76,0.48,9,18,...,0,0,0.21,0.05,0.43,0,0,0.45,0.10,0.16


In [16]:
examples_quantity = 8192
examples = pd.DataFrame()
usr_lat = []
usr_lon = []
days = [np.random.randint(0, 5) for i in range(examples_quantity)]
time = [np.random.uniform(9, 19) for i in range(examples_quantity)]
for i in range(examples_quantity):
    rho = np.random.uniform(0, moscow_rad)
    phi = np.random.uniform(0, 2 * np.pi)
    usr_lat.append(kremlin_lat + rho * np.sin(phi))
    usr_lon.append(kremlin_lon + rho * np.cos(phi))

In [17]:
examples['Latitude'] = usr_lat
examples['Longitude'] = usr_lon
examples['day'] = days
examples['time'] = time

In [18]:
examples

Unnamed: 0,Latitude,Longitude,day,time
0,55.278084,37.545769,4,18.360393
1,55.964956,37.437843,4,12.927843
2,55.545711,37.637862,0,11.504942
3,56.218027,37.999053,1,13.414385
4,56.309958,37.411617,1,9.926999
...,...,...,...,...
8187,55.196264,37.878805,3,9.964708
8188,56.178225,37.528902,1,12.723628
8189,56.012880,38.069750,0,9.305767
8190,55.799446,37.621591,2,18.161057


In [19]:
db

Unnamed: 0,Latitude,Longitude,Max_People,opening_mn,closing_mn,morning_mn,day_mn,evening_mn,opening_ts,closing_ts,...,opening_st,closing_st,morning_st,day_st,evening_st,opening_sn,closing_sn,morning_sn,day_sn,evening_sn
0,56.184479,36.984314,1119,9,18,0.50,0.11,0.15,9,18,...,0,0,0.29,0.54,0.38,0,0,0.37,0.63,0.04
1,56.183239,36.975700,1570,9,18,0.27,0.78,0.04,9,18,...,0,0,0.35,0.42,0.37,0,0,0.42,0.40,0.36
2,56.012386,37.482059,1361,10,19,0.42,0.72,0.47,10,19,...,0,0,0.17,0.52,0.16,0,0,0.06,0.19,0.34
3,56.010849,37.854359,1730,9,18,0.11,0.65,0.38,9,18,...,0,0,0.09,0.04,0.04,0,0,0.34,0.03,0.14
4,56.008335,37.851467,2500,9,18,0.26,0.23,0.45,9,18,...,0,0,0.24,0.92,0.20,0,0,0.49,0.13,0.24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,55.478329,37.298706,1416,9,18,0.38,0.67,0.32,9,18,...,0,0,0.48,0.17,0.21,0,0,0.25,0.48,0.07
156,55.438740,37.767536,2273,9,18,0.21,0.80,0.18,9,18,...,0,0,0.34,0.53,0.17,0,0,0.12,0.64,0.16
157,55.432919,37.550838,2847,9,18,0.32,0.86,0.08,9,18,...,0,0,0.33,0.21,0.44,0,0,0.48,0.41,0.12
158,55.427883,37.550110,1081,9,18,0.07,0.76,0.48,9,18,...,0,0,0.21,0.05,0.43,0,0,0.45,0.10,0.16


In [20]:
points1 = db.loc[:, ['Latitude', 'Longitude']].values

In [21]:
points2 = examples.loc[:, ['Latitude', 'Longitude']].values

In [22]:
distances = cdist(points1, points2)
distances_df = pd.DataFrame(distances, index=db.index, columns=['dist' + str(idx) for idx in examples.index])
distances_df

Unnamed: 0,dist0,dist1,dist2,dist3,dist4,dist5,dist6,dist7,dist8,dist9,...,dist8182,dist8183,dist8184,dist8185,dist8186,dist8187,dist8188,dist8189,dist8190,dist8191
0,1.066200,0.503864,0.913865,1.015293,0.445345,0.758696,0.764851,0.450607,0.976429,0.183994,...,0.657463,0.850623,0.592302,1.230349,0.845541,1.332922,0.544624,1.098917,0.744562,1.088069
1,1.069712,0.511100,0.919184,1.023944,0.453961,0.765128,0.771256,0.457494,0.979957,0.190513,...,0.662863,0.854556,0.594393,1.233966,0.853115,1.337803,0.553224,1.107234,0.751314,1.090418
2,0.737060,0.064843,0.491995,0.556391,0.305796,0.286663,0.293783,0.118834,0.657343,0.352313,...,0.283504,0.526574,0.469977,0.880277,0.320090,0.907448,0.172328,0.587692,0.254583,0.827019
3,0.795093,0.419037,0.513054,0.252704,0.534310,0.352636,0.355248,0.481788,0.745244,0.721278,...,0.484899,0.650734,0.752318,0.887846,0.159072,0.814951,0.365974,0.215401,0.314439,0.930526
4,0.791655,0.415893,0.509556,0.256422,0.533334,0.348821,0.351439,0.478654,0.741658,0.718650,...,0.481086,0.646998,0.748556,0.884728,0.155422,0.812531,0.364570,0.218331,0.310608,0.926955
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,0.318022,0.506127,0.345784,1.018646,0.839259,0.417596,0.415639,0.488054,0.235856,0.624197,...,0.298332,0.171190,0.182728,0.480038,0.617108,0.645039,0.736780,0.938219,0.455380,0.315149
156,0.273845,0.620968,0.168102,0.812951,0.941116,0.354094,0.346973,0.652673,0.280324,0.900445,...,0.424007,0.301530,0.597215,0.315198,0.436482,0.266787,0.777036,0.648823,0.389113,0.428346
157,0.154918,0.543904,0.142461,0.904042,0.888020,0.328657,0.322903,0.555896,0.092840,0.769277,...,0.310703,0.096913,0.401673,0.300948,0.494171,0.404435,0.745629,0.778219,0.373294,0.277744
158,0.149862,0.548681,0.146915,0.908778,0.892881,0.333741,0.327984,0.560449,0.088256,0.773135,...,0.315308,0.099167,0.403593,0.296415,0.499006,0.402104,0.750642,0.782463,0.378376,0.272960


In [23]:
examples['day'].values

array([4, 4, 0, ..., 0, 2, 1], dtype=int64)

In [24]:
diff_matrix = [[0] * 8192 for _ in range(160)]
days = examples['day'].values
coming = examples['time'].values
for i in range(160):
    for j in range(8192):
        closing_hours = db[f'closing_{ord_to_day[days[j]]}'].values
        diff_matrix[i][j] = 1 / max(closing_hours[i] - coming[j], 0.0001)

In [25]:
diff_df = pd.DataFrame(diff_matrix, index=db.index, columns=['diff' + str(idx) for idx in examples.index])

In [26]:
diff_df

Unnamed: 0,diff0,diff1,diff2,diff3,diff4,diff5,diff6,diff7,diff8,diff9,...,diff8182,diff8183,diff8184,diff8185,diff8186,diff8187,diff8188,diff8189,diff8190,diff8191
0,10000.0,0.245570,0.153963,0.218073,0.123870,0.464057,0.163732,1.814041,10000.00000,2.840532,...,0.117560,10000.000000,0.308298,0.391531,10000.000000,0.124451,0.189524,0.115019,10000.000000,3.409908
1,10000.0,0.245570,0.153963,0.218073,0.123870,0.464057,0.163732,1.814041,10000.00000,2.840532,...,0.117560,10000.000000,0.308298,0.391531,10000.000000,0.124451,0.189524,0.115019,10000.000000,3.409908
2,10000.0,0.197155,0.133421,0.179031,0.110217,0.316966,0.140696,0.644639,3.48606,0.739619,...,0.105194,29.645424,0.235648,0.281367,1.417606,0.110677,0.159328,0.103154,1.191976,0.773238
3,10000.0,0.245570,0.153963,0.218073,0.123870,0.464057,0.163732,1.814041,10000.00000,2.840532,...,0.117560,10000.000000,0.308298,0.391531,10000.000000,0.124451,0.189524,0.115019,10000.000000,3.409908
4,10000.0,0.245570,0.153963,0.218073,0.123870,0.464057,0.163732,1.814041,10000.00000,2.840532,...,0.117560,10000.000000,0.308298,0.391531,10000.000000,0.124451,0.189524,0.115019,10000.000000,3.409908
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,10000.0,0.245570,0.153963,0.218073,0.123870,0.464057,0.163732,1.814041,10000.00000,2.840532,...,0.117560,10000.000000,0.308298,0.391531,10000.000000,0.124451,0.189524,0.115019,10000.000000,3.409908
156,10000.0,0.245570,0.153963,0.218073,0.123870,0.464057,0.163732,1.814041,10000.00000,2.840532,...,0.117560,10000.000000,0.308298,0.391531,10000.000000,0.124451,0.189524,0.115019,10000.000000,3.409908
157,10000.0,0.245570,0.153963,0.218073,0.123870,0.464057,0.163732,1.814041,10000.00000,2.840532,...,0.117560,10000.000000,0.308298,0.391531,10000.000000,0.124451,0.189524,0.115019,10000.000000,3.409908
158,10000.0,0.245570,0.153963,0.218073,0.123870,0.464057,0.163732,1.814041,10000.00000,2.840532,...,0.117560,10000.000000,0.308298,0.391531,10000.000000,0.124451,0.189524,0.115019,10000.000000,3.409908


In [27]:
busy_matrix = [[0] * 8192 for _ in range(160)]
days = examples['day'].values
time = examples['time'].values
for i in range(160):
    for j in range(8192):
        if time[j] < 12:
            business = db[f'morning_{ord_to_day[days[j]]}'].values
        elif time[j] > 15:
            business = db[f'evening_{ord_to_day[days[j]]}'].values
        else:
            business = db[f'day_{ord_to_day[days[j]]}'].values
        busy_matrix[i][j] = business[i]


In [28]:
busy_df = pd.DataFrame(busy_matrix, index=db.index, columns=['busy' + str(idx) for idx in examples.index])

In [29]:
busy_df

Unnamed: 0,busy0,busy1,busy2,busy3,busy4,busy5,busy6,busy7,busy8,busy9,...,busy8182,busy8183,busy8184,busy8185,busy8186,busy8187,busy8188,busy8189,busy8190,busy8191
0,0.42,0.96,0.50,0.33,0.04,0.02,0.50,0.15,0.42,0.21,...,0.18,0.15,0.33,0.15,0.21,0.18,0.33,0.50,0.48,0.02
1,0.05,0.43,0.27,0.58,0.46,0.26,0.27,0.04,0.05,0.30,...,0.43,0.04,0.58,0.04,0.30,0.43,0.58,0.27,0.10,0.26
2,0.38,0.11,0.42,0.32,0.07,0.05,0.42,0.47,0.38,0.36,...,0.49,0.47,0.32,0.47,0.36,0.49,0.32,0.42,0.28,0.05
3,0.43,0.69,0.11,0.03,0.06,0.48,0.11,0.38,0.43,0.08,...,0.24,0.38,0.03,0.38,0.08,0.24,0.03,0.11,0.49,0.48
4,0.04,0.84,0.26,0.23,0.03,0.11,0.26,0.45,0.04,0.17,...,0.26,0.45,0.23,0.45,0.17,0.26,0.23,0.26,0.28,0.11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,0.25,0.32,0.38,0.33,0.09,0.10,0.38,0.32,0.25,0.27,...,0.36,0.32,0.33,0.32,0.27,0.36,0.33,0.38,0.34,0.10
156,0.48,0.49,0.21,0.35,0.38,0.28,0.21,0.18,0.48,0.10,...,0.43,0.18,0.35,0.18,0.10,0.43,0.35,0.21,0.08,0.28
157,0.10,0.83,0.32,0.54,0.08,0.24,0.32,0.08,0.10,0.05,...,0.08,0.08,0.54,0.08,0.05,0.08,0.54,0.32,0.13,0.24
158,0.49,0.27,0.07,0.06,0.30,0.31,0.07,0.48,0.49,0.17,...,0.37,0.48,0.06,0.48,0.17,0.37,0.06,0.07,0.17,0.31


In [30]:
vect = []
for i in range(8192):
    dist = distances_df.loc[:, f'dist{i}']
    diff = diff_df.loc[:, f'diff{i}']
    busy = busy_df.loc[:, f'busy{i}']
    result = pd.concat([dist, diff, busy], axis=1, join="outer")
    result = result.sort_values(by=[f'dist{i}', f'diff{i}', f'busy{i}'])
    vect.append(list(result.loc[[result.index[0], result.index[-1]]].values))

In [31]:
vect

[[array([1.49861636e-01, 1.00000000e+04, 4.90000000e-01]),
  array([1.06971187e+00, 1.00000000e+04, 5.00000000e-02])],
 [array([0.06388294, 0.19715477, 0.22      ]),
  array([0.98619526, 0.24557008, 0.88      ])],
 [array([0.06342037, 0.13342125, 0.16      ]),
  array([0.91918404, 0.15396322, 0.27      ])],
 [array([0.25270365, 0.21807328, 0.03      ]),
  array([1.24383143, 0.21807328, 0.09      ])],
 [array([0.30579591, 0.11021712, 0.07      ]),
  array([1.22910806, 0.12386968, 0.16      ])],
 [array([0.00424897, 0.3169663 , 0.2       ]),
  array([0.76759969, 0.46405661, 0.4       ])],
 [array([0.00927111, 0.16373208, 0.1       ]),
  array([0.77193203, 0.16373208, 0.49      ])],
 [array([0.06741139, 0.64463918, 0.32      ]),
  array([1.03659999, 1.81404121, 0.19      ])],
 [array([8.82561565e-02, 1.00000000e+04, 4.90000000e-01]),
  array([9.79957268e-01, 1.00000000e+04, 5.00000000e-02])],
 [array([0.10469454, 2.8405315 , 0.01      ]),
  array([1.30345149, 2.8405315 , 0.16      ])],
 [

In [32]:
X = np.reshape(np.array(vect), (-1, 3))

In [33]:
y = np.reshape(np.array([[0, 1] for _ in range(8192)]), (8192*2))

In [36]:
y = np.reshape(y, 8192*2)

In [37]:
from skopt import BayesSearchCV
from sklearn.datasets import load_digits
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=.25, random_state=0)

# log-uniform: understand as search over p = exp(x) by varying x
opt = BayesSearchCV(
    SVC(),
    {
        'C': (1e-6, 1e+6, 'log-uniform'),
        'gamma': (1e-6, 1e+1, 'log-uniform'),
        'degree': (1, 8),  # integer valued parameter
        'kernel': ['linear', 'poly', 'rbf'],  # categorical parameter
    },
    n_iter=32,
    cv=3
)

opt.fit(X_train, y_train)

print("val. score: %s" % opt.best_score_)
print("test score: %s" % opt.score(X_test, y_test))