In [1]:
import json
import pandas as pd
import numpy as np

In [2]:
from scipy.spatial.distance import cdist

In [3]:
with open('offices.json', encoding='utf-8') as f:
    offices = json.load(f)

In [4]:
kremlin_lat = 55.752
kremlin_lon = 37.618
degree_to_meter = 111319
moscow_rad = 70000 / degree_to_meter
office_latitudes = []
office_longitudes = []
max_people = []
opening_times = [[] for _ in range(7)]
closing_times = [[] for _ in range(7)]
business_morning = [[] for _ in range(7)]
business_day = [[] for _ in range(7)]
business_evening = [[] for _ in range(7)]

In [5]:
ord_to_day = {0: 'mn', 1: 'ts', 2: 'wd', 3: 'th', 4: 'fr', 5: 'st', 6: 'sn'}
day_to_ord = {'пн': 0, 'вт': 1, 'ср': 2, 'чт': 3, 'пт': 4, 'сб': 5, 'вс': 6}
ans = []
for obj in offices:
    if  'Не' in obj['openHours'][0]['days']:
        continue

    if len(obj['openHours']) == 7:
        ans.append((obj['openHours'] + [float(obj['latitude']), float(obj['longitude'])]).copy())
        continue

    new_dict_list = list()
    for l in obj['openHours']:
        if l["days"] == 'перерыв':
            for i in range(len(new_dict_list)):
                new_dict_list[i]['hours'] = new_dict_list[i]['hours'][:5] + '-' + l['hours'][:5] + ', ' + l['hours'][6:] + '-' + new_dict_list[i]['hours'][6:]
        elif len(l["days"]) > 2:
            s_front = l["days"][:2]
            s_back = l["days"][3:]

            for i in range(day_to_ord[s_front], day_to_ord[s_back] + 1):
                if i + 1 <= len(new_dict_list):
                    continue
                new_dict_list.append({'days': ord_to_day[i], "hours": l["hours"]})
        else:
            new_dict_list.append({"days": l["days"], "hours": l["hours"]})
    ans.append((new_dict_list + [float(obj['latitude']), float(obj['longitude'])]).copy())

In [6]:
ans

[[{'days': 'пн', 'hours': '09:00-18:00'},
  {'days': 'вт', 'hours': '09:00-18:00'},
  {'days': 'ср', 'hours': '09:00-18:00'},
  {'days': 'чт', 'hours': '09:00-18:00'},
  {'days': 'пт', 'hours': '09:00-17:00'},
  {'days': 'сб', 'hours': 'выходной'},
  {'days': 'вс', 'hours': 'выходной'},
  56.184479,
  36.984314],
 [{'days': 'mn', 'hours': '09:00-18:00'},
  {'days': 'ts', 'hours': '09:00-18:00'},
  {'days': 'wd', 'hours': '09:00-18:00'},
  {'days': 'th', 'hours': '09:00-18:00'},
  {'days': 'пт', 'hours': '09:00-17:00'},
  {'days': 'st', 'hours': 'выходной'},
  {'days': 'sn', 'hours': 'выходной'},
  56.183239,
  36.9757],
 [{'days': 'пн', 'hours': '10:00-19:00'},
  {'days': 'вт', 'hours': '10:00-19:00'},
  {'days': 'ср', 'hours': '10:00-19:00'},
  {'days': 'чт', 'hours': '10:00-19:00'},
  {'days': 'пт', 'hours': '10:00-18:00'},
  {'days': 'сб', 'hours': 'выходной'},
  {'days': 'вс', 'hours': 'выходной'},
  56.012386,
  37.482059],
 [{'days': 'пн', 'hours': '09:00-18:00'},
  {'days': 'вт'

In [7]:
for elem in ans:
    office_latitudes.append(elem[7])
    office_longitudes.append(elem[8])
    max_people.append(np.random.randint(1000, 3000))
    for i in range(7):
        opening_times[i].append(elem[i]['hours'][:5] if elem[i]['hours'] != 'выходной' else None)
        closing_times[i].append(elem[i]['hours'][-5:] if elem[i]['hours'] != 'выходной' else None)
        business_morning[i].append(round(np.random.uniform(0.01, 0.5), 2))
        business_day[i].append(round(np.random.uniform(0.01, 1), 2))
        business_evening[i].append(round(np.random.uniform(0.01, 0.5), 2))

In [8]:
len(office_latitudes)

160

In [9]:
len(office_longitudes)

160

In [10]:
len(max_people)

160

In [11]:
for i in range(7):
    print(len(opening_times[i]))
    print(len(closing_times[i]))
    print(len(business_morning[i]))
    print(len(business_day[i]))
    print(len(business_evening[i]))

160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160
160


In [12]:
db = pd.DataFrame()
db['Latitude'] = pd.Series(office_latitudes)

In [13]:
db

Unnamed: 0,Latitude
0,56.184479
1,56.183239
2,56.012386
3,56.010849
4,56.008335
...,...
155,55.478329
156,55.438740
157,55.432919
158,55.427883


In [14]:
db['Longitude'] = pd.Series(office_longitudes)
db['Max_People'] = pd.Series(max_people)
for i in range(7):
    db[f'opening_{ord_to_day[i]}'] = pd.Series(opening_times[i]).str[:2].fillna(0).astype(int)
    db[f'closing_{ord_to_day[i]}'] = pd.Series(closing_times[i]).str[:2].fillna(0).astype(int)
    db[f'morning_{ord_to_day[i]}'] = pd.Series(business_morning[i])
    db[f'day_{ord_to_day[i]}'] = pd.Series(business_day[i])
    db[f'evening_{ord_to_day[i]}'] = pd.Series(business_evening[i])

In [15]:
db

Unnamed: 0,Latitude,Longitude,Max_People,opening_mn,closing_mn,morning_mn,day_mn,evening_mn,opening_ts,closing_ts,...,opening_st,closing_st,morning_st,day_st,evening_st,opening_sn,closing_sn,morning_sn,day_sn,evening_sn
0,56.184479,36.984314,2100,9,18,0.35,0.66,0.29,9,18,...,0,0,0.34,0.15,0.30,0,0,0.18,0.69,0.11
1,56.183239,36.975700,2486,9,18,0.03,0.55,0.45,9,18,...,0,0,0.39,0.49,0.48,0,0,0.40,0.87,0.08
2,56.012386,37.482059,2780,10,19,0.27,0.54,0.23,10,19,...,0,0,0.04,0.59,0.26,0,0,0.35,0.13,0.10
3,56.010849,37.854359,1221,9,18,0.09,0.70,0.07,9,18,...,0,0,0.26,0.61,0.39,0,0,0.17,0.32,0.20
4,56.008335,37.851467,2758,9,18,0.31,0.29,0.29,9,18,...,0,0,0.29,0.89,0.38,0,0,0.20,0.03,0.37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,55.478329,37.298706,2195,9,18,0.17,0.21,0.41,9,18,...,0,0,0.25,0.32,0.33,0,0,0.32,0.54,0.11
156,55.438740,37.767536,2470,9,18,0.22,0.09,0.49,9,18,...,0,0,0.04,0.49,0.09,0,0,0.36,0.26,0.50
157,55.432919,37.550838,1699,9,18,0.21,0.58,0.46,9,18,...,0,0,0.21,0.59,0.43,0,0,0.39,0.61,0.35
158,55.427883,37.550110,2829,9,18,0.41,0.53,0.34,9,18,...,0,0,0.21,0.09,0.07,0,0,0.41,0.89,0.29


In [16]:
examples_quantity = 8192
examples = pd.DataFrame()
usr_lat = []
usr_lon = []
days = [np.random.randint(0, 5) for i in range(examples_quantity)]
time = [np.random.uniform(9, 19) for i in range(examples_quantity)]
for i in range(examples_quantity):
    rho = np.random.uniform(0, moscow_rad)
    phi = np.random.uniform(0, 2 * np.pi)
    usr_lat.append(kremlin_lat + rho * np.sin(phi))
    usr_lon.append(kremlin_lon + rho * np.cos(phi))

In [17]:
examples['Latitude'] = usr_lat
examples['Longitude'] = usr_lon
examples['day'] = days
examples['time'] = time

In [18]:
examples

Unnamed: 0,Latitude,Longitude,day,time
0,55.838372,37.224573,1,18.179969
1,55.935546,37.045121,1,9.029633
2,55.241027,37.674616,3,18.694062
3,56.148145,37.715554,3,14.783217
4,55.547784,37.560169,1,9.027440
...,...,...,...,...
8187,55.664508,37.681980,4,16.385746
8188,55.775239,37.773463,0,9.673384
8189,56.305175,37.418889,2,16.512731
8190,55.644610,37.459987,3,9.627474


In [19]:
db

Unnamed: 0,Latitude,Longitude,Max_People,opening_mn,closing_mn,morning_mn,day_mn,evening_mn,opening_ts,closing_ts,...,opening_st,closing_st,morning_st,day_st,evening_st,opening_sn,closing_sn,morning_sn,day_sn,evening_sn
0,56.184479,36.984314,2100,9,18,0.35,0.66,0.29,9,18,...,0,0,0.34,0.15,0.30,0,0,0.18,0.69,0.11
1,56.183239,36.975700,2486,9,18,0.03,0.55,0.45,9,18,...,0,0,0.39,0.49,0.48,0,0,0.40,0.87,0.08
2,56.012386,37.482059,2780,10,19,0.27,0.54,0.23,10,19,...,0,0,0.04,0.59,0.26,0,0,0.35,0.13,0.10
3,56.010849,37.854359,1221,9,18,0.09,0.70,0.07,9,18,...,0,0,0.26,0.61,0.39,0,0,0.17,0.32,0.20
4,56.008335,37.851467,2758,9,18,0.31,0.29,0.29,9,18,...,0,0,0.29,0.89,0.38,0,0,0.20,0.03,0.37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,55.478329,37.298706,2195,9,18,0.17,0.21,0.41,9,18,...,0,0,0.25,0.32,0.33,0,0,0.32,0.54,0.11
156,55.438740,37.767536,2470,9,18,0.22,0.09,0.49,9,18,...,0,0,0.04,0.49,0.09,0,0,0.36,0.26,0.50
157,55.432919,37.550838,1699,9,18,0.21,0.58,0.46,9,18,...,0,0,0.21,0.59,0.43,0,0,0.39,0.61,0.35
158,55.427883,37.550110,2829,9,18,0.41,0.53,0.34,9,18,...,0,0,0.21,0.09,0.07,0,0,0.41,0.89,0.29


In [20]:
points1 = db.loc[:, ['Latitude', 'Longitude']].values

In [21]:
points2 = examples.loc[:, ['Latitude', 'Longitude']].values

In [22]:
distances = cdist(points1, points2)
distances_df = pd.DataFrame(distances, index=db.index, columns=['dist' + str(idx) for idx in examples.index])
distances_df

Unnamed: 0,dist0,dist1,dist2,dist3,dist4,dist5,dist6,dist7,dist8,dist9,...,dist8182,dist8183,dist8184,dist8185,dist8186,dist8187,dist8188,dist8189,dist8190,dist8191
0,0.421325,0.256253,1.169024,0.732143,0.858481,0.760244,0.764299,1.287981,0.748095,1.308752,...,0.639103,0.709147,0.820574,0.777645,0.527278,0.870120,0.888951,0.451024,0.719530,0.992454
1,0.425289,0.257238,1.173135,0.740686,0.863370,0.766835,0.770739,1.292405,0.756108,1.314737,...,0.645165,0.717694,0.827139,0.784274,0.533239,0.876307,0.896041,0.459657,0.724331,0.999096
2,0.310774,0.443643,0.795030,0.270094,0.471123,0.278217,0.291037,0.888919,0.226089,0.830724,...,0.220895,0.251929,0.334587,0.291629,0.181690,0.401233,0.375706,0.299526,0.368438,0.494335
3,0.652977,0.812734,0.790527,0.195235,0.548614,0.331694,0.350648,0.835954,0.148330,0.659264,...,0.426636,0.213761,0.328857,0.323448,0.486126,0.386868,0.249111,0.525607,0.538201,0.356812
4,0.649526,0.809624,0.787425,0.194985,0.544942,0.327876,0.346838,0.833221,0.145158,0.657506,...,0.422863,0.213251,0.325164,0.319651,0.482577,0.383331,0.245802,0.524631,0.534371,0.354071
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,0.367595,0.522831,0.444545,0.788933,0.270531,0.438322,0.420074,0.573874,0.656329,0.718188,...,0.366202,0.779240,0.454343,0.448236,0.386899,0.426101,0.559955,0.835535,0.231648,0.554258
156,0.674177,0.876755,0.218459,0.711307,0.234289,0.363782,0.349783,0.261923,0.556556,0.250760,...,0.449916,0.715841,0.313193,0.352601,0.557590,0.241435,0.336551,0.933951,0.370093,0.223819
157,0.520424,0.713010,0.228349,0.733948,0.115243,0.347503,0.327468,0.348122,0.580560,0.462140,...,0.364060,0.731611,0.327457,0.346807,0.448393,0.266142,0.408343,0.882180,0.230362,0.352213
158,0.523904,0.716055,0.224537,0.739019,0.120322,0.352580,0.332548,0.344946,0.585607,0.462203,...,0.368899,0.736692,0.332407,0.351860,0.452865,0.270889,0.412968,0.887052,0.234718,0.355974


In [23]:
examples['day'].values

array([1, 1, 3, ..., 2, 3, 0], dtype=int64)

In [24]:
diff_matrix = [[0] * 8192 for _ in range(160)]
days = examples['day'].values
coming = examples['time'].values
for i in range(160):
    for j in range(8192):
        closing_hours = db[f'closing_{ord_to_day[days[j]]}'].values
        diff_matrix[i][j] = 1 / max(closing_hours[i] - coming[j], 0.0001)

In [25]:
diff_df = pd.DataFrame(diff_matrix, index=db.index, columns=['diff' + str(idx) for idx in examples.index])

In [26]:
diff_df

Unnamed: 0,diff0,diff1,diff2,diff3,diff4,diff5,diff6,diff7,diff8,diff9,...,diff8182,diff8183,diff8184,diff8185,diff8186,diff8187,diff8188,diff8189,diff8190,diff8191
0,10000.000000,0.111478,10000.000000,0.310870,0.111451,0.111946,0.467055,0.134825,1.622422,0.119528,...,0.139766,0.134526,1.363643,0.170962,0.137141,1.627990,0.120097,0.672373,0.119438,1.774779
1,10000.000000,0.111478,10000.000000,0.310870,0.111451,0.111946,0.467055,0.134825,1.622422,0.119528,...,0.139766,0.134526,1.363643,0.170962,0.137141,1.627990,0.120097,0.672373,0.119438,1.774779
2,1.219466,0.100297,3.268637,0.237148,0.100275,0.100676,0.318362,0.118806,0.618673,0.106766,...,0.122627,0.118575,0.576924,0.146001,0.120601,0.619481,0.107220,0.402047,0.106695,0.639611
3,10000.000000,0.111478,10000.000000,0.310870,0.111451,0.111946,0.467055,0.134825,1.622422,0.119528,...,0.139766,0.134526,1.363643,0.170962,0.137141,1.627990,0.120097,0.672373,0.119438,1.774779
4,10000.000000,0.111478,10000.000000,0.310870,0.111451,0.111946,0.467055,0.134825,1.622422,0.119528,...,0.139766,0.134526,1.363643,0.170962,0.137141,1.627990,0.120097,0.672373,0.119438,1.774779
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,10000.000000,0.111478,10000.000000,0.310870,0.111451,0.111946,0.467055,0.134825,1.622422,0.119528,...,0.139766,0.134526,1.363643,0.170962,0.137141,1.627990,0.120097,0.672373,0.119438,1.774779
156,10000.000000,0.111478,10000.000000,0.310870,0.111451,0.111946,0.467055,0.134825,1.622422,0.119528,...,0.139766,0.134526,1.363643,0.170962,0.137141,1.627990,0.120097,0.672373,0.119438,1.774779
157,10000.000000,0.111478,10000.000000,0.310870,0.111451,0.111946,0.467055,0.134825,1.622422,0.119528,...,0.139766,0.134526,1.363643,0.170962,0.137141,1.627990,0.120097,0.672373,0.119438,1.774779
158,10000.000000,0.111478,10000.000000,0.310870,0.111451,0.111946,0.467055,0.134825,1.622422,0.119528,...,0.139766,0.134526,1.363643,0.170962,0.137141,1.627990,0.120097,0.672373,0.119438,1.774779


In [27]:
busy_matrix = [[0] * 8192 for _ in range(160)]
days = examples['day'].values
time = examples['time'].values
for i in range(160):
    for j in range(8192):
        if time[j] < 12:
            business = db[f'morning_{ord_to_day[days[j]]}'].values
        elif time[j] > 15:
            business = db[f'evening_{ord_to_day[days[j]]}'].values
        else:
            business = db[f'day_{ord_to_day[days[j]]}'].values
        busy_matrix[i][j] = business[i]


In [28]:
busy_df = pd.DataFrame(busy_matrix, index=db.index, columns=['busy' + str(idx) for idx in examples.index])

In [29]:
busy_df

Unnamed: 0,busy0,busy1,busy2,busy3,busy4,busy5,busy6,busy7,busy8,busy9,...,busy8182,busy8183,busy8184,busy8185,busy8186,busy8187,busy8188,busy8189,busy8190,busy8191
0,0.33,0.26,0.37,0.18,0.26,0.26,0.12,0.26,0.12,0.39,...,0.35,0.46,0.12,0.09,0.39,0.34,0.35,0.12,0.42,0.29
1,0.37,0.49,0.15,0.98,0.49,0.49,0.37,0.49,0.37,0.25,...,0.03,0.47,0.37,0.38,0.25,0.15,0.03,0.37,0.39,0.45
2,0.20,0.20,0.18,0.33,0.20,0.20,0.13,0.20,0.13,0.18,...,0.27,0.49,0.13,0.42,0.18,0.43,0.27,0.13,0.48,0.23
3,0.49,0.13,0.34,0.99,0.13,0.13,0.24,0.13,0.24,0.45,...,0.09,0.35,0.24,0.39,0.45,0.45,0.09,0.24,0.13,0.07
4,0.50,0.23,0.36,0.89,0.23,0.23,0.06,0.23,0.06,0.14,...,0.31,0.31,0.06,0.07,0.14,0.13,0.31,0.06,0.27,0.29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,0.22,0.43,0.05,0.25,0.43,0.43,0.15,0.43,0.15,0.39,...,0.17,0.27,0.15,0.54,0.39,0.31,0.17,0.15,0.39,0.41
156,0.24,0.45,0.29,0.13,0.45,0.45,0.15,0.45,0.15,0.18,...,0.22,0.26,0.15,0.93,0.18,0.21,0.22,0.15,0.14,0.49
157,0.33,0.04,0.44,0.28,0.04,0.04,0.39,0.04,0.39,0.48,...,0.21,0.32,0.39,0.54,0.48,0.22,0.21,0.39,0.34,0.46
158,0.47,0.10,0.13,0.74,0.10,0.10,0.16,0.10,0.16,0.24,...,0.41,0.15,0.16,0.96,0.24,0.02,0.41,0.16,0.18,0.34


In [30]:
vect = []
for i in range(8192):
    dist = distances_df.loc[:, f'dist{i}']
    diff = diff_df.loc[:, f'diff{i}']
    busy = busy_df.loc[:, f'busy{i}']
    result = pd.concat([dist, diff, busy], axis=1, join="outer")
    result = result.sort_values(by=[f'dist{i}', f'diff{i}', f'busy{i}'])
    vect.append(list(result.loc[[result.index[0], result.index[-1]]].values))

In [31]:
vect

[[array([7.45034189e-02, 1.00000000e+04, 2.00000000e-02]),
  array([1.11683015e+00, 1.00000000e+04, 3.00000000e-02])],
 [array([0.10747083, 0.11147816, 0.19      ]),
  array([1.31979386, 0.11147816, 0.49      ])],
 [array([2.18459417e-01, 1.00000000e+04, 2.90000000e-01]),
  array([1.17313515e+00, 1.00000000e+04, 1.50000000e-01])],
 [array([0.19498464, 0.31086963, 0.89      ]),
  array([0.95661487, 0.31086963, 0.87      ])],
 [array([0.01691453, 0.11145092, 0.5       ]),
  array([0.86337013, 0.11145092, 0.49      ])],
 [array([0.00252045, 0.11194641, 0.15      ]),
  array([0.77700564, 0.11194641, 0.17      ])],
 [array([0.00770393, 0.31836212, 0.37      ]),
  array([0.77321105, 0.46705463, 0.41      ])],
 [array([0.26192259, 0.1348245 , 0.45      ]),
  array([1.29240455, 0.1348245 , 0.49      ])],
 [array([0.08869827, 1.62242237, 0.45      ]),
  array([0.89203606, 1.62242237, 0.39      ])],
 [array([0.24816518, 0.11952777, 0.18      ]),
  array([1.31473736, 0.11952777, 0.25      ])],
 [

In [40]:
X = np.reshape(np.array(vect), (-1, 3))

[[7.45034189e-02 1.00000000e+04 2.00000000e-02]
 [1.11683015e+00 1.00000000e+04 3.00000000e-02]
 [1.07470826e-01 1.11478160e-01 1.90000000e-01]
 [1.31979386e+00 1.11478160e-01 4.90000000e-01]
 [2.18459417e-01 1.00000000e+04 2.90000000e-01]
 [1.17313515e+00 1.00000000e+04 1.50000000e-01]
 [1.94984642e-01 3.10869630e-01 8.90000000e-01]
 [9.56614866e-01 3.10869630e-01 8.70000000e-01]]


In [33]:
y = np.reshape(np.array([[0, 1] for _ in range(8192)]), (8192*2))

In [41]:
y = np.reshape(y, 8192*2)
print(y[:8])

[0 1 0 1 0 1 0 1]


In [35]:
from skopt import BayesSearchCV
from sklearn.datasets import load_digits
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X[:100], y[:100], train_size=0.75, test_size=.25, random_state=0)

# log-uniform: understand as search over p = exp(x) by varying x
opt = BayesSearchCV(
    SVC(),
    {
        'C': (1e-6, 1e+6, 'log-uniform'),
        'gamma': (1e-6, 1e+1, 'log-uniform'),
        'degree': (1, 8),  # integer valued parameter
        'kernel': ['linear', 'poly', 'rbf'],  # categorical parameter
    },
    n_iter=16,
    cv=3,
    verbose=16,
    n_jobs=16
)

opt.fit(X_train, y_train)

print("val. score: %s" % opt.best_score_)
print("test score: %s" % opt.score(X_test, y_test))

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
