In [1]:
import sys

sys.executable.split('/')[-3]

'mobility_venv'

In [2]:
import datetime
import json
import os
import warnings

import numpy as np
import pandas as pd
from tqdm import tqdm

warnings.filterwarnings('ignore')
tqdm.pandas()
pd.options.display.max_columns = None

## Load data

In [3]:
def save_object(obj, name, out_dir='out'):
    file_name = name if name.endswith('.json') else (name + '.json')
    file_path = os.path.join(out_dir, file_name)
    with open(file_path, 'w') as f:
        json.dump(obj, f)

def load_object(name, in_dir='out'):
    file_name = name if name.endswith('.json') else (name + '.json')
    file_path = os.path.join(in_dir, file_name)
    with open(file_path, 'r') as f:
        return json.load(f)

In [4]:
# OSM facilities
data_dir = '../../data/external/osm'
facilities_file = 'facilities.csv'

facilities_df = pd.read_csv(os.path.join(data_dir, facilities_file))

# KBR regions
data_dir = '../../data/interim'
regions_file = 'regions_info.csv'

regions_df = pd.read_csv(os.path.join(data_dir, regions_file), index_col=0)

# tags to 'other' subcategories map
data_dir = '../../data/processed/mc/travel_planning/other_travels_split'
tags_map_file = 'tags.json'

tags_map = load_object(name=tags_map_file, in_dir=data_dir)
other_travels_map = tags_map['other']

# gravity distribution
data_dir = '../../data/processed/mc/travel_planning/old'
gravity_file = 'gravity_dist.json'

gravity_dist = load_object(name=gravity_file, in_dir=data_dir)

# distance between regions
data_dir = '../../data/processed/kr'
distance_between_regions_file = 'distance_between_regions.json'
distance_between_regions = load_object(name=distance_between_regions_file, in_dir=data_dir)

### Facilities dataframe

In [5]:
print(facilities_df.info())
facilities_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69467 entries, 0 to 69466
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         69467 non-null  object 
 1   category   69467 non-null  object 
 2   tag        69467 non-null  object 
 3   name       7131 non-null   object 
 4   region_id  69467 non-null  int64  
 5   x          69467 non-null  float64
 6   y          69467 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 3.7+ MB
None


Unnamed: 0,id,category,tag,name,region_id,x,y
0,shop_265644030,shop,greengrocer,,48,6430831.0,5664721.0
1,shop_265644193,shop,supermarket,Rabat,47,6430721.0,5665215.0
2,shop_265644198,shop,kiosk,,47,6430865.0,5665051.0
3,shop_265644220,shop,convenience,Przystanek,47,6431049.0,5664961.0
4,shop_266010556,shop,kiosk,,117,6429292.0,5666063.0


In [6]:
facilities_df['tag'].unique()

array(['greengrocer', 'supermarket', 'kiosk', 'convenience', 'computer',
       'furniture', 'gift', 'alcohol', 'books', 'pet', 'outdoor', 'shoes',
       'electronics', 'hairdresser', 'bicycle', 'optician', 'bakery',
       'copyshop', 'clothes', 'money_lender', 'car', 'florist', 'pastry',
       'toys', 'beauty', 'garden_centre', 'car_repair', 'chemist',
       'musical_instrument', 'doityourself', 'curtain', 'tyres',
       'bookmaker', 'trade', 'car_parts', 'lighting', 'hardware',
       'jewelry', 'hearing_aids', 'butcher', 'paint', 'variety_store',
       'health_food', 'newsagent', 'pawnbroker', 'stationery',
       'bathroom_furnishing', 'travel_agency', 'sports', 'motorcycle',
       'radiotechnics', 'seafood', 'photo', 'games', 'houseware',
       'herbalist', 'perfumery', 'photo_studio', 'nutrition_supplements',
       'deli', 'funeral_directors', 'appliance', 'fishing',
       'mobile_phone', 'model', 'wholesale', 'honey', 'dry_cleaning',
       'baby_goods', 'sewing', 'cof

### Regions dataframe

In [7]:
print(regions_df.info())
regions_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 375 entries, 0 to 374
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Regions                375 non-null    int64 
 1   Regions_agr            375 non-null    object
 2   Region_name            375 non-null    object
 3   Objects_offices        375 non-null    int64 
 4   Objects_shops          375 non-null    int64 
 5   Objects_apartments     375 non-null    int64 
 6   Objects_industry       375 non-null    int64 
 7   Objects_schools        375 non-null    int64 
 8   Objects_universitites  375 non-null    int64 
 9   Poluation_UM           375 non-null    int64 
 10  Population_6plus_UM    375 non-null    int64 
 11  Population_BD          375 non-null    int64 
 12  Population_AUT         375 non-null    int64 
 13  Population_MET1        375 non-null    int64 
 14  Population_MET2        375 non-null    int64 
 15  Jobs_GUS               

Unnamed: 0,Regions,Regions_agr,Region_name,Objects_offices,Objects_shops,Objects_apartments,Objects_industry,Objects_schools,Objects_universitites,Poluation_UM,Population_6plus_UM,Population_BD,Population_AUT,Population_MET1,Population_MET2,Jobs_GUS,Jobs_AUT,Jobs_MET3,Jobs_MET4
0,1,A10,Rynek,106538,58760,131606,288,0,0,1695,1645,5785,1698,2714,2105,10354,6986,9875,2262
1,2,A9,UWr,12954,4082,101166,3824,244,5911,1719,1657,565,1710,226,339,4798,609,1026,182
2,3,A9,Hala Targowa,9500,12060,62071,271,434,1412,1301,1253,1348,1391,548,713,4751,1139,2349,730
3,4,A11,Pl. Dominikański,70156,104991,64536,188,0,0,650,620,704,726,269,191,4250,722,1566,217
4,5,A11,Skargi,13656,10927,60003,286,1785,0,1072,1034,696,1067,400,348,5194,965,1305,321


### Other travels map keys

In [8]:
other_subcats = list(other_travels_map.keys())
other_subcats

['gastronomy',
 'culture_and_entertainment',
 'adults_entertainment',
 'sport',
 'official_matters',
 'other',
 'grocery_shopping',
 'other_shopping',
 'pharmacy',
 'healthcare',
 'services',
 'leisure_time_schools',
 'religion']

### Gravity keys

In [9]:
gravity_dist.keys()

dict_keys(['dom', 'praca', 'szkola', 'uczelnia', 'inne'])

### Distance between regions keys

In [10]:
distance_between_regions.keys()

dict_keys(['22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '140', '141', '142', '143', '144', '145', '146', '147', '148', '149', '150', '151', '152', '153', '154', '155', '156', '157'

### Prepare other subcats to gravity

In [11]:
def add_other_subcat(row):
    subcat = np.NaN
    for other_subcat, tags in other_travels_map.items():
        if row['tag'] in tags:
            subcat = other_subcat
    return subcat

facilities_df['other_subcat'] = facilities_df.apply(add_other_subcat, axis=1)

In [12]:
facilities_df['other_subcat'].value_counts(dropna=False)

NaN                          62606
other_shopping                1852
gastronomy                    1401
grocery_shopping              1300
services                       900
healthcare                     316
official_matters               302
pharmacy                       215
adults_entertainment           142
other                          132
culture_and_entertainment      129
religion                       117
leisure_time_schools            38
sport                           17
Name: other_subcat, dtype: int64

In [13]:
for other_subcat in other_subcats:
    subcat_tags = facilities_df[ 
        facilities_df['other_subcat'] == other_subcat
    ]['tag'].unique()
    print('__'*40)
    print(f'{other_subcat}:\n\t{subcat_tags}')

________________________________________________________________________________
gastronomy:
	['fast_food' 'restaurant' 'cafe' 'ice_cream' 'bar' 'bbq']
________________________________________________________________________________
culture_and_entertainment:
	['library' 'cinema' 'theatre' 'arts_centre' 'community_centre'
 'events_venue' 'music_venue' 'riding_hall' 'concert_hall']
________________________________________________________________________________
adults_entertainment:
	['pub' 'nightclub' 'biergarten']
________________________________________________________________________________
sport:
	['gymnasium' 'sports_hall' 'sports_centre' 'stadium']
________________________________________________________________________________
official_matters:
	['police' 'bank' 'social_facility' 'townhall' 'government' 'public']
________________________________________________________________________________
other:
	['trade' 'vehicle_inspection' 'research_institute' 'bus_station'
 'social_cent

In [14]:
grouped_facilities_df = facilities_df.groupby(by=['other_subcat', 'region_id']).size().reset_index(name='count')

grouped_facilities_df.head()

Unnamed: 0,other_subcat,region_id,count
0,adults_entertainment,1,15
1,adults_entertainment,2,4
2,adults_entertainment,3,3
3,adults_entertainment,4,4
4,adults_entertainment,6,13


In [15]:
pivot_facilities_df = grouped_facilities_df.pivot(index='region_id', columns='other_subcat', values='count').reset_index()
# pivot_facilities_df.reset_index(inplace=True, drop=True)
# pivot_facilities_df.index.name = None
pivot_facilities_df.fillna(0, inplace=True)

pivot_facilities_df.head()

other_subcat,region_id,adults_entertainment,culture_and_entertainment,gastronomy,grocery_shopping,healthcare,leisure_time_schools,official_matters,other,other_shopping,pharmacy,religion,services,sport
0,1,15.0,3.0,97.0,14.0,4.0,2.0,11.0,1.0,36.0,3.0,2.0,11.0,0.0
1,2,4.0,2.0,51.0,4.0,0.0,1.0,2.0,0.0,16.0,0.0,3.0,4.0,0.0
2,3,3.0,0.0,13.0,12.0,0.0,0.0,1.0,0.0,10.0,2.0,3.0,4.0,1.0
3,4,4.0,2.0,26.0,15.0,8.0,0.0,10.0,0.0,52.0,2.0,1.0,8.0,0.0
4,5,0.0,2.0,7.0,1.0,3.0,0.0,2.0,0.0,7.0,0.0,1.0,5.0,1.0


## Prepare new gravity (including other subcats)

In [17]:
leave_old = list(gravity_dist.keys())
leave_old.remove('inne')

regions = regions_df['Regions'].to_list()

# list(set(regions) - set(pivot_facilities_df['region_id'].unique()))

In [19]:
# int(pivot_facilities_df[
#     pivot_facilities_df['region_id'] == 404
# ]['adults_entertainment'])

In [20]:
new_gravity_dist = {}
fill_0_distance_with = 150

# ['dom', 'praca', 'szkola', 'uczelnia']
for destination in leave_old:
    new_gravity_dist[destination] = gravity_dist[destination]

# other subcats
for destination in other_subcats:
    new_gravity_dist[destination] = {}

    for from_region in tqdm(regions):
        new_gravity_dist[destination][from_region] = {}

        for to_region in regions:
            try:
                count = int(pivot_facilities_df[
                    pivot_facilities_df['region_id'] == to_region
                ][destination])
            except TypeError:
                count = 0
            # if type(columns_for_dest[destination]) == str:
            #     value_j = gdf[gdf['Region']==to_region].iloc[0][columns_for_dest[destination]]
            # else:
            #     value_j = columns_for_dest[destination]

            distance = distance_between_regions[str(from_region)][str(to_region)]
            if distance == 0:
                distance = fill_0_distance_with

            new_gravity_dist[destination][from_region][to_region] = count / distance

        # normalization
        s = np.sum(np.array(list(new_gravity_dist[destination][from_region].values())))
        for to_region, count in new_gravity_dist[destination][from_region].items():
            new_gravity_dist[destination][from_region][to_region] = count / s

100%|██████████| 375/375 [01:08<00:00,  5.45it/s]
100%|██████████| 375/375 [01:04<00:00,  5.80it/s]
100%|██████████| 375/375 [01:06<00:00,  5.61it/s]
100%|██████████| 375/375 [00:59<00:00,  6.29it/s]
100%|██████████| 375/375 [00:59<00:00,  6.31it/s]
100%|██████████| 375/375 [00:59<00:00,  6.30it/s]
100%|██████████| 375/375 [00:59<00:00,  6.30it/s]
100%|██████████| 375/375 [00:59<00:00,  6.30it/s]
100%|██████████| 375/375 [00:59<00:00,  6.28it/s]
100%|██████████| 375/375 [00:59<00:00,  6.31it/s]
100%|██████████| 375/375 [00:59<00:00,  6.30it/s]
100%|██████████| 375/375 [00:59<00:00,  6.29it/s]
100%|██████████| 375/375 [00:59<00:00,  6.31it/s]


In [22]:
destination = 'adults_entertainment'
from_region = 10
to_region = 1

print(new_gravity_dist[destination][from_region][to_region])

0.10018998707259459


In [23]:
out_dir = '../../data/processed/mc/travel_planning'

if not os.path.exists(out_dir):
    os.makedirs(out_dir)

In [25]:
save_object(
    obj=new_gravity_dist,
    name='gravity_dist', 
    out_dir=out_dir
)