In [26]:
import pandas as pd
import geopandas as gpd
import contextily as ctx
from shapely.geometry import Point, LineString
from tqdm import tqdm
import json

tqdm.pandas()

# Rozkład prawd. podjęcia podróży

In [27]:
data = pd.read_excel('../../data/raw/Ankiety.xlsx', "Wrocław_ankiety+podróże", engine='openpyxl', header=[0,1])
data.columns = [f'{i}_{j}' for i, j in data.columns]
df = data[['IDENTYFIKACJA ANKIETY_Nr rejonu komunikacyjnego', 'OPIS PODRÓŻY "ŹRÓDŁO"_Nr rejonu', 'OPIS PODRÓŻY "CEL"_Nr rejonu']]
df = df.iloc[0:14241]
df.columns = ['id_region', 'source', 'destination']

In [28]:
df.head()

Unnamed: 0,id_region,source,destination
0,67.0,67.0,68.0
1,67.0,68.0,67.0
2,67.0,,
3,67.0,,
4,67.0,,


In [29]:
def add_travel(row):
    try:
        if row['source'] >= 0:
            row['travel'] = True
        else:
            row['travel'] = False
    except Exception as e:
        print(row)

    return row

df = df.fillna(-1)
df = df.apply(add_travel, axis=1)

In [30]:
df.head(10)

Unnamed: 0,id_region,source,destination,travel
0,67.0,67.0,68.0,1.0
1,67.0,68.0,67.0,1.0
2,67.0,-1.0,-1.0,0.0
3,67.0,-1.0,-1.0,0.0
4,67.0,-1.0,-1.0,0.0
5,67.0,67.0,68.0,1.0
6,67.0,68.0,67.0,1.0
7,67.0,67.0,29.0,1.0
8,67.0,29.0,67.0,1.0
9,67.0,67.0,28.0,1.0


In [31]:
grouped_df = df.groupby(['id_region', 'travel']).count().reset_index()

In [32]:
grouped_df.head()

Unnamed: 0,id_region,travel,source,destination
0,1.0,0.0,8,8
1,1.0,1.0,19,19
2,2.0,0.0,1,1
3,2.0,1.0,55,55
4,3.0,0.0,3,3


In [33]:
travel_distributions = {}

for id_region in tqdm(df['id_region'].unique()):

    sub_df = grouped_df[(grouped_df['id_region'] == id_region) & (grouped_df['travel'] == 0)]
    if len(sub_df) > 0:
        f = sub_df.iloc[0]['destination']
    else:
        f = 0

    sub_df = grouped_df[(grouped_df['id_region'] == id_region) & (grouped_df['travel'] == 1)]
    if len(sub_df) > 0:
        t = sub_df.iloc[0]['destination']
    else:
        t = 0

    travel_distributions[int(id_region)] = {
        "true": t,
        "false": f
    }

travel_distributions

100%|██████████| 338/338 [00:00<00:00, 439.42it/s]


{67: {'true': 52.0, 'false': 6.0},
 69: {'true': 40.0, 'false': 1.0},
 53: {'true': 20.0, 'false': 20.0},
 59: {'true': 39.0, 'false': 2.0},
 135: {'true': 56.0, 'false': 24.0},
 33: {'true': 55.0, 'false': 7.0},
 38: {'true': 96.0, 'false': 7.0},
 62: {'true': 78.0, 'false': 2.0},
 61: {'true': 117.0, 'false': 3.0},
 48: {'true': 74.0, 'false': 7.0},
 121: {'true': 70.0, 'false': 3.0},
 117: {'true': 47.0, 'false': 8.0},
 36: {'true': 136.0, 'false': 7.0},
 57: {'true': 80.0, 'false': 9.0},
 85: {'true': 34.0, 'false': 19.0},
 35: {'true': 160.0, 'false': 14.0},
 34: {'true': 91.0, 'false': 10.0},
 46: {'true': 59.0, 'false': 4.0},
 22: {'true': 62.0, 'false': 3.0},
 15: {'true': 76.0, 'false': 4.0},
 12: {'true': 46.0, 'false': 3.0},
 32: {'true': 75.0, 'false': 3.0},
 16: {'true': 82.0, 'false': 5.0},
 81: {'true': 57.0, 'false': 4.0},
 19: {'true': 89.0, 'false': 5.0},
 20: {'true': 129.0, 'false': 4.0},
 55: {'true': 70.0, 'false': 46.0},
 333: {'true': 67.0, 'false': 8.0},
 84: {

# Rozkład prawd. liczby regionów

In [34]:
regions = gpd.read_file('../../data/raw/EtapII-REJONY_wroclaw.shp')
regions = regions.to_crs(epsg=3857)
regions.head()

Unnamed: 0,NUMBER,NAME,ZST_0_5,ZST_6_15,ZST_16_19,ZST_20_24,ZST_25_44,ZST_45_WE,ZST_WE_I_W,ZST_SUMA,...,ZCZ_25_44,ZCZ_45_WE,ZCZ_WE_I_W,ZCZ_SUMA,GUS_MI,GUS_MI_6_,BD_A_MI_6_,REGON_ZI,BD_A_ZI,geometry
0,22,Komandorska/Swobodna,111.0,200.0,68.0,82.0,670.0,606.0,1412.0,3149.0,...,32.0,11.0,3.0,64.0,3213,3102,3202,1255,1200,"POLYGON ((1896088.508 6638289.682, 1896090.449..."
1,23,Centrum PoÅudniowe,101.0,124.0,49.0,81.0,595.0,547.0,812.0,2309.0,...,55.0,17.0,3.0,95.0,2404,2303,2377,10258,4750,"POLYGON ((1895696.332 6638631.845, 1895609.850..."
2,24,Stysia,154.0,257.0,74.0,123.0,942.0,825.0,1257.0,3632.0,...,34.0,13.0,4.0,137.0,3769,3615,5118,1373,2296,"POLYGON ((1895195.706 6638966.861, 1895023.610..."
3,25,OstrÃ³w Tumski,20.0,26.0,10.0,18.0,219.0,220.0,227.0,740.0,...,22.0,10.0,14.0,54.0,794,774,799,413,1479,"POLYGON ((1897793.867 6641122.750, 1897605.051..."
4,26,Szczytnicka,221.0,320.0,139.0,274.0,1375.0,1349.0,1134.0,4812.0,...,49.0,13.0,3.0,94.0,4906,4685,4836,1468,1522,"POLYGON ((1898500.766 6641903.438, 1898632.875..."


In [35]:
regions_centroids = gpd.read_file('../../data/raw/EtapII-REJONY_wroclaw_centroidy.shp')
regions_centroids = regions_centroids.to_crs(epsg=3857)
regions_centroids.head()

Unnamed: 0,NUMBER,NAME,geometry
0,22,Komandorska/Swobodna,POINT (1895691.084 6638060.201)
1,23,Centrum Południowe,POINT (1895216.845 6638469.484)
2,24,Stysia,POINT (1894784.807 6638834.557)
3,25,Ostrów Tumski,POINT (1897613.196 6641635.311)
4,26,Szczytnicka,POINT (1898512.276 6641616.408)


In [36]:
def get_regions_num(source_id, target_id, plot=False):
    regions_num = 1

    if source_id != target_id:
        source_centroid = regions_centroids[regions_centroids['NUMBER'] == source_id].iloc[0]['geometry']
        target_centroid = regions_centroids[regions_centroids['NUMBER'] == target_id].iloc[0]['geometry']

        line = LineString([source_centroid, target_centroid])

        ids = []
        for i in range(len(regions)):
            if regions.iloc[i]['geometry'].intersects(line):
                ids.append(regions.iloc[i]['NUMBER'])
        
        regions_num = len(ids)

    if plot and regions_num > 1:
        ax = regions[regions['NUMBER'].isin(ids)].plot(figsize=(10,10), alpha=0.5, edgecolor='k')
        ctx.add_basemap(ax)
        ax.set_axis_off()

    return regions_num

In [37]:
get_regions_num(35, 35, True)

1

In [38]:
regions_set = set(regions['NUMBER'].unique())

def add_regions_number(row):
    if (row['source'] in regions_set) and (row['destination'] in regions_set):
        try:
            row['regions_num'] = get_regions_num(row['source'], row['destination'])
        except:
            print(row)
    else:
        row['regions_num'] = -1

    return row

df = df.progress_apply(add_regions_number, axis=1)

100%|██████████| 14241/14241 [10:01<00:00, 23.66it/s] 


In [39]:
df.head(10)

Unnamed: 0,id_region,source,destination,travel,regions_num
0,67.0,67.0,68.0,1.0,4.0
1,67.0,68.0,67.0,1.0,4.0
2,67.0,-1.0,-1.0,0.0,-1.0
3,67.0,-1.0,-1.0,0.0,-1.0
4,67.0,-1.0,-1.0,0.0,-1.0
5,67.0,67.0,68.0,1.0,4.0
6,67.0,68.0,67.0,1.0,4.0
7,67.0,67.0,29.0,1.0,7.0
8,67.0,29.0,67.0,1.0,7.0
9,67.0,67.0,28.0,1.0,6.0


In [40]:
grouped_df = df.groupby(['id_region', 'regions_num']).count().reset_index()

In [41]:
grouped_df.head(10)

Unnamed: 0,id_region,regions_num,source,destination,travel
0,1.0,-1.0,11,11,11
1,1.0,1.0,3,3,3
2,1.0,2.0,1,1,1
3,1.0,4.0,2,2,2
4,1.0,5.0,4,4,4
5,1.0,8.0,2,2,2
6,1.0,13.0,2,2,2
7,1.0,14.0,2,2,2
8,2.0,-1.0,1,1,1
9,2.0,1.0,9,9,9


In [42]:
grouped_df['regions_num'].max()

33.0

In [52]:
regions_num_distributions = {}

for id_region in tqdm(df['id_region'].unique()):

    regions_num_distributions[int(id_region)] = {}

    for num in range(1, int(grouped_df['regions_num'].max()) + 1):

        sub_df = grouped_df[(grouped_df['id_region'] == id_region) & (grouped_df['regions_num'] == num)]
        if len(sub_df) > 0:
            n = sub_df.iloc[0]['destination']
        else:
            n = 0

        regions_num_distributions[int(id_region)][int(num)] = n

regions_num_distributions

100%|██████████| 338/338 [00:09<00:00, 35.69it/s]


 8: 6.0,
  9: 10.0,
  10: 2.0,
  11: 0,
  12: 2.0,
  13: 4.0,
  14: 0,
  15: 0,
  16: 2.0,
  17: 0,
  18: 0,
  19: 0,
  20: 0,
  21: 0,
  22: 0,
  23: 0,
  24: 0,
  25: 0,
  26: 0,
  27: 0,
  28: 0,
  29: 0,
  30: 0,
  31: 0,
  32: 0,
  33: 0},
 37: {1: 0,
  2: 2.0,
  3: 4.0,
  4: 0,
  5: 0,
  6: 2.0,
  7: 1.0,
  8: 2.0,
  9: 0,
  10: 4.0,
  11: 0,
  12: 0,
  13: 0,
  14: 0,
  15: 0,
  16: 0,
  17: 0,
  18: 0,
  19: 0,
  20: 0,
  21: 0,
  22: 0,
  23: 0,
  24: 0,
  25: 0,
  26: 0,
  27: 0,
  28: 0,
  29: 0,
  30: 0,
  31: 0,
  32: 0,
  33: 0},
 90: {1: 5.0,
  2: 14.0,
  3: 15.0,
  4: 9.0,
  5: 3.0,
  6: 1.0,
  7: 6.0,
  8: 9.0,
  9: 18.0,
  10: 2.0,
  11: 5.0,
  12: 2.0,
  13: 2.0,
  14: 0,
  15: 2.0,
  16: 2.0,
  17: 0,
  18: 2.0,
  19: 0,
  20: 0,
  21: 0,
  22: 0,
  23: 0,
  24: 0,
  25: 0,
  26: 0,
  27: 0,
  28: 0,
  29: 0,
  30: 0,
  31: 0,
  32: 0,
  33: 0},
 228: {1: 3.0,
  2: 10.0,
  3: 1.0,
  4: 4.0,
  5: 0,
  6: 0,
  7: 0,
  8: 0,
  9: 0,
  10: 0,
  11: 2.0,
  12: 0,
  13: 2

# Rozkład prawd. regionu startowego

In [44]:
regions_info = pd.read_csv('../../data/interim/regions_info.csv')[['Regions', 'Population_AUT']]

In [45]:
regions_info.head()

Unnamed: 0,Regions,Population_AUT
0,1,1698
1,2,1710
2,3,1391
3,4,726
4,5,1067


In [46]:
poulation_distribution = {}

for id_region in tqdm(regions_info['Regions'].unique()):

    sub_df = regions_info[regions_info['Regions'] == id_region]
    poulation_distribution[str(id_region)] = str(sub_df.iloc[0]['Population_AUT'])


poulation_distribution

100%|██████████| 375/375 [00:00<00:00, 1584.36it/s]


{'1': '1698',
 '2': '1710',
 '3': '1391',
 '4': '726',
 '5': '1067',
 '6': '1221',
 '7': '677',
 '8': '768',
 '9': '1330',
 '10': '718',
 '11': '1897',
 '12': '2433',
 '13': '1774',
 '14': '2367',
 '15': '3033',
 '16': '3942',
 '17': '323',
 '18': '158',
 '19': '4595',
 '20': '6027',
 '21': '1',
 '22': '3202',
 '23': '2377',
 '24': '5119',
 '25': '799',
 '26': '4836',
 '27': '2595',
 '28': '1284',
 '29': '1577',
 '30': '522',
 '31': '1052',
 '32': '3502',
 '33': '2895',
 '34': '4071',
 '35': '8553',
 '36': '7113',
 '37': '682',
 '38': '3888',
 '39': '6989',
 '40': '4292',
 '41': '434',
 '42': '1301',
 '43': '16',
 '44': '0',
 '45': '12',
 '46': '2425',
 '47': '5335',
 '48': '3649',
 '49': '2433',
 '50': '3955',
 '51': '157',
 '52': '146',
 '53': '2172',
 '54': '8961',
 '55': '5171',
 '56': '2547',
 '57': '4533',
 '58': '2179',
 '59': '2254',
 '60': '2020',
 '61': '3213',
 '62': '3402',
 '63': '0',
 '64': '78',
 '65': '12278',
 '66': '6346',
 '67': '2014',
 '68': '2239',
 '69': '1049',


# Save jsons

In [53]:
with open('travel_dist.json', 'w') as outfile:
    json.dump(travel_distributions, outfile)

In [54]:
with open('regions_num_dist.json', 'w') as outfile:
    json.dump(regions_num_distributions, outfile)

In [55]:
with open('population_dist.json', 'w') as outfile:
    json.dump(poulation_distribution, outfile)