# Exploration and analysis of mobility patterns

This notebook aims at exploring and analyzing mobility patterns derived from mobile phone records. The necessary inputs can be generated using the notebook "1_Retrieval_of_individual_human_movement_trajectories_and_collective_OD_matrices.ipynb".

Load packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from datetime import datetime
import geopandas as gpd
import networkx as nx
import contextily as cx 
from contextily import add_basemap 
from shapely.geometry import Point #<2.0.1
import folium
import geoplot 
import matplotlib.ticker as ticker
import os
import mapclassify as mc
import community 
from community import community_louvain
import networkx as nx
import matplotlib.cm as cm
from datetime import date
from sklearn.preprocessing import normalize
import itertools

# A) Temporal exploration of collected metadata
This subsection aims at visualizing temporal patterns of the collected metadata.

Load and display metadata

In [None]:
# Load every per-interval metadata CSV (header-less key/value rows), keyed by filename
metadata_dir = '../data/1_intermediate_output/metadata'
metadata_arrays = {}
for filename in os.listdir(metadata_dir):
    if filename.endswith('.csv'):
        metadata_arrays[filename] = pd.read_csv(os.path.join(metadata_dir, filename), header=None, index_col=0)

# Collect all four counters in a single pass instead of one loop per counter.
# Records are keyed by end timestamp, so a repeated timestamp keeps the last file read.
timestamp_format = '%Y%m%d%H%M%S'
count_columns = ['user_count', 'call_count', 'transition_count_filtered', 'transition_count_unfiltered']
records_by_timestamp = {}
for value in metadata_arrays.values():
    first_record = value.T.iloc[0]
    records_by_timestamp[first_record['end_timestamp']] = [first_record[column] for column in count_columns]

# os.listdir returns files in arbitrary order and the index below holds date *strings*,
# so the rows are sorted chronologically here to keep the time-series plots ordered.
# NOTE(review): a later cell attaches per-file antenna counts by position; that only lines up
# if the files in that folder are read in the same chronological order - confirm.
ordered_timestamps = sorted(records_by_timestamp, key=lambda ts: datetime.strptime(str(ts), timestamp_format))
date_labels = [datetime.strptime(str(ts), timestamp_format).strftime('%m/%d/%Y') for ts in ordered_timestamps]
metadata_df = pd.DataFrame(
    [records_by_timestamp[ts] for ts in ordered_timestamps],
    columns=count_columns,
    index=pd.Index(date_labels, name='date'),
)

# Single-column frames kept under their original names
user_count_df = metadata_df[['user_count']].copy()
call_count_df = metadata_df[['call_count']].copy()
transition_count_filtered_df = metadata_df[['transition_count_filtered']].copy()
transition_count_unfiltered_df = metadata_df[['transition_count_unfiltered']].copy()

# Add 3 derived statistics
metadata_df['average amount of calls per user'] = metadata_df.call_count / metadata_df.user_count
metadata_df['proportion filtered to unfiltered transitions'] = metadata_df.transition_count_filtered / metadata_df.transition_count_unfiltered
metadata_df['weekly citywide penetration rate'] = metadata_df.user_count / 6360689  # divided by the official population count of the city of Rio de Janeiro

# Display the metadata dataframe
metadata_df

Plot amount of mobile phone connections over time

In [None]:
# Call counts (left y-axis, green) and user counts (right y-axis, blue) on a shared x-axis
plt.figure(figsize=(25, 5))
plt.ticklabel_format(style='plain', axis='y')
plt.xticks(rotation=90)
calls_axis = sns.lineplot(data=metadata_df['call_count'], color="g", label='call_count')
calls_axis.xaxis.set_major_locator(ticker.LinearLocator(10))  # 10 evenly spaced x ticks

# Twin axis for the user counts, again with plain (non-scientific) y tick labels
ax2 = plt.twinx()
plt.ticklabel_format(style='plain', axis='y')
users_axis = sns.lineplot(data=metadata_df['user_count'], color="b", label='user_count', ax=ax2)
plot_ = users_axis.set_title('Amount of mobile phone connections')

Plot amount of call records per user

In [None]:
# Line plot of the derived calls-per-user statistic, one point per metadata row
calls_per_user = metadata_df["average amount of calls per user"]
plt.figure(figsize=(25, 5))
plt.ticklabel_format(style='plain', axis='y')
plt.xticks(rotation=90)
calls_per_user_axis = sns.lineplot(data=calls_per_user)
plot_ = calls_per_user_axis.set_title('Amount of call records per user')

Plot amount of measured movements

In [None]:
# 28-row rolling mean, shifted 14 rows earlier so each value sits near the middle of its window.
# The smoothed column is stored on metadata_df but is not drawn in the plot below.
smoothed_transitions = metadata_df['transition_count_unfiltered'].rolling(28).mean().shift(-14)
metadata_df['transition_count_unfiltered_moving_avg'] = smoothed_transitions

plt.figure(figsize=(25, 5))
plt.ticklabel_format(style='plain', axis='y')
plt.xticks(rotation=90)
movements_axis = sns.lineplot(data=metadata_df["transition_count_unfiltered"], label='Movements')
plot_ = movements_axis.set_title('Amount of measured movements')

 Explore proportion of filtered to unfiltered transitions

In [None]:
# Barplot: unfiltered counts (red) are drawn first, filtered counts (blue) are drawn over them.
# rename_axis/reset_index build a new frame, so metadata_df and its index stay untouched
# (naming the index of a column selection in place can also rename metadata_df's index).
transitions = (
    metadata_df[['transition_count_filtered', 'transition_count_unfiltered']]
    .rename_axis('Day')
    .reset_index()
)
plt.figure(figsize=(25, 5))
plt.ticklabel_format(style='plain', axis='y')
plt.xticks(rotation=90)
sn1 = sns.barplot(x='Day', y='transition_count_unfiltered', data=transitions, color='red', label='unfiltered')
sn2 = sns.barplot(x='Day', y='transition_count_filtered', data=transitions, color='blue', label='filtered')
sn1.set(xlabel='Days', ylabel='Amount of Transitions')
sn2.xaxis.set_major_locator(ticker.LinearLocator(10))
# Both series are labelled, so show which colour is which
sn1.legend()

Plot and retrieve amount of active antennas over time and add to metadata

In [None]:
# Load one stay-time table per CSV file, keyed by filename.
# os.listdir returns names in arbitrary order, and a later cell attaches per-file antenna
# counts to metadata_df by position, so the files are read in sorted filename order.
# NOTE(review): assumes the filenames sort chronologically - confirm against the naming scheme.
stay_times_dir = '../data/1_intermediate_output/stay_times_at_antennas'
stay_times_at_antennas = {}
for filename in sorted(os.listdir(stay_times_dir)):
    if filename.endswith('.csv'):
        stay_times_at_antennas[filename] = pd.read_csv(os.path.join(stay_times_dir, filename))

In [None]:
# One entry per loaded stay-time file: its number of rows
amount_of_antennas = [len(stay_times) for stay_times in stay_times_at_antennas.values()]
# Attached by position, so the list length must equal the number of rows in metadata_df
metadata_df['amount_of_antennas'] = amount_of_antennas

# Plot the antenna counts as a red line with 10 evenly spaced x ticks
plt.figure(figsize=(25, 5))
plt.ticklabel_format(style='plain', axis='y')
plt.xticks(rotation=45)
antennas_axis = sns.lineplot(data=metadata_df['amount_of_antennas'], color="r", label='Amount of active Antennas')
antennas_axis.xaxis.set_major_locator(ticker.LinearLocator(10))
plot_ = antennas_axis

# B) Spatial exploration using "stay_time_at_antennas.csv" files
This subsection explores the spatial insights stored in the "stay_time_at_antennas.csv" files. For spatio-temporal insights this section can be run in a loop over each time interval of analysis e.g. by iterating over stay_time_at_antennas.csv files.

Retrieve amount of connections per antenna

In [None]:
# Stack all per-file tables, discard the ' mean' and ' stddev' columns and
# sum the remaining columns per (lat, lon) pair
all_stay_times = pd.concat(stay_times_at_antennas.values(), ignore_index=True)
connection_columns = all_stay_times.drop(columns=[' mean', ' stddev'])
antenna_stats_df = connection_columns.groupby(['lat', ' lon']).sum().reset_index()

# Turn each (lon, lat) pair into a point geometry and declare the CRS as EPSG:4326
antenna_points = gpd.points_from_xy(antenna_stats_df[' lon'], antenna_stats_df['lat'])
antennas_gdf = gpd.GeoDataFrame(antenna_stats_df, geometry=antenna_points)
antennas_gdf.set_crs("EPSG:4326", inplace=True)
antennas_gdf.head()

Add mean duration time for each antenna

In [None]:
# Stack all per-file tables, discard the ' count' and ' stddev' columns and
# average the remaining columns per (lat, lon) pair (an unweighted mean of the per-file values)
all_stay_times_2 = pd.concat(stay_times_at_antennas.values(), ignore_index=True)
duration_columns = all_stay_times_2.drop(columns=[' count', ' stddev'])
antenna_stats_df_2 = duration_columns.groupby(['lat', ' lon']).mean().reset_index()

# Turn each (lon, lat) pair into a point geometry and declare the CRS as EPSG:4326
antenna_points_2 = gpd.points_from_xy(antenna_stats_df_2[' lon'], antenna_stats_df_2['lat'])
antennas_gdf_2 = gpd.GeoDataFrame(antenna_stats_df_2, geometry=antenna_points_2)
antennas_gdf_2.set_crs("EPSG:4326", inplace=True)
antennas_gdf_2.head()

Get study region 

In [None]:
# Read the study-region polygons and reproject them to EPSG:4326
study_area_path = "../data/0_input_data/study_region/study_region_RJ.geojson"
study_area = gpd.read_file(study_area_path)
study_area = study_area.to_crs(epsg=4326)

Filter out antennas outside of study region

In [None]:
# Merge all study-area polygons once and reuse the result for both point-in-polygon filters
study_region_shape = study_area.geometry.unary_union

# .copy() makes the filtered frames independent, so adding a column below does not write to a slice
antennas_in_RJ = antennas_gdf[antennas_gdf.geometry.within(study_region_shape)].copy()
antennas_in_RJ_2 = antennas_gdf_2[antennas_gdf_2.geometry.within(study_region_shape)].copy()

# Bring the averaged ' mean' column over; the assignment aligns on the row index
antennas_in_RJ['mean'] = antennas_in_RJ_2[' mean']

Plot bubble map of connections per antenna

In [None]:
# Bubble map: study-area outlines with one marker per antenna, coloured and sized by ' count'
connections_legend = {
    "loc": "best",
    "fontsize": "large",
    "title": "Connections per antenna",
    "title_fontsize": "large",
}
ax = geoplot.polyplot(study_area, facecolor="white", figsize=(30, 15))
geoplot.pointplot(
    antennas_in_RJ,
    hue=" count",
    scale=" count",
    cmap="Reds",
    ax=ax,
    edgecolor="black",
    legend=True,
    legend_var="scale",
    legend_kwargs=connections_legend,
    limits=(5, 40),  # marker size range
)
plt.title("Connections per antenna vary with density", fontdict={"fontsize": 20}, pad=15)

Retrieve and plot amount of connections per admin zone

In [None]:
# Assign every antenna to the admin zone it lies within.
# `predicate` replaces the deprecated `op` keyword of geopandas.sjoin.
antennas_in_admin = gpd.sjoin(antennas_in_RJ, study_area, predicate='within')
antennas_in_admin = antennas_in_admin.drop(columns=['lat', ' lon', 'geometry', 'mean', 'index_right', 'Área', 'NOME'])

# Sum the remaining columns per zone code and join the totals back onto the zone polygons
antenna_connections_in_admin = antennas_in_admin.groupby(['CODBAIRRO']).sum().reset_index()
result = study_area.set_index('CODBAIRRO').join(antenna_connections_in_admin.set_index('CODBAIRRO'))
result['count/area'] = result[' count'].div(result['Área'])

# Plot absolute counts (left) and counts divided by the zone's 'Área' value (right)
fig, axes = plt.subplots(ncols=2, figsize=(25, 12))
ax1, ax2 = axes
ax1.set_title('Absolute connection per admine zone')
result.plot(" count", edgecolor='lightgrey', cmap="YlOrRd", ax=ax1, legend=True, scheme='quantiles')
ax2.set_title('Relative connections per admin zone')
result.plot("count/area", edgecolor='lightgrey', cmap="YlOrRd", ax=ax2, legend=True, scheme='quantiles')

Retrieve amount of connections per land use class

In [None]:
# Read the land-use polygons and reproject them to EPSG:4326
LULC = gpd.read_file("../data/0_input_data/LULC/LULC_RJ.geojson")
LULC.to_crs(epsg=4326, inplace=True)

# Assign every antenna to the land-use polygon it lies within.
# `predicate` replaces the deprecated `op` keyword of geopandas.sjoin.
antennas_in_LULC = gpd.sjoin(antennas_in_RJ, LULC, predicate='within')
antennas_in_LULC = antennas_in_LULC.drop(columns=['lat', ' lon', 'geometry', 'mean', 'index_right', 'OBJECTID', 'Grupo', 'RuleID', 'Ano', 'RuleID_1', 'BaseGeoDBO', 'ShapeSTAre', 'ShapeSTLen'])

# Sum per aggregated land-use class and express each class as a percentage of all connections
antenna_connections_in_LULC = antennas_in_LULC.groupby(['UsoAgregad']).sum().reset_index()
antenna_connections_in_LULC['percentual'] = (antenna_connections_in_LULC[' count'] / antenna_connections_in_LULC[' count'].sum()) * 100
antenna_connections_in_LULC

Plot average connection time per antenna

In [None]:
# Bubble map of the per-antenna 'mean' column, coloured by quintile class and sized by value
scheme = mc.Quantiles(antennas_in_RJ['mean'], k=5)
duration_legend = {
    "loc": "best",
    "fontsize": "large",
    "title": "Antenna connection time",
    "title_fontsize": "large",
}
ax = geoplot.polyplot(study_area, facecolor="white", figsize=(30, 15))
geoplot.pointplot(
    antennas_in_RJ,
    hue="mean",
    scheme=scheme,
    scale="mean",
    cmap="YlGnBu",
    ax=ax,
    edgecolor="black",
    legend=True,
    legend_var="scale",
    legend_kwargs=duration_legend,
    limits=(5, 40),  # limits specify circle size
)
plt.title("Average antenna connection time", fontdict={"fontsize": 15}, pad=15);

Retrieve and plot average connection time per admin zone

In [None]:
# Assign every antenna to the admin zone it lies within.
# `predicate` replaces the deprecated `op` keyword of geopandas.sjoin.
antennas_in_admin = gpd.sjoin(antennas_in_RJ, study_area, predicate='within')
antennas_in_admin = antennas_in_admin.drop(columns=['lat', ' lon', 'geometry', ' count', 'index_right', 'Área', 'NOME'])

# Average the remaining columns per zone code and join the result back onto the zone polygons
antenna_connections_in_admin = antennas_in_admin.groupby(['CODBAIRRO']).mean().reset_index()
result = study_area.set_index('CODBAIRRO').join(antenna_connections_in_admin.set_index('CODBAIRRO'))

# Plot the per-zone average of the 'mean' column as a quantile choropleth
fig, ax = plt.subplots(figsize=(25, 10))
ax.set_title('Average Connection Time per Admin Zone')
result.plot("mean", edgecolor='lightgrey', cmap="YlGnBu", ax=ax, legend=True, scheme='quantiles')

# C) Spatial exploration of Origin-Destination (OD) matrices
For spatio-temporal insights this section can be run in a loop over each time interval of analysis. OD matrices can also be aggregated over time.

Example of temporal aggregation of admin to admin OD matrices

In [None]:
# Load all admin-to-admin OD matrices, keyed by filename.
# Sorted so that "the first file matching a day" below does not depend on os.listdir's arbitrary order.
admin2admin_dir = '../data/1_intermediate_output/admin2admin'
admin2admin_arrays = {}
for filename in sorted(os.listdir(admin2admin_dir)):
    if filename.endswith('.npy'):
        admin2admin_arrays[filename] = np.load(os.path.join(admin2admin_dir, filename))

# Example for aggregation on a weekly basis. The "timestamps_weekly.csv" can be generated manually.
admin2admin_arrays_weekly_aggregated = {}
# Each row of the file lists the days of one week (tab separated, read as strings)
timestamps_weekly = pd.read_csv('../data/0_input_data/timestamps/timestamps_weekly.csv', delimiter='\t', encoding='utf-8', header=None, dtype=str)
for _, row in timestamps_weekly.iterrows():
    days = list(row)
    # Collect the matrix of every day of this week for which a file exists
    weekly_arrays_list = []
    for day in days:
        prefix = 'admin2admin_' + str(day)
        # First file whose name starts with the day's prefix; stops scanning at the first hit
        matching_array = next((array for name, array in admin2admin_arrays.items() if name.startswith(prefix)), None)
        if matching_array is not None:
            weekly_arrays_list.append(matching_array)
    # Only complete weeks (7 matrices found) are aggregated; the key is the week's 7th listed day
    if len(weekly_arrays_list) == 7:
        admin2admin_arrays_weekly_aggregated[str(days[6])] = sum(weekly_arrays_list)
admin2admin_arrays_weekly_aggregated  # store and continue temporal analysis somewhere else

We can select single OD matrices from these aggregates. Here we are selecting an OD matrix from the first week. 

In [None]:
# Take the matrix stored under the first key of the weekly aggregates
first_week_key = list(admin2admin_arrays_weekly_aggregated)[0]
admin2admin = admin2admin_arrays_weekly_aggregated[first_week_key]
admin2admin

Calculate normalized inflow, outflow, and inflow/outflow ratio of neighborhoods to identify net exporters (sources) and importers (sinks)

In [None]:
# Load demographic data for normalization, indexed by zone code
pop = pd.read_csv("../data/0_input_data/population/population.csv", delimiter=';').set_index('CODBAIRRO')


def _per_capita(flow_totals):
    """Return the flow totals divided by zone population as a one-column DataFrame (column 0)."""
    flows = pd.DataFrame(flow_totals)
    # Shift positions 0..N-1 to labels 1..N before aligning with pop
    # (presumably CODBAIRRO runs 1..N - verify against the population file)
    flows.index = flows.index + 1
    flows['admin_population'] = pop['Pop_health_pdf']
    return flows.iloc[:, :-1].div(flows.admin_population, axis=0)


# Inflow uses the column sums (axis=0), outflow the row sums (axis=1) of the OD matrix
inflow = _per_capita(admin2admin.sum(axis=0))
outflow = _per_capita(admin2admin.sum(axis=1))

# Add absolute flows and their ratio to the zone geometries, ordered by zone code
regions_df = gpd.read_file("../data/0_input_data/study_region/study_region_RJ.geojson")
regions_df = regions_df.set_index('CODBAIRRO').sort_index()
regions_df['Inflow'] = admin2admin.sum(axis=0)
regions_df['Outflow'] = admin2admin.sum(axis=1)
regions_df['Inflow_Outflow_Ratio'] = regions_df['Inflow'] / regions_df['Outflow']

# Append each population-normalized flow (arrives as column 0) and give it its final name
normalized_flows = (
    (inflow, "Inflow_to_Admin_Population_Ratio"),
    (outflow, "Outflow_to_Admin_Population_Ratio"),
)
for normalized_flow, column_name in normalized_flows:
    regions_df = pd.concat([regions_df, normalized_flow], axis=1)
    regions_df.rename(columns={0: column_name}, inplace=True)

Plot inflow

In [None]:
# Two quantile choropleths side by side: absolute inflow and inflow per inhabitant
fig, axes = plt.subplots(ncols=2, figsize=(25, 12))
ax1, ax2 = axes
inflow_panels = (
    (ax1, 'Absolute Inflow', "Inflow"),
    (ax2, 'Relative Inflow', "Inflow_to_Admin_Population_Ratio"),
)
for panel_ax, panel_title, column in inflow_panels:
    panel_ax.set_title(panel_title)
    regions_df.plot(column, edgecolor='lightgrey', cmap="Greens", ax=panel_ax, legend=True, scheme='quantiles')

Plot outflow 

In [None]:
# Two quantile choropleths side by side: absolute outflow and outflow per inhabitant
fig, axes = plt.subplots(ncols=2, figsize=(25, 12))
ax1, ax2 = axes
outflow_panels = (
    (ax1, 'Absolute Outflow', "Outflow"),
    (ax2, 'Relative Outflow', "Outflow_to_Admin_Population_Ratio"),
)
for panel_ax, panel_title, column in outflow_panels:
    panel_ax.set_title(panel_title)
    regions_df.plot(column, edgecolor='lightgrey', cmap="Reds", ax=panel_ax, legend=True, scheme='quantiles')

Plot sinks and sources

In [None]:
# Inflow_Outflow_Ratio is Inflow / Outflow, so a ratio above 1 means a zone receives more
# movements than it sends (net importer / sink); a ratio of at most 1 marks a net exporter (source).
# Each zone keeps its ratio in exactly one of the two columns and NaN in the other.
inflow_outflow_ratio = regions_df['Inflow_Outflow_Ratio']
regions_df['Net_Exporters_Sources'] = np.where(inflow_outflow_ratio > 1, np.nan, inflow_outflow_ratio)
regions_df['Net_Importers_Sinks'] = np.where(inflow_outflow_ratio <= 1, np.nan, inflow_outflow_ratio)

# Draw both layers on one axis: sources in red, sinks in green (as stated in the title)
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title('Sinks (green) and Sources (red)')
regions_df.plot("Net_Exporters_Sources", edgecolor='lightgrey', cmap="Reds", ax=ax, scheme='quantiles')
regions_df.plot("Net_Importers_Sinks", edgecolor='lightgrey', cmap="Greens", ax=ax, scheme='quantiles')

We can use the Louvain algorithm to retrieve the mobility metric of graph modularity (exemplary implementation for a selected tower2tower matrix)

In [None]:
# Load all tower-to-tower OD matrices, keyed by filename.
# os.listdir returns names in arbitrary order; sorting makes the dict order (and therefore
# "the first matrix" taken from it later) deterministic.
# NOTE(review): assumes the filenames sort chronologically - confirm against the naming scheme.
tower2tower_dir = '../data/1_intermediate_output/tower2tower'
tower2tower_arrays = {}
for filename in sorted(os.listdir(tower2tower_dir)):
    if filename.endswith('.npy'):
        tower2tower_arrays[filename] = np.load(os.path.join(tower2tower_dir, filename))

Select first tower2tower OD matrix of array

In [None]:
# Take the matrix stored under the first key of the loaded tower2tower files (index 0 selects the day)
first_tower_file = list(tower2tower_arrays)[0]
tower2tower = tower2tower_arrays[first_tower_file]
tower2tower

In [None]:
# Compute best partition.
# nx.from_numpy_array replaces nx.from_numpy_matrix, which was removed in networkx 3.0;
# matrix entries become the 'weight' edge attribute used by the Louvain algorithm.
G = nx.from_numpy_array(tower2tower)
partition = community_louvain.best_partition(G, weight='weight')

# Load tesselation area of that day and convert to EPSG:4326.
# The folder is '1_intermediate_output', as in every other cell of this notebook;
# the file name itself should be adjusted to the selected day as needed.
antenna_tesselation = gpd.read_file('../data/1_intermediate_output/antenna_tesselations/tesselations_20200406234003.shp')
antenna_tesselation.to_crs("EPSG:4326", inplace=True)

# Build a table with one row per graph node and its community label
partition_df = pd.DataFrame.from_dict(partition, orient='index').reset_index()
partition_df.columns = ['antenna_id', 'Community']

# Match antenna_id & community to FID (tesselation id), because tesselation id != antenna id
# NOTE(review): `antenna_in_tesselation` is not defined anywhere in this notebook as shown; it must
# provide 'antenna_id' and 'FID' columns and be loaded before this cell runs - confirm its source.
tesselations_df = pd.merge(antenna_in_tesselation, partition_df, on=['antenna_id'])  # get community for each antenna via antenna_id
tesselations_df = pd.merge(tesselations_df, antenna_tesselation, on=['FID'])  # get geometry of tesselations via FID

# Plot the communities; the title reports the modularity of the partition
fig, ax = plt.subplots(figsize=(25, 10))
ax.set_title('Communities' + ' (Modularity = ' + str(community.modularity(partition, G)) + ')')
tesselations_df.plot("Community", edgecolor='lightgrey', cmap="tab10", ax=ax, legend=True)

Calculate graph modularity over temporally sequential OD matrices

In [None]:
# Modularity of the Louvain partition for every loaded tower2tower matrix, keyed by filename.
# nx.from_numpy_array replaces nx.from_numpy_matrix, which was removed in networkx 3.0.
modularity_dict = {}
for key, value in tower2tower_arrays.items():
    G = nx.from_numpy_array(value)
    partition = community_louvain.best_partition(G, weight='weight')
    modularity_dict[key] = community.modularity(partition, G)

Plot modularity indicator over time

In [None]:
# Build df for plotting: column 0 holds the filenames, column 1 the modularity values.
# The date is cut out of each filename (characters 12 up to the last 10) and becomes the index.
modularity_df = pd.DataFrame(modularity_dict.items())
modularity_df = (
    modularity_df
    .assign(date=pd.to_datetime(modularity_df[0].str[12:-10]))
    .set_index('date')
    .rename(columns={1: 'Modularity'})
    .drop(columns=[0])
)

# Plot modularity as a line with 10 evenly spaced x ticks
plt.figure(figsize=(25, 5))
plt.ticklabel_format(style='plain', axis='y')
plt.xticks(rotation=90)
modularity_axis = sns.lineplot(data=modularity_df["Modularity"])
modularity_axis.xaxis.set_major_locator(ticker.LinearLocator(10))
plot_ = modularity_axis

We can also visualize OD matrices as heatmaps

In [None]:
# L1-normalize each row with scikit-learn (kept for reference; the heatmap uses the manual version)
normed_admin2admin = normalize(admin2admin, axis=1, norm='l1')

# Row standardization: divide every row by its own sum
row_totals = admin2admin.sum(axis=1, keepdims=True)
row_standardized_admin2admin = admin2admin / row_totals

# Plot the row-standardized matrix; the colour scale is capped at 0.005
sns.set()
fig, ax = plt.subplots(figsize=(20, 20))
heatmap_options = {
    'linewidths': .5,
    'vmin': 0,
    'vmax': 0.005,
    'square': True,
    'cmap': 'Greens',
    'fmt': '.3f',
    'cbar': True,
    'cbar_kws': {'extend': 'max'},
}
im = sns.heatmap(row_standardized_admin2admin, ax=ax, **heatmap_options)

We can also generate data files to visualize OD matrices via the Flowmap.blue tool (example: admin to admin)

In [None]:
admin2admin_df = pd.DataFrame(admin2admin)

# Create flow.csv: one row per (origin, destination) pair with the rounded flow.
# Zone ids are 1-based positions and are derived from the matrix shape instead of a hard-coded 163.
n_origins, n_destinations = admin2admin_df.shape
origin_ids = np.arange(1, n_origins + 1)
destination_ids = np.arange(1, n_destinations + 1)
flows = pd.DataFrame({
    # Row-major order: all destinations of origin 1, then all destinations of origin 2, ...
    'Origin': np.repeat(origin_ids, n_destinations),
    'Destination': np.tile(destination_ids, n_origins),
    # np.rint rounds half to even, like the built-in round()
    'Flow': np.rint(admin2admin_df.to_numpy()).astype('int64').ravel(),
})
flows.to_csv('../data/2_final_output/FlowmapBlue_input_files/flow.csv', sep=',', index=False)

# Create locations.csv: zone name plus centroid coordinates, indexed like regions_df.
# .copy() keeps the new columns off a slice of regions_df; the centroids are computed once.
locations = regions_df[['NOME', 'geometry']].copy()
centroids = locations['geometry'].centroid.to_crs(epsg=4326)
locations['lat'] = centroids.y
locations['lon'] = centroids.x
locations = locations.drop(columns=['geometry'])
locations.to_csv('../data/2_final_output/FlowmapBlue_input_files/locations.csv', sep=',', index=True, encoding='utf-8')