In [1]:
import skmob
import statistics
import pandas as pd
import geopandas as gpd
import backend_codes.load_subsets as ls
import skmob.measures.individual as sk_id
from skmob.measures.collective import random_location_entropy

# Calculating user dict based mobility indicators
User dicts are derived via the skmob package.

For distance based statistics, we use the EPSG:20823 projection!

Different mobility metrics are calculated.

In [2]:
# Rolling-window parameters for this run.
# `start` / `end` are handed to ls.load_and_subset (they look like YYYYMMDD dates).
start, end = "20200406", "20200409"
# Window label; it becomes part of the statistics file name.
denom = "3days"
# Not referenced by the cells visible in this notebook.
allow_even_subsets = False
# Forwarded as `samp_size`; when it is a genuine int it is also appended to the file name.
sam = False
# Forwarded unchanged to ls.load_and_subset.
del_one_tweeters = True

In [3]:
# Relative name of the statistics CSV for this window size.
ref = f"statistics_notebookdemo/{denom}_overlap.csv"
# The exact type check keeps bools out: with `sam = False` no suffix is added.
if type(sam) is int:
    stem = ref.split('.csv')[0]
    ref = f"{stem}_{sam}.csv"

In [4]:
path = f"data/{ref}"
# Indexed by 'start_date' so that a row can later be addressed by the window start.
file = pd.read_csv(path, index_col='start_date')

### Load in the neighborhood shapefiles
The neighborhoods of Rio are loaded and reprojected to the EPSG:20823.

Also the representative point (centroid, but always inside the respective polygon) is calculated.

In [5]:
barrios_path = 'data/shps/neighborhoods.shp'
# Read the neighbourhood polygons and reproject them to EPSG:20823.
barrios_20823 = gpd.read_file(barrios_path).to_crs(20823)
barrios_20823['CODBAIRRO'] = barrios_20823['CODBAIRRO'].astype(int)

# Replace each polygon by its representative point (a point inside the polygon),
# then expose the projected coordinates: 'lat' holds y, 'lon' holds x.
barrios_20823['geometry'] = barrios_20823.geometry.representative_point()
barrio_points_geom = barrios_20823['geometry']
barrios_20823['lat'] = barrio_points_geom.y
barrios_20823['lon'] = barrio_points_geom.x

# Handle tweet geometries
First, load the tweets, reproject them to EPSG:20823 and extract the lat and lon coordinates of the respective tweets.

Also save the coordinates of the neighborhoods' representative point for each tweet (these are used later).

In [6]:
tweets_df = ls.load_and_subset(
    start,
    end,
    del_one_tweeters=del_one_tweeters,
    samp_size=sam,
    tweets_path="data/tweets/preprocessed_tweets_with_poi_location.csv",
)
# Parse the WKT strings as EPSG:4326 geometries and reproject them to EPSG:20823.
tweets_df['wkt'] = gpd.GeoSeries.from_wkt(tweets_df['wkt']).set_crs(4326).to_crs(20823)
# Read the projected coordinates back from the column: 'lat' holds y, 'lon' holds x.
tweet_points = gpd.GeoSeries(tweets_df['wkt'])
tweets_df['lat'] = tweet_points.y
tweets_df['lon'] = tweet_points.x

In [7]:
# Attach the representative point of each tweet's neighbourhood (matched on
# tweets_df['cod'] == barrios_20823['CODBAIRRO']) with one vectorised mapping per
# coordinate instead of scanning the neighbourhood table once per tweet.
# drop_duplicates keeps the first row per code, so a repeated code resolves to its
# first occurrence.
barrio_points = barrios_20823.drop_duplicates(subset='CODBAIRRO').set_index('CODBAIRRO')

# Fail loudly when a tweet carries a code without a matching neighbourhood,
# rather than silently storing NaN coordinates.
unmatched = ~tweets_df['cod'].isin(barrio_points.index)
if unmatched.any():
    missing_cods = tweets_df.loc[unmatched, 'cod'].unique()[:10].tolist()
    raise ValueError(f"No neighbourhood found for cod value(s): {missing_cods}")

tweets_df['lat_cod'] = tweets_df['cod'].map(barrio_points['lat'])
tweets_df['lon_cod'] = tweets_df['cod'].map(barrio_points['lon'])

In [8]:
# Collects metric name -> value; the entries are written to the statistics CSV at the end.
stats = dict()

Define a TrajectoryDataFrame with the skmob package.

In [9]:
# NOTE(review): 'lat'/'lon' hold projected EPSG:20823 coordinates here, not degrees.
# skmob's distance-based measures are documented for geographic coordinates —
# TODO confirm the installed skmob treats these values as intended.
tdf = skmob.TrajDataFrame(
    tweets_df,
    latitude='lat',
    longitude='lon',
    datetime='Timestamp',
    user_id='User_ID',
)

# Stats based on POI user dicts

### 0. Number of Trips
This also includes 'trips' that start and end in the same district. It equals the total number of tweets minus the number of users, since each user with n tweets makes n - 1 trips.

In [10]:
# A user with k tweets contributes k - 1 trips, hence trips = tweets - users.
n_tweets = len(tweets_df)
n_tweeting_users = tweets_df['User_ID'].nunique()
stats['poi_number_of_total_trips'] = n_tweets - n_tweeting_users

### 1. Radius of gyration
To calculate the radius of gyration we use the POI attached to each tweet; we report its mean as well as its standard deviation over all users. A user can have a radius of gyration of 0 if all of their tweets were sent from the same location.

In [11]:
# Stop here when the subset contains no rows.
if not len(tdf):
    raise ValueError("Encountered empty subset!")

In [12]:
# One radius of gyration per user; summarise across users.
rg_df = sk_id.radius_of_gyration(tdf)
rog_per_user = rg_df['radius_of_gyration']
stats["mean_rog"] = rog_per_user.mean()
stats['std_rog'] = rog_per_user.std()

100%|██████████████████████████████████████████████████████████████████████████████| 103/103 [00:00<00:00, 1495.51it/s]


### 2. Jump lengths
We calculate three metrics: the mean over all users of each user's mean travel distance between tweets; the same, but only for users that actually moved; and the mean distance over all movements, without weighting by user.

In [13]:
# Per-user collections of jump lengths (one entry per user).
jl = sk_id.jump_lengths(tdf)['jump_lengths']

100%|██████████████████████████████████████████████████████████████████████████████| 103/103 [00:00<00:00, 1043.22it/s]


In [14]:
# Calculate the mean for each user; the result is saved in the `meaned` variable
def mean2(x):
    """Return the arithmetic mean of ``x``, or 0 when ``x`` has no elements.

    ``x`` must support ``len()``; non-empty input is passed to ``statistics.mean``.
    """
    return statistics.mean(x) if len(x) != 0 else 0

# Mean jump length per user (0 for users without any jump).
meaned = jl.map(mean2)

In [15]:
# Statistics over the per-user mean jump lengths (users without jumps count as 0).
stats["jl_simple_means_over_user_means"] = meaned.mean()
stats["jl_std_over_user_means"] = meaned.std()

# The same statistics, restricted to users whose mean jump length is positive.
moved = meaned[meaned > 0]
stats["jl_simple_means_only_with_movement_user_means"] = moved.mean()
stats["jl_std_only_with_movement_user_means"] = moved.std()

# Flatten every user's jump lengths into a single Series. The explicit float dtype
# keeps the Series numeric even when no user has any jump.
all_moves = pd.Series(
    [dist for user_jumps in jl for dist in user_jumps],
    dtype=float,
)

# Unweighted over users: total distance divided by the total number of trips.
stats['mean_total_distance'] = all_moves.sum() / stats['poi_number_of_total_trips']
stats['std_total_distance'] = all_moves.std()

### 3. Avg. inter-event time (time between tweets)
We calculate this once based on all tweets and once based on the mean of all user means.

In [16]:
# Per-user collections of waiting times between consecutive tweets.
wt = sk_id.waiting_times(tdf)['waiting_times']

# Mean of the per-user means (0 for users without a waiting time).
per_user_mean_wait = wt.apply(mean2)
stats['avg_avg_time_between_tweets_per_user'] = per_user_mean_wait.mean()

# Total waiting time over all users divided by the total number of trips.
total_wait = wt.apply(sum).sum()
stats['avg_time_between_tweets_total'] = total_wait / stats['poi_number_of_total_trips']

100%|██████████████████████████████████████████████████████████████████████████████| 103/103 [00:00<00:00, 1496.77it/s]


### 4. Average amount of trips per user
Including trips to the same location.

In [17]:
# Total trips divided by the number of distinct users in the trajectory frame.
n_users = tdf['uid'].nunique()
stats['avg_number_of_trips_per_user'] = stats['poi_number_of_total_trips'] / n_users

### 5. Maximum distance
The mean over all users of each user's maximum distance.

In [18]:
# One maximum distance per user; summarise across users.
max_dist = sk_id.maximum_distance(tdf)['maximum_distance']
stats['mean_max_distance'], stats['std_max_distance'] = max_dist.mean(), max_dist.std()

100%|██████████████████████████████████████████████████████████████████████████████| 103/103 [00:00<00:00, 1186.55it/s]


### 7. Number of locations
The mean over all users of each user's total number of visited locations.

In [19]:
# One location count per user; summarise across users.
locs = sk_id.number_of_locations(tdf)['number_of_locations']
stats['mean_number_of_locations'], stats['std_number_of_locations'] = locs.mean(), locs.std()

100%|███████████████████████████████████████████████████████████████████████████████| 103/103 [00:00<00:00, 822.10it/s]


### 8. Maximum distance from home

In [20]:
# One maximum distance from home per user; summarise across users.
home_dist = sk_id.max_distance_from_home(tdf)['max_distance_from_home']
stats['mean_max_dist_from_home'], stats['std_max_dist_from_home'] = home_dist.mean(), home_dist.std()

100%|███████████████████████████████████████████████████████████████████████████████| 103/103 [00:00<00:00, 211.83it/s]


### 9. Mean random location entropy
For the location entropy we need the neighborhoods, not the POI coordinates.

In [21]:
#############################################
# Swap the tweet coordinates for the neighbourhood representative points.
# NOTE: this overwrites tweets_df['lat'] / tweets_df['lon'] in place, so any cell
# re-run after this one sees the neighbourhood coordinates instead of the POI ones.
for coord in ('lat', 'lon'):
    tweets_df[coord] = tweets_df[f'{coord}_cod']
#############################################

In [22]:
# Build the neighbourhood-level trajectories from the explicit *_cod columns, so this
# cell yields the same frame whether or not tweets_df['lat'] / tweets_df['lon'] have
# already been overwritten with the neighbourhood coordinates.
tdf_barr = skmob.TrajDataFrame(
    tweets_df.assign(lat=tweets_df['lat_cod'], lon=tweets_df['lon_cod']),
    latitude='lat',
    longitude='lon',
    datetime='Timestamp',
    user_id='User_ID',
)

In [23]:
# Mean random location entropy over all neighbourhood locations.
# The entropy column is selected by name; indexing the label-indexed result of
# DataFrame.mean() by integer position is deprecated in recent pandas.
stats['mean_random_location_entropy_barrios'] = (
    random_location_entropy(tdf_barr)['random_location_entropy'].mean()
)

100%|████████████████████████████████████████████████████████████████████████████████| 65/65 [00:00<00:00, 2035.71it/s]


### Write into .csv

In [24]:
# Row label for this window. It gets its own name so that the `start` string from
# the parameter cell is not rebound to an int, which would change what the loading
# cell receives if it is re-run afterwards.
start_key = int(start)
for name, val in stats.items():
    file.loc[start_key, name] = val

# Write back to the same file the statistics table was read from.
file.to_csv(path)

In [25]:
# Echo every collected metric as "name : value".
for metric_name in stats:
    print(metric_name, ":", stats[metric_name])

poi_number_of_total_trips : 240
mean_rog : 4488.164655904539
std_rog : 5913.697306015925
jl_simple_means_over_user_means : 3283.5975167123306
jl_std_over_user_means : 4850.386303826439
jl_simple_means_only_with_movement_user_means : 8052.632005270718
jl_std_only_with_movement_user_means : 4380.498564011526
mean_total_distance : 2786.8369518013983
std_total_distance : 5086.792843633841
avg_avg_time_between_tweets_per_user : 43655.641412196914
avg_time_between_tweets_total : 30240.0
avg_number_of_trips_per_user : 2.3300970873786406
mean_max_distance : 4062.376615746517
std_max_distance : 5765.259167799827
mean_number_of_locations : 1.5339805825242718
std_number_of_locations : 0.8142786291520704
mean_max_dist_from_home : 4003.6544237807652
std_max_dist_from_home : 5661.241728569151
mean_random_location_entropy_barrios : 0.686327208847591
