In [18]:
import os
import copy
import numpy as np
import pandas as pd
from datetime import datetime as dt

# Preparing Twitter and mobile phone data for plotting
This notebook prepares the metrics that were calculated in the previous notebooks for plotting. All the different metrics are brought into the same format.

### Set Variables
Define the periods for onsets and offsets.

In [19]:
# Boundaries of the three onset/offset periods as "YYYYMMDD" strings
# (presumably s = start, e = end -- TODO confirm).
# An earlier datetime-based definition of the same six names was immediately
# overwritten by these strings, so only the effective values are kept.
sonoff1, sonoff2, sonoff3 = "20200406", "20210406", "20220406"
eonoff1, eonoff2, eonoff3 = "20200606", "20210606", "20220606"

# NOTE(review): not used in this notebook section; meaning of the cut-off date to be confirmed
pre_subset = "2020-10-01"

# Post-processing OD-matrices
First, the different OD-matrices are normalized. For the study we need the matrices of the 3 on/offset periods and the matrix of the total study period for Twitter and mobile phone data.

In [20]:
# Container for the raw OD-matrices, keyed by a descriptive name
ods = dict()

### Load in Twitter OD-Matrices

In [21]:
# Forward slashes resolve on Windows, Linux and macOS alike; a backslash-separated
# path is only understood on Windows.
ods["total_twitter_od"] = np.load("data/movement_matrices/total/mm_20210603.npy")

### Load in mobile phone data OD-matrix
Load in the mobile phone data movement matrices

In [22]:
# Movement matrix of the mobile phone data covering the total period
mpd_od_file = "data/movement_matrices/mobile_phone_data/total.npy"
ods["total_mpd_od"] = np.load(mpd_od_file)

### Normalize OD-matrices

In [23]:
def normalize(matrix, set_diag_to_0=True):
    """Scale a square matrix so that all of its entries sum to 1.

    Parameters
    ----------
    matrix : array-like
        Square 2-D matrix. The input is copied and never modified in place.
    set_diag_to_0 : bool, default True
        If True, the diagonal is set to 0 before normalizing, so diagonal
        entries do not contribute to the total. If False, the diagonal is kept.

    Returns
    -------
    numpy.ndarray
        The (optionally diagonal-free) matrix divided by the sum of its entries.
        If that sum is 0, the division yields NaN/inf values.

    Raises
    ------
    ValueError
        If ``matrix`` is not a square 2-D matrix.
    """
    # np.array copies by default, so the caller's matrix stays untouched
    matrix = np.array(matrix)

    if matrix.ndim != 2 or matrix.shape[0] != matrix.shape[1]:
        raise ValueError("WRONG MATRIX SHAPE!!")

    if set_diag_to_0:
        np.fill_diagonal(matrix, 0)

    return matrix / matrix.sum()

# Normalized version of every loaded OD-matrix, stored under the same key
ods_norm = {name: normalize(od) for name, od in ods.items()}

### Write Normalized OD-Matrices

In [24]:
path = "data/post_processing/"
# Create the output folder if it is missing, so np.save does not fail on a fresh checkout
os.makedirs(path, exist_ok=True)

# One file per matrix: <key>_normalized.npy
for key, val in ods_norm.items():
    np.save(f"{path}{key}_normalized.npy", val)

# Post processing for temporal plots
All the metrics from the notebook before are generalized in three manners. Scaling (putting the data between 0 and 1 for comparability), normalizing (changing the values based on tweet amount) and a rolling average (one with 14 and one with 28 days).

The data is always scaled; however, for experimental purposes we also calculated the metrics without normalizing or rolling averages. This is addressed in the paper, and some example plots can also be found in its appendix.

Since the data seems to be influenced by the tweet volume, we also calculated all the metrics with a fixed amount of tweets. So all the B-notebooks were calculated with a steady sample number of tweets depending on the rolling window size. E.g. for 7 days it was 1115 tweets. This represents the 98th percentile of rolling windows with more than this number of tweets. The effects of this are also discussed in the paper.

The main points in the paper, however, are made with the full tweet volume, with scaling, without normalizing (since we could not find enough evidence that the data really is influenced by tweet volume to justify a normalization) and with a 28-day moving average.

### Load in twitter data

In [25]:
statistics_twitter = {}
# Only files whose name contains both 'days' and 'statistics' are loaded;
# files containing "notebookdemo" are skipped
p = "data/statistics/"
for file in os.listdir(p):
    wanted = 'days' in file and 'statistics' in file and "notebookdemo" not in file
    if not wanted:
        continue

    data = pd.read_csv(p+file)

    # start_date is parsed either with a trailing ".0" or without it,
    # decided by the first value of the column
    if str(data['start_date'][0]).endswith(".0"):
        start_format = "%Y%m%d.0"
    else:
        start_format = "%Y%m%d"
    data['start_date'] = data['start_date'].apply(lambda x: dt.strptime(str(x), start_format))
    data['middle_date'] = data['middle_date'].apply(lambda x: dt.strptime(str(x), "%Y%m%d"))

    # Index by middle_date and drop the unnamed column
    data = data.set_index('middle_date').drop(columns=["Unnamed: 1"])

    # Dictionary key: the part of the file name between the first "_" and the following "."
    statistics_twitter[file.split("_")[1].split(".")[0]] = data

### Load in sample based twitter data

In [26]:
statistics_twitter_sample = {}
# Only files whose name contains both 'days' and 'statistics' are loaded
p = "data/sample_based_statistics/"
for file in os.listdir(p):
    wanted = 'days' in file and 'statistics' in file
    if not wanted:
        continue

    data = pd.read_csv(p+file)

    # start_date is parsed either with a trailing ".0" or without it,
    # decided by the first value of the column
    if str(data['start_date'][0]).endswith(".0"):
        start_format = "%Y%m%d.0"
    else:
        start_format = "%Y%m%d"
    data['start_date'] = data['start_date'].apply(lambda x: dt.strptime(str(x), start_format))
    data['middle_date'] = data['middle_date'].apply(lambda x: dt.strptime(str(x), "%Y%m%d"))

    # Index by middle_date and drop the unnamed column
    data = data.set_index('middle_date').drop(columns=["Unnamed: 1"])

    # Dictionary key: the part of the file name between the first "_" and the following "."
    statistics_twitter_sample[file.split("_")[1].split(".")[0]] = data

### Load in mobile phone data
Load in the mpd metrics and rename the columns.

In [27]:
# Forward slashes resolve on Windows, Linux and macOS alike; a backslash-separated
# path is only understood on Windows.
data_path = "data/statistics/mobility_metrics_paper.csv"
# The file uses ';' as field separator and ',' as decimal mark
mpd = pd.read_csv(data_path, sep=';', decimal=',')
# Parse the timestamp strings ("YYYY-MM-DD 00:00:00") and index the frame by date
mpd['date'] = mpd.date.apply(lambda x: dt.strptime(str(x), "%Y-%m-%d 00:00:00"))
mpd = mpd.set_index('date')

In [28]:
# Column names are assigned by position, so the order must match the columns of the CSV
mpd.columns = [
    'number_unique_users',
    'weekly citywide penetration rate',
    'no_real_movements',
    'jl_simple_means_over_user_means',
    'graph_modularity',
    'rel_tweets_in_residential_areas',
    'mean_rog',
]

### Define functions
We define the functions for scaling and normalizing.
MPD contains a series of zeros, which were replaced with NaN values to avoid distorting the graph.

(Normalizing has to be done with different approaches. Since the graph modularity is inversely correlated with tweet volume, it is multiplied instead of divided, and the relative tweet amount in residential areas is independent of tweet amount by nature.)

In [29]:
def scale0to1_column(s):
    """Min-max scale a Series to the range [0, 1], treating zeros as missing.

    Zeros are replaced with NaN before scaling, so they neither define the
    minimum nor show up as values in the result. The input Series is not
    modified.

    Parameters
    ----------
    s : pandas.Series
        Numeric column to scale.

    Returns
    -------
    pandas.Series
        Scaled copy with the same index and name; NaN wherever the input was
        0 or NaN. If all valid values are identical (or there are none), the
        value range is zero and the whole result is NaN.
    """
    # mask() returns a new Series (upcast to float where needed) instead of
    # writing NaN into the existing one, which pandas deprecates for integer columns
    s = s.mask(s == 0)
    # min()/max() skip NaN by default, so only valid values define the range
    min_value = s.min()
    max_value = s.max()
    return (s - min_value) / (max_value - min_value)

In [30]:
def export(sub, folder, key, samples=False):
    """Write a frame to ``data/post_processing/<root>/<folder>/<key>.csv``.

    Parameters
    ----------
    sub : pandas.DataFrame
        Data to write (anything offering ``to_csv``).
    folder : str
        Name of the sub-folder below the root folder.
    key : str
        File name without the ``.csv`` extension.
    samples : bool, default False
        If True the root folder is ``sample_based``, otherwise ``twitter``.

    The target folder (including missing parents) is created on demand; only
    the folder that is actually written to is created.
    """
    root = 'sample_based' if samples else 'twitter'
    out_dir = f'data/post_processing/{root}/{folder}/'
    # exist_ok avoids both the check-then-create race and failures on missing parents
    os.makedirs(out_dir, exist_ok=True)
    sub.to_csv(f'{out_dir}{key}.csv')

### All rolling window combinations - create and export - also sample based
For each rolling window (1 to 31 days), all relevant combinations of scaling, normalizing and rolling averages are created and exported.

In [31]:
# makedirs also creates missing parent folders and does not fail when the folder already exists
for out_dir in ('data/post_processing/twitter', 'data/post_processing/sample_based'):
    os.makedirs(out_dir, exist_ok=True)

In [32]:
# Metrics that are post-processed and exported
wanted_stats = ['number_unique_users', 'no_real_movements', 'jl_simple_means_over_user_means',
                'graph_modularity', 'rel_tweets_in_residential_areas', 'mean_rog']

# Window lengths (in rows) of the centered rolling mean
sizes = [28]

# First the full Twitter statistics (samples=False), then the sample based ones (samples=True)
for stats, samples in zip([statistics_twitter, statistics_twitter_sample], [False, True]):
    for key, val in stats.items():
        # Selecting columns with a list returns a new frame and the steps below
        # only derive new frames from it, so no deep copies are needed
        sub_og = val[wanted_stats]

        # only scaled
        sub = sub_og.apply(scale0to1_column)
        export(sub, 'scaled', key, samples)

        # trend, scaled
        for size in sizes:
            rm_sub = sub_og.apply(lambda x: x.rolling(size, center=True).mean())
            rm_sub = rm_sub.apply(scale0to1_column)
            export(rm_sub, 'trend_scaled', key+"_"+str(size), samples)

### Trend, then scaled - MPD
For mobile phone data, we do not explore every combination.

In [33]:
# Make trends and scale
mpds_trends = {}

for size in sizes:
    temp = mpd.apply(lambda x: x.rolling(size, center=True).mean())
    mpds_trends[str(size)] = temp.apply(scale0to1_column)

In [34]:
# Scaled MPD metrics without a rolling mean
mpd_scaled = mpd.apply(scale0to1_column)
scaled_csv = 'data/post_processing/mpd_scaled.csv'
mpd_scaled.to_csv(scaled_csv, index=True)

# One CSV per rolling window size
for window, trend in mpds_trends.items():
    trend.to_csv(f'data/post_processing/mpd_trend{window}_scaled.csv')