In [1]:
import numpy as np
import pandas as pd
import notebooks_loops_script as nls
import backend_codes.load_subsets as ls

# Metadata
This notebook calculates simple metadata statistics for the respective rolling window. Since the notebook is only for demonstrative purposes, an example is shown here. The parameters of the example rolling window are defined below.

In [2]:
# Parameters of the example rolling window.
start = "20200406"  # start date of the rolling window
end = "20200409"  # end date; passed to the loader together with `start`
denom = "3days"  # denomination; becomes part of the statistics file name
allow_even_subsets = False  # forwarded to nls.middeling when computing the window's middle
sam = False  # forwarded as samp_size; an int value is also appended to the file name
del_one_tweeters = True  # forwarded to ls.load_and_subset

Six parameters have to be defined for each rolling window.
- start: the start date of the rolling window
- end: the end date (at 00:00) so tweets on this date are not included
- denom: this is for the name of the statistic-file that contains the results
- allow_even_subsets: This is relevant for calculating the middle of the respective rolling window
- sam: we also explored an approach using a fixed number of tweets for each rolling window of the same size. This can be defined through this parameter.
- del_one_tweeters: Since we need at least two tweets of a person in order to infer a movement, tweets of users who only sent one tweet in the respective rolling window are deleted

### Create Reference
Create a reference to name the file individually.

In [3]:
# Name of the statistics file, derived from the window denomination.
ref = f"statistics_notebookdemo/{denom}_overlap.csv"
# Only a genuine int sample size is appended; the default `False` is a bool
# and therefore leaves the name untouched.
if type(sam) is int:
    ref = f"{ref.split('.csv')[0]}_{sam}.csv"

In [4]:
# Location of the statistics file for this denomination.
path = f"data/{ref}"
# Helper from the project's notebook utilities; presumably ensures the file
# exists before it is read -- TODO confirm against notebooks_loops_script.
nls.check_file(path)
# Read the very same path that was just checked (instead of rebuilding the
# string) so the two can never drift apart. Rows are indexed by middle date.
file = pd.read_csv(path, index_col='middle_date')

In [5]:
# Show which statistics file this run reads from and writes to.
print(ref)

statistics_notebookdemo/3days_overlap.csv


### Load Tweets

In [6]:
# Load the preprocessed tweets and subset them to the configured window.
tweets_df = ls.load_and_subset(
    start,
    end,
    del_one_tweeters=del_one_tweeters,
    samp_size=sam,
    tweets_path="data/tweets/preprocessed_tweets_with_poi_location.csv",
)

# Calculate Metadata and other Basic Statistics
Based on the start and end dates, we take a subset of our tweets, calculate metadata and other basic statistics, and write them into the respective .csv file.

In [7]:
# Collects every statistic of this window, keyed by its column name in the statistics file.
stats = {}

In [8]:
# Window boundaries, stored exactly as configured.
stats['start_date'] = start
stats['end_date'] = end

# Group once and reuse the grouping for every per-user statistic.
tweets_by_user = tweets_df.groupby('User_ID')

stats["no_of_tweets"] = len(tweets_df)
stats["number_unique_users"] = tweets_df.User_ID.nunique()
stats["median_tweets_per_user"] = tweets_by_user['Tweet_ID'].count().median()

# A window without users has no meaningful mean: report NaN explicitly
# rather than hiding the division behind a catch-all `except`.
if stats["number_unique_users"]:
    stats["mean_tweets_per_user"] = stats["no_of_tweets"] / stats["number_unique_users"]
else:
    stats["mean_tweets_per_user"] = np.nan

# Non-null values per user and column; kept under its original name.
counted = tweets_by_user.count()
stats["n_user_more_than_one_tweet"] = int((counted['Timestamp'] > 1).sum())

# Count distinct values only for the column of interest instead of for every column.
stats["n_users_with_more_than_one_location_point"] = int((tweets_by_user['wkt'].nunique() > 1).sum())
stats["n_users_with_more_than_one_cod"] = int((tweets_by_user['cod'].nunique() > 1).sum())

In [9]:
# Middle of the rolling window as an int; it is used as the row label
# when the statistics are written into the frame.
middle = int(nls.middeling(start, end, allow_even_subsets=allow_even_subsets))

### Write into .csv
We load the existing .csv by using the file name reference derived from the denomination.
Then we overwrite the old data.

Create the .csv if it does not exist yet.

In [10]:
# Put every statistic into the row of this window's middle date,
# one column at a time (existing values in that row are overwritten).
for column, value in stats.items():
    file.loc[middle, column] = value

In [11]:
# Write back to the same `path` the statistics were loaded from, rather than
# rebuilding the string, so the read and write targets always match.
file.to_csv(path)

In [12]:
# Echo the statistics of this window, one "name : value" line each.
for label, value in stats.items():
    print(f"{label} : {value!s}")

start_date : 20200406
end_date : 20200409
no_of_tweets : 343
number_unique_users : 103
median_tweets_per_user : 2.0
mean_tweets_per_user : 3.3300970873786406
n_user_more_than_one_tweet : 103
n_users_with_more_than_one_location_point : 42
n_users_with_more_than_one_cod : 30
