In [None]:
import pandas as pd
import numpy as np

### Load the social media dataset and turn it into a social media post dataset that can be parsed by MAT-Builder.

In [None]:
# City to process; it selects the input file read in this cell.
city = 'paris'

# Read the enriched posts for that city and show them as loaded,
# before any column is dropped.
social_dataset = pd.read_parquet(f'./enriched_occasional_{city}_posts.parquet')
display(social_dataset)

# Keep the user id, the two datetime columns and the two candidate texts.
social_dataset = social_dataset.loc[
    :,
    [
        'uid',
        'datetime',
        'leaving_datetime',
        'positive',
        'negative',
    ],
]

In [None]:
# Load the Rome tweet dataset and inspect its schema and contents.
# NOTE(review): the path is hard-coded and does not follow `city`; presumably
# this file serves as a reference for the target format -- confirm.
social_rome = pd.read_parquet('./tweets_rome.parquet')

# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in display() would render an extra "None" output.
social_rome.info()
display(social_rome)

### Create fictitious time instants for the social media posts, based on the time intervals spanned by the stops.

In [None]:
# Drop the columns that we don't need. The explicit copy gives this cell its
# own frame, so the column assignment below never writes into a slice of the
# frame that was loaded from disk.
social_dataset = social_dataset.loc[:, ['uid', 'datetime', 'leaving_datetime', 'positive', 'negative']].copy()

# Stop boundaries as integer timestamps.
# NOTE(review): pd.to_datetime() below interprets plain integers as
# nanoseconds, so this assumes both columns are stored as datetime64[ns] --
# confirm for the parquet files in use.
start_int = social_dataset["datetime"].astype("int64")
end_int   = social_dataset["leaving_datetime"].astype("int64")

# For each stop, choose a random time instant within the time interval covered
# by the stop. Drawing directly between start and end fails with
# "low >= high" as soon as a single stop has zero (or negative) duration, so
# an offset in [0, max(width, 1)) is drawn instead and added to the start:
# regular stops still get a uniform instant in [start, end), degenerate stops
# simply get their start instant.
stop_width = np.maximum(end_int.values - start_int.values, 1)

# Local seeded generator: the dataset is written to disk later on, so the
# sampled instants should be identical on every Restart & Run All.
time_rng = np.random.default_rng(42)
rand_offsets = time_rng.integers(low = 0, high = stop_width, dtype = np.int64)
rand_ints = start_int.values + rand_offsets

social_dataset["tweet_created"] = pd.to_datetime(rand_ints)
display(social_dataset)

### From the original social media dataset, for each stop choose the positive text with probability $p$, and the negative text otherwise

In [None]:
# Probability of picking the positive text for a stop; the negative text is
# used otherwise.
p = 0.5

# Local seeded generator: the choice is reproducible on every run and does not
# depend on how many random numbers other cells have consumed.
text_rng = np.random.default_rng(43)
mask = text_rng.random(len(social_dataset)) < p

# Build the post dataset in one non-mutating chain:
#   1. pick the text of each post according to the mask,
#   2. turn the current index into the `tweet_id` column,
#   3. keep only the columns of the final post dataset.
social_dataset = (
    social_dataset
    .assign(text = np.where(mask, social_dataset["positive"], social_dataset["negative"]))
    .reset_index(names = 'tweet_id')
    .loc[:, ['tweet_id', 'uid', 'tweet_created', 'text']]
)
display(social_dataset)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in display() would render an extra "None" output.
social_dataset.info()

### Write the social media post dataset for MAT-Builder to disk

In [None]:
# Destination file, named after the city being processed.
path_out = './tweets_{}.parquet'.format(city)

# Write the post dataset to that file.
social_dataset.to_parquet(path_out)