In [None]:
from utils import DATA_DIR
import json
from tqdm import tqdm
from time import sleep

import tweepy
from utils.bearer_token import BEARER_TOKEN

In [3]:
client = tweepy.Client(bearer_token=BEARER_TOKEN, wait_on_rate_limit=True)

## Query
In this section, we're going to build the query used for scraping Tweets relating to SAMEs for a specific event.
In particular, we're going to take the severe windstorm that struck Quebec and Ontario on December 12 2021 before moving on to Newfoundlound.

---

Here are some references about this event:
- https://www.ctvnews.ca/climate-and-environment/windstorm-deals-major-damage-to-ontario-and-quebec-heads-for-n-l-1.5704273

Note: many concurring winter events were happening across Canada around that time:
- https://www.cbc.ca/news/canada/british-columbia/snowfall-warning-southern-bc-1.6289542
- https://globalnews.ca/news/8444048/calgary-snow-squall-warning/
- https://www.saltwire.com/atlantic-canada/weather/allister-aalders-more-wet-windy-weather-sweeping-through-atlantic-canada-100667293/

Also, in the UK:
- https://www.bbc.com/news/uk-england-59552097

In [4]:
# Here we define the daterange for scraping.
# The approach I've taken here is taking the Friday and Monday around an event
# to include the weekends where more people might be tweeting before or after
# the event. In general, we might want to figure out how many days before and
# afer an event relevant tweets occur.

start_time="2021-12-04T00:00:00.000Z" # start of friday
end_time="2021-12-20T00:00:00.000Z" # end of monday

In [5]:
# Here we build the seed query composed of SAME warnings
# relevant to the event (NOTE: these are event=specific and should be adapted to each event).

warnings = [
  "storm warning",
  "snowfall warning",
  "rainfall warning",
  "freezing rain warning",
  "wind watch",
  "squall watch",
  "wind warning",
  "squall warning",
  "winter weather travel advisory"
]
query = ' OR '.join([f'"{x}"' for x in warnings])

# we also specify not to include retweets
query = f"({query}) -is:retweet"

In [6]:
# By default, the twitter API will only return a limited amount of data
# for a given tweet. To expand what is returned, we use fields and expansions.
# see: https://docs.tweepy.org/en/v4.8.0/client.html#expansions-and-fields-parameters

# see: https://developer.twitter.com/en/docs/twitter-api/data-dictionary/object-model/tweet
tweet_fields = [
  "author_id",
  "conversation_id",
  "created_at",
  "geo",
  "in_reply_to_user_id",
  "lang",
  "public_metrics",
  "referenced_tweets",
  "source",
  "reply_settings",
]

# see: https://developer.twitter.com/en/docs/twitter-api/expansions
expansions = [
  "author_id",
  "geo.place_id"
]

# see: https://developer.twitter.com/en/docs/twitter-api/data-dictionary/object-model/user
user_fields = [
  "verified"
]

# see: https://developer.twitter.com/en/docs/twitter-api/data-dictionary/object-model/place
place_fields = [
  "place_type"
]

## Scraping
To create our dataset of tweets and interactions, we will first scrape a set of "seed" tweets.
Here, a seed tweet is simply a tweet which contains a SAME relevant to the event.
We can then scrape users who have interacted with these tweets (via likes or retweets) as well as other tweets (via conversations).
The resulting dataset of seed tweets and interacting users/tweets can then provide useful insights into the dissemination SAMEs on social media.

In [7]:
scraping_dir = DATA_DIR / "2021_12_12_CA/raw"
scraping_dir.mkdir(exist_ok=True, parents=True)

### Scraping: seed tweets

In [8]:
seed_tweet_dir = scraping_dir / "seed_tweets"
seed_tweet_dir.mkdir(exist_ok=True)

search_kwargs = dict(
    query=query,
    tweet_fields=tweet_fields,
    expansions=expansions,
    user_fields=user_fields,
    start_time=start_time,
    end_time=end_time,
    max_results=100
)


for response in tqdm(tweepy.Paginator(client.search_all_tweets, **search_kwargs)):
  # prevent 1 request / s rate limit to trip up 15min wait due to 300 request / 15 min rate limit
  # see: https://github.com/tweepy/tweepy/issues/1871
  sleep(1)
  for i,tweet in enumerate(response.data):
    json_data = tweet.data
    for expansion in response.includes.keys():
      # TODO: figure out how to align places with tweets
      #       in cases where places might have e.g. a list of length 1
      #       compared to the list of length 100 for tweets
      if len(response.includes[expansion]) == len(response.data):
        for k,v in response.includes[expansion][i].items():
          json_data[f"_{expansion}_{k}"] = v

    tweet_path = seed_tweet_dir / f"{json_data['id']}.json"
    tweet_path.write_text(json.dumps(json_data, indent=2))

288it [07:05,  1.48s/it]


### Scraping: reply tweets

In [7]:
seed_tweet_dir = scraping_dir / "seed_tweets"
reply_dir = scraping_dir / "reply_tweets"
reply_dir.mkdir(exist_ok=True)

num_retry = 3

already_searched_tweets_p = reply_dir / "already_searched_tweets.txt"
already_searched_tweets_p.touch(exist_ok=True)
already_searched_tweets = already_searched_tweets_p.read_text().splitlines()
already_searched_tweets = set(already_searched_tweets)

for tweet_json in tqdm(list(seed_tweet_dir.glob("*.json"))):
  tweet = json.loads(tweet_json.read_text())
  conversation_id = tweet['conversation_id']

  if conversation_id in already_searched_tweets:
    continue

  # skip tweets/conversations with 0 replies
  num_replies = tweet['public_metrics']['reply_count']
  if not (conversation_id == tweet['id'] and num_replies == 0):
    reply_kwargs = dict(
      query=f"conversation_id:{conversation_id}",
      tweet_fields=tweet_fields,
      expansions=expansions,
      user_fields=user_fields,
      max_results=100,
    )
    for attempt in range(num_retry):
      try:
        for response in tweepy.Paginator(client.search_all_tweets, **reply_kwargs):
          sleep(1)
          for reply in response.data or []:
            json_data = reply.data
            reply_path = reply_dir / f"{json_data['id']}.json"
            reply_path.write_text(json.dumps(json_data, indent=2))
      except Exception as e:
        if attempt < num_retry:
          client = tweepy.Client(bearer_token=BEARER_TOKEN, wait_on_rate_limit=True)
          continue
        else:
          raise e
      else:
        break

  already_searched_tweets.add(conversation_id)
  txt = "\n".join([str(x) for x in already_searched_tweets])
  already_searched_tweets_p.write_text(txt)

100%|██████████| 28424/28424 [00:03<00:00, 7475.13it/s]


### Scraping: liking users

In [7]:
seed_tweet_dir = scraping_dir / "seed_tweets"
liking_user_dir = scraping_dir / "liking_users"
liking_user_dir.mkdir(exist_ok=True)

user_fields = [
  "verified",
  "created_at",
  "description",
  "location",
  "protected",
  "public_metrics",
]

num_retry = 3

already_searched_tweets = set()
for user_json in tqdm(list(liking_user_dir.glob("*.json"))):
  user = json.loads(user_json.read_text())
  for tweet_id in user['_source_tweet_id']:
    already_searched_tweets.add(tweet_id)

for tweet_json in tqdm(list(seed_tweet_dir.glob("*.json"))):
  tweet = json.loads(tweet_json.read_text())
  tweet_id = tweet['id']
  if  tweet_id in already_searched_tweets:
    continue

  elif tweet['public_metrics']['like_count']:
      glu_kwargs = dict(
        id=tweet_id,
        user_fields=user_fields
      )
      for attempt in range(num_retry):
        try:
          for response in tweepy.Paginator(client.get_liking_users, **glu_kwargs):
            sleep(2)
            for user in response.data or []:
              json_data = user.data
              user_path = liking_user_dir / f"{json_data['id']}.json"
              if user_path.exists():
                json_data = json.loads(user_path.read_text())
              else:
                json_data['_source_tweet_id'] = []
              json_data['_source_tweet_id'].append(tweet_id)
              user_path.write_text(json.dumps(json_data, indent=2))
        except Exception as e:
          if attempt < num_retry:
            client = tweepy.Client(bearer_token=BEARER_TOKEN, wait_on_rate_limit=True)
            continue
          else:
            raise e
        else:
          break

100%|██████████| 30515/30515 [00:04<00:00, 6821.30it/s]
 29%|██▉       | 8195/28424 [02:41<05:57, 56.56it/s]Rate limit exceeded. Sleeping for 739 seconds.
 54%|█████▍    | 15392/28424 [33:58<06:06, 35.58it/s]  Rate limit exceeded. Sleeping for 740 seconds.
 78%|███████▊  | 22306/28424 [1:05:23<02:01, 50.37it/s]   Rate limit exceeded. Sleeping for 739 seconds.
 85%|████████▍ | 24068/28424 [1:20:22<1:05:36,  1.11it/s]Rate limit exceeded. Sleeping for 739 seconds.
 85%|████████▌ | 24238/28424 [1:35:23<46:10,  1.51it/s]   Rate limit exceeded. Sleeping for 740 seconds.
 86%|████████▌ | 24377/28424 [1:50:26<2:01:17,  1.80s/it] Rate limit exceeded. Sleeping for 740 seconds.
 86%|████████▌ | 24500/28424 [2:21:41<41:35,  1.57it/s]     Rate limit exceeded. Sleeping for 739 seconds.
 87%|████████▋ | 24675/28424 [2:36:48<58:10,  1.07it/s]   Rate limit exceeded. Sleeping for 739 seconds.
 87%|████████▋ | 24847/28424 [3:08:04<53:20,  1.12it/s]     Rate limit exceeded. Sleeping for 740 seconds.
 88%|

### Scraping: retweeting users

In [22]:
seed_tweet_dir = scraping_dir / "seed_tweets"
rt_user_dir = scraping_dir / "rt_users"
rt_user_dir.mkdir(exist_ok=True)

user_fields = [
  "verified",
  "created_at",
  "description",
  "location",
  "protected",
  "public_metrics",
]

num_retry = 3

already_searched_tweets = set()
for user_json in tqdm(list(rt_user_dir.glob("*.json"))):
  user = json.loads(user_json.read_text())
  for tweet_id in user['_source_tweet_id']:
    already_searched_tweets.add(tweet_id)

for tweet_json in tqdm(list(sorted(seed_tweet_dir.glob("*.json")))):
  tweet = json.loads(tweet_json.read_text())
  tweet_id = tweet['id']
  if  tweet_id in already_searched_tweets:
    continue

  elif tweet['public_metrics']['retweet_count']:
      glu_kwargs = dict(
        id=tweet_id,
        user_fields=user_fields
      )
      for attempt in range(num_retry):
        try:
          for response in tweepy.Paginator(client.get_retweeters, **glu_kwargs):
            sleep(1)
            for user in response.data or []:
              json_data = user.data
              user_path = rt_user_dir / f"{json_data['id']}.json"
              if user_path.exists():
                json_data = json.loads(user_path.read_text())
              else:
                json_data['_source_tweet_id'] = []
              json_data['_source_tweet_id'].append(tweet_id)
              user_path.write_text(json.dumps(json_data, indent=2))
              already_searched_tweets.add(tweet_id)
        except Exception as e:
          if attempt < num_retry:
            client = tweepy.Client(bearer_token=BEARER_TOKEN, wait_on_rate_limit=True)
            continue
          else:
            raise e
        else:
          break

100%|██████████| 8874/8874 [00:01<00:00, 6138.57it/s]
  0%|          | 121/28424 [00:00<00:30, 916.88it/s]


KeyboardInterrupt: 

### Scraping: seed tweet users

In [9]:
seed_tweet_dir = scraping_dir / "seed_tweets"
seed_user_dir = scraping_dir / "seed_users"
seed_user_dir.mkdir(exist_ok=True)

user_fields = [
  "verified",
  "created_at",
  "description",
  "location",
  "protected",
  "public_metrics",
]

num_retry = 3

already_searched_users = set()
for user_json in tqdm(list(seed_user_dir.glob("*.json"))):
  author_id = user_json.stem
  already_searched_users.add(author_id)

user_queue = []
for tweet_json in tqdm(list(sorted(seed_tweet_dir.glob("*.json")))):
  tweet = json.loads(tweet_json.read_text())
  author_id = tweet['author_id']
  if  author_id in already_searched_users:
    continue
  # batch queries to max of 100 ids to reduce API overhead
  elif len(user_queue) < 100:
    user_queue.append(author_id)
    already_searched_users.add(author_id)
  else:
      glu_kwargs = dict(
        ids=user_queue,
        user_fields=user_fields
      )
      user_queue = []
      for attempt in range(num_retry):
        try:
          response = client.get_users(**glu_kwargs)
          sleep(1)
          for user in response.data or []:
            json_data = user.data
            user_path = seed_user_dir / f"{json_data['id']}.json"
            user_path.write_text(json.dumps(json_data, indent=2))
        except Exception as e:
          if attempt < num_retry:
            client = tweepy.Client(bearer_token=BEARER_TOKEN, wait_on_rate_limit=True)
            continue
          else:
            raise e
        else:
          break

100%|██████████| 6495/6495 [00:00<00:00, 910616.54it/s]
100%|██████████| 28424/28424 [00:02<00:00, 10822.43it/s]


## EDA

### EDA: Stratifying seed tweets by SAME type
- How much are different SAMEs tweeted?
- How much are different SAMEs interacted with (retweets, likes, replies)?
- How much do different SAMEs co-occur in the same tweet?
- What kind of messages are SAMEs shared in (start of warning, end of warning, sharing of opinion, etc...)

In [24]:
# load seed tweets
scraping_dir = DATA_DIR / "2021_12_12_CA/raw"
seed_tweet_dir = scraping_dir / "seed_tweets"
seed_tweets = []
for tweet_json in tqdm(list(sorted(seed_tweet_dir.glob("*.json")))):
  tweet = json.loads(tweet_json.read_text())
  seed_tweets.append(tweet)

100%|██████████| 28424/28424 [00:01<00:00, 27388.03it/s]


In [14]:
# gather stats
sames = [
  "storm warning",
  "snowfall warning",
  "rainfall warning",
  "freezing rain warning",
  "wind watch",
  "squall watch",
  "wind warning",
  "squall warning",
  "winter weather travel advisory"
]
same_stats = {
  k: dict(
    tweet_count=0,
    like_count=0,
    retweet_count=0,
    reply_count=0,
    quote_count=0,
    like_histo=[],
    retweet_histo=[],
    reply_histo=[],
    quote_histo=[],
    timestamp_histo=[],

  )
  for k in sames
}
same_cooccurrences = {
  k: {s: 0 for s in sames}
  for k in sames
}
for st in tqdm(seed_tweets):
  for same in sames:
    if same in st['text']:
      same_stats[same]['tweet_count'] += 1
      same_stats[same]['like_count'] += st['public_metrics']['like_count']
      same_stats[same]['retweet_count'] += st['public_metrics']['retweet_count']
      same_stats[same]['reply_count'] += st['public_metrics']['reply_count']
      same_stats[same]['quote_count'] += st['public_metrics']['quote_count']
      same_stats[same]['like_histo'] += [st['public_metrics']['like_count']]
      same_stats[same]['retweet_histo'] += [st['public_metrics']['retweet_count']]
      same_stats[same]['reply_histo'] += [st['public_metrics']['reply_count']]
      same_stats[same]['quote_histo'] += [st['public_metrics']['quote_count']]
      same_stats[same]['timestamp_histo'] += [st['created_at']]
      for cosame in sames:
        if cosame in st['text']:
          same_cooccurrences[same][cosame] += 1

100%|██████████| 28424/28424 [00:00<00:00, 184505.39it/s]


In [15]:
# visualize stats
import plotly.express as px
import plotly.graph_objects as go
color_seq = px.colors.qualitative.Pastel1
color_map = {s: color_seq[i] for i,s in enumerate(sames)}

In [16]:
# counts
fig_counts = go.Figure()
for s in sames:
  xx = ['tweet', 'like', 'retweet', 'quote']
  x = [f"# of {_x}s" for _x in xx]
  y = [same_stats[s][f"{_x}_count"] for _x in xx]
  fig_counts.add_trace(go.Bar(x=x, y=y, name=s, marker_color=color_map[s]))
fig_counts.update_layout(template='simple_white')
fig_counts.update_yaxes(type="log", title="Counts (log-scale)")
fig_counts.show()
fig_counts.update_yaxes(type="linear", title="Counts")
fig_counts.show()

In [17]:
# histograms
from plotly.subplots import make_subplots

rows = ['like', 'retweet', 'quote']
fig_histo = make_subplots(rows=len(rows), cols=1, subplot_titles=rows)
add_legend = True
for i,row in enumerate(rows):
  for s in sames:
    fig_histo.add_trace(
      go.Histogram(
        x=same_stats[s][f"{row}_histo"],
        name=s,
        marker_color=color_map[s],
        showlegend=add_legend,
        bingroup=i,
        nbinsx=100,
      ),
    i+1,  # subplot row
    1,    # subplot col
    )
  add_legend = False

fig_histo.update_layout(
  template='simple_white',
  barmode='overlay',
  autosize=False,
  width=1000,
  height=2000,
)
fig_histo.update_traces(opacity=0.75)
fig_histo.update_yaxes(type="log", title="Number of Tweets")
fig_histo.update_xaxes(type="linear", title="Tweet Interactions")
fig_histo.show()

In [18]:
# timeseries
fig_histo = go.Figure()
for s in sames:
  fig_histo.add_trace(
    go.Histogram(
      x=[_x[:-1] for _x in same_stats[s][f"timestamp_histo"]],
      name=s,
      marker_color=color_map[s],
      nbinsx=256,
    )
  )

fig_histo.update_layout(
  template='simple_white',
  barmode='overlay',
)
fig_histo.update_traces(opacity=0.75)
fig_histo.update_yaxes(type="log", title="Number of Tweets")
fig_histo.update_xaxes(title="Datetime (UTC-0)")
fig_histo.show()

## Blizzard Hashtags

In [26]:
from collections import defaultdict
import re

sames = [
  "storm warning",
  "snowfall warning",
  "rainfall warning",
  "freezing rain warning",
  "wind watch",
  "squall watch",
  "wind warning",
  "squall warning",
  "winter weather travel advisory"
]

# same -> hashtag -> counts/interactions
same_hashtags = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
hashtag_regex = r'\B#\w*[a-zA-Z]+\w*' # https://stackoverflow.com/a/54147208

for st in tqdm(seed_tweets):
  hashtags = re.findall(hashtag_regex, st['text'])
  for same in sames:
    if same in st['text']:
      for hashtag in hashtags:
        same_hashtags[same][hashtag]['tweet_count'] += 1
        same_hashtags[same][hashtag]['like_count'] += st['public_metrics']['like_count']
        same_hashtags[same][hashtag]['retweet_count'] += st['public_metrics']['retweet_count']
        same_hashtags[same][hashtag]['reply_count'] += st['public_metrics']['reply_count']
        same_hashtags[same][hashtag]['quote_count'] += st['public_metrics']['quote_count']

num_samples = 5
for same in same_hashtags.keys():
  print()
  print(same)
  print("-"*20)
  for k in sorted(same_hashtags[same].keys(), key=lambda x: same_hashtags[same][x]['tweet_count'])[-num_samples:]:
    print(f"{k}\n\t{json.dumps(same_hashtags[same][k])}")

100%|██████████| 28424/28424 [00:00<00:00, 168090.08it/s]


squall watch
--------------------
#Keswick
	{"tweet_count": 3, "like_count": 15, "retweet_count": 5, "reply_count": 1, "quote_count": 0}
#Sutton
	{"tweet_count": 3, "like_count": 15, "retweet_count": 5, "reply_count": 1, "quote_count": 0}
#nlwx
	{"tweet_count": 4, "like_count": 34, "retweet_count": 2, "reply_count": 2, "quote_count": 1}
#Georgina
	{"tweet_count": 4, "like_count": 17, "retweet_count": 8, "reply_count": 1, "quote_count": 1}
#ONStorm
	{"tweet_count": 5, "like_count": 18, "retweet_count": 7, "reply_count": 1, "quote_count": 1}

--------------------
#TwitchAffiliate
	{"tweet_count": 10, "like_count": 34, "retweet_count": 7, "reply_count": 1, "quote_count": 0}
#StreamRaiders
	{"tweet_count": 10, "like_count": 34, "retweet_count": 7, "reply_count": 1, "quote_count": 0}
#TwitchStreamer
	{"tweet_count": 10, "like_count": 34, "retweet_count": 7, "reply_count": 1, "quote_count": 0}
#YourStorm
	{"tweet_count": 10, "like_count": 34, "retweet_count": 7, "reply_count": 1, "quote_cou




### TF-IWF (Term-Frequency Inverse-Warning-Frequency)

In [27]:
from utils import DATA_DIR
import json
from tqdm import tqdm

scraping_dir = DATA_DIR / "2021_12_12_CA/raw"
seed_tweet_dir = scraping_dir / "seed_tweets"
seed_tweets = []
for tweet_json in tqdm(list(sorted(seed_tweet_dir.glob("*.json")))):
  tweet = json.loads(tweet_json.read_text())
  seed_tweets.append(tweet)

100%|██████████| 28424/28424 [00:00<00:00, 46578.38it/s]


In [28]:
sames = [
  "storm warning",
  "snowfall warning",
  "rainfall warning",
  "freezing rain warning",
  "wind watch",
  "squall watch",
  "wind warning",
  "squall warning",
  "winter weather travel advisory"
]

warning_tweets = {w: [] for w in sames}
for tweet in tqdm(seed_tweets):
  tweet_text = tweet['text']
  for w in sames:
    if w in tweet_text:
      warning_tweets[w].append(tweet_text)

100%|██████████| 28424/28424 [00:00<00:00, 391679.11it/s]


In [29]:
from utils.tf_iwf import tf_iwf, wtf, tokenizers

topk = 10

hf_iwf_dict = tf_iwf(warning_tweets, tokenizers['hashtag'])
tf_iwf_dict = tf_iwf(warning_tweets, tokenizers['ws'])
hf_wtf_dict = wtf(warning_tweets, tokenizers['hashtag'])
tf_wtf_dict = wtf(warning_tweets, tokenizers['ws'])

Iterating over tweets for wind watch: 100%|██████████| 137/137 [00:00<00:00, 135204.62it/s]
Iterating over tweets for squall watch: 100%|██████████| 139/139 [00:00<00:00, 165158.15it/s]
Iterating over tweets for winter weather travel advisory: 100%|██████████| 59/59 [00:00<00:00, 119953.43it/s]
Iterating over tweets for wind watch: 100%|██████████| 137/137 [00:00<00:00, 70488.18it/s]
Iterating over tweets for squall watch: 100%|██████████| 139/139 [00:00<00:00, 178453.71it/s]
Iterating over tweets for winter weather travel advisory: 100%|██████████| 59/59 [00:00<00:00, 68969.88it/s]


In [30]:
# TF-IWF scores for tokens and hashtags
tf_iwf_dict = {w: {k: v for (_, (k,v)) in zip(range(topk), d.items())}
               for w,d in tf_iwf_dict.items()}
hf_iwf_dict = {w: {k: v for (_, (k,v)) in zip(range(topk), d.items())}
               for w,d in hf_iwf_dict.items()}
print(json.dumps(tf_iwf_dict, indent=4))
print(json.dumps(hf_iwf_dict, indent=4))

{
        "\u2744\ufe0fWinter": 0.012992446497158666,
        "Sun": 0.0056680194878042794,
        "storm": 0.005044514672618384,
        "effect:": 0.0040882742713506965,
        "Mon": 0.003908086496157289,
        "Fri": 0.003614207841857802,
        "Sat": 0.0034005271976655324,
        "Winter": 0.0031156238127350356,
        "Philippines": 0.0025881367524220452,
        "Thu": 0.0023031716639289913
    },
        "Vancouver": 0.0051737306031616305,
        "snowfall": 0.0044277160824144155,
        "Base": 0.003239151711060766,
        "Edmonton": 0.0026992930925506383,
        "Fraser": 0.002217313115640699,
        "province.": 0.002217313115640699,
        "\u2744\ufe0f": 0.002217313115640699,
        "camp": 0.002217313115640699,
        "statement,": 0.002217313115640699,
        "@50ShadesofVan": 0.0021594344740405104
    },
        "rainfall": 0.03260276687694253,
        "na": 0.011027663412768347,
        "sa": 0.010068736159484143,
        "red": 0.008530665328383438,


In [31]:
# TF-WTF scores for tokens and hashtags
tf_wtf_dict = {w: {k: v for (_, (k,v)) in zip(range(topk), d.items())}
               for w,d in tf_wtf_dict.items()}
hf_wtf_dict = {w: {k: v for (_, (k,v)) in zip(range(topk), d.items())}
               for w,d in hf_wtf_dict.items()}
print(json.dumps(tf_wtf_dict, indent=4))
print(json.dumps(hf_wtf_dict, indent=4))

{
        "storm": 0.8774619859326536,
        "winter": 0.16952184252162464,
        "\u2744\ufe0fWinter": 0.1403018446059251,
        "Winter": 0.10636109558412521,
        "Philippines": 0.02627166014533259,
        "phone": 0.023710402709760305,
        "Sun": 0.020363391574556577,
        "hivernale": 0.01956400223588597,
        "temp\u00eate": 0.01956400223588597,
        "Fri": 0.01839196523557944
    },
        "snowfall": 0.4455429857084006,
        "Vancouver": 0.05211349160393746,
        "Edmonton": 0.03184713375796178,
        "province.": 0.027297543221110096,
        "@50ShadesofVan": 0.025477707006369428,
        "Princeton": 0.025477707006369428,
        "#yeg": 0.021231422505307858,
        "season's": 0.01910828025477707,
        "@BCWeather101": 0.01910828025477707,
        "5:15pm": 0.01910828025477707
    },
        "rainfall": 0.590790255496138,
        "sa": 0.061660079051383404,
        "na": 0.047999999999999994,
        "kami": 0.03181818181818182,
        "