# Visualize data for Tweets paper
Plots included:
* Histogram of keyword occurrence with rendered emojis
* Optional: word cloud (rendering emojis causes issues)

In [None]:
# Reload imported modules automatically before each cell is executed, so edits
# to external libraries are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import re
import glob
import zipfile
import pathlib
import logging
import json
import string

import numpy as np
import pandas as pd
import matplotlib.font_manager
import matplotlib.pyplot as plt
import matplotlib

import matplotlib.offsetbox


import wget

import requests
import base64
from bs4 import BeautifulSoup

import wordcloud
import tweepy
import a2.twitter.downloader
import a2.plotting
import a2.dataset
import a2.utils

In [None]:
# Data root as resolved by the project's a2 helper.
FOLDER_DATA = a2.utils.file_handling.get_folder_data()
FOLDER_TWEETS = FOLDER_DATA / "tweets/"
# Tweets dataset loaded by this notebook (a NetCDF file, judging by the .nc suffix).
FILE_TWEETS = (
    FOLDER_TWEETS
    / "2017_2020_tweets_rain_sun_vocab_emojis_locations_bba_Tp_era5_no_bots_normalized_filtered_weather_stations_fix_predicted_simpledeberta_radar.nc"
)
# Output directory for the figures saved below; relative to the working directory.
FOLDER_FIGURES = pathlib.Path("../../figures/data/tweets/")
# Font file used for rendering emojis; kept as a plain string because it is also
# passed to the shell (`!ls $FILE_FONT`) and to font_path/fname arguments.
FILE_FONT = "../fonts/noto/NotoEmoji-VariableFont_wght.ttf"

In [None]:
# Load the tweets dataset; the cells below read its `text_normalized` and
# `text_original` variables.
ds_tweets = a2.dataset.load_dataset.load_tweets_dataset(FILE_TWEETS)

In [None]:
ds_tweets

In [None]:
header = "🏔️ OR 🏔️ OR ☀️ OR ☀️ OR 🌞 OR ⛅ OR ⛈️ OR ⛈️ OR 🌤️ OR 🌤️ OR 🌥️ OR 🌥️ OR 🌦️ OR 🌦️ OR 🌧️ OR 🌧️ OR 🌨️ OR 🌨️ OR 🌩️ OR 🌩️ OR ☔ OR ⛄ OR blizzard OR cloudburst OR downpour OR drizzle OR flash flood OR flood OR flood stage OR forecast OR freezing rain OR hail OR ice storm OR lightning OR precipitation OR rain OR rain gauge OR rain shadow OR rainbands OR rain shower OR snow OR snow shower OR snowstorm OR sun OR sunny OR thunder OR thunderstorm"
# The query string joins its terms with " OR "; splitting on that separator
# recovers the individual keywords and keeps multi-word terms such as
# "flash flood" intact.
keywords = header.split(" OR ")

header_no_alts = "🏔️ OR ☀️ OR 🌞 OR ⛅ OR ⛈️ OR 🌤️ OR 🌥️ OR 🌦️ OR 🌧️ OR 🌨️ OR 🌩️ OR ☔ OR ⛄ OR blizzard OR cloudburst OR downpour OR drizzle OR flash flood OR flood OR flood stage OR forecast OR freezing rain OR hail OR ice storm OR lightning OR precipitation OR rain OR rain gauge OR rain shadow OR rainbands OR rain shower OR snow OR snow shower OR snowstorm OR sun OR sunny OR thunder OR thunderstorm"
# Same split for the query that lists each emoji only once.
# NOTE(review): `header` appears to list most emojis twice, presumably with and
# without a variation selector -- confirm before comparing the two lists.
keywords_no_alts = header_no_alts.split(" OR ")
keywords_no_alts

In [None]:
# Concatenate all tweets into one space-separated string per text variant, so
# keyword counting and the word cloud can operate on a single corpus string.
text_normalized = " ".join(ds_tweets.text_normalized.values)
text_original = " ".join(ds_tweets.text_original.values)

In [None]:
!ls $FILE_FONT

In [None]:
def plot_wordcloud():
    """Show a word cloud of the original tweet text, including emojis.

    Reads the notebook globals ``text_original`` (the corpus string) and
    ``FILE_FONT`` (the font file used for rendering). The tokenizer regexp
    splits the text into regular words, runs of punctuation ("ASCII art"
    such as ``:)``) and single non-ASCII symbols such as emojis, so that
    emojis are counted as tokens of their own.
    """
    normal_word = r"(?:\w[\w']+)"
    # two or more consecutive punctuation characters, e.g. ":)"
    ascii_art = r"(?:[{punctuation}][{punctuation}]+)".format(punctuation=string.punctuation)
    # a single non-whitespace character that is neither a word character nor
    # any other ASCII printable, i.e. emojis and similar symbols
    emoji = r"(?:[^\s])(?<![\w{ascii_printable}])".format(ascii_printable=string.printable)
    regexp = r"{normal_word}|{ascii_art}|{emoji}".format(normal_word=normal_word, ascii_art=ascii_art, emoji=emoji)

    # The font must contain emoji glyphs, otherwise the emoji tokens matched
    # above cannot be drawn; FILE_FONT supplies the font used here.
    wc = wordcloud.WordCloud(font_path=FILE_FONT, regexp=regexp, width=800, height=400).generate(text_original)

    plt.imshow(wc)
    plt.axis("off")
    plt.show()


plot_wordcloud()

In [None]:
plt.close("all")
%matplotlib inline

occurence = [text_original.count(k) for k in keywords]

# matplotlib.use("module://mplcairo.tk")
prop = matplotlib.font_manager.FontProperties(fname=FILE_FONT)
plt.rcParams["axes.unicode_minus"] = False  # These two lines need to be set manually
plt.rcParams["font.family"] = prop.get_family()
fig, axs = plt.subplots(2, 1, figsize=(20, 10), constrained_layout=True)
for i, log in enumerate([True, False]):
    ax = axs[i]
    plot = ax.bar(np.arange(len(occurence)), occurence)
    labels = ["{}".format(x) for x in keywords]
    for rect1, label in zip(plot, labels):
        height = rect1.get_height()
        ax.annotate(
            label,
            (rect1.get_x() + rect1.get_width() / 2, height + 5),
            ha="center",
            va="bottom",
            fontsize=16,
            fontproperties=prop,
            rotation=90,
        )
    ax.tick_params(axis="x", labelrotation=90)
    ax.set_xlabel("keywords")
    ax.set_ylabel("counts")
    if log:
        ax.set_yscale("log")
plt.draw()
plt.show()
fig.savefig(FOLDER_FIGURES / "word_count_all.pdf")

In [None]:
# Downloading emoji images taken from https://rémy.be/posts/plot-emoji-with-matplotlib/


def download_emoji(link, to_folder="emoji_images/"):
    """Download the emoji PNGs embedded in an emoji chart page.

    The page at ``link`` is fetched and the rows of its first ``<table>`` are
    scanned. For every row that has both a ``code`` cell and an ``andr alt``
    image cell, the base64 payload of the image's data URI is decoded and
    written to ``<to_folder>/<code>.png``. ``<code>`` is the text of the code
    cell with its first two characters (the leading ``U+``) removed and every
    ``" U+"`` separator replaced by ``_``; the file name is lower-cased.

    Parameters
    ----------
    link : str
        URL of the chart page to scrape.
    to_folder : str
        Directory that receives the PNG files; created if it does not exist.

    Raises
    ------
    requests.HTTPError
        If the request for ``link`` returns an error status.
    ValueError
        If the page contains no ``<table>`` element.
    """
    response = requests.get(link)
    response.raise_for_status()
    table = BeautifulSoup(response.text, "html.parser").find("table")
    if table is None:
        raise ValueError(f"No <table> element found at {link}")

    # Create the target directory once; the files themselves are created by open().
    os.makedirs(to_folder, exist_ok=True)

    # The image sources are expected to be data URIs starting with this prefix.
    prefixlen = len("data:image/png;base64,")
    # The first three rows are skipped (presumably header rows -- TODO confirm).
    for row in table.find_all("tr")[3:]:
        code_cells = row.find_all("td", attrs={"class": "code"})
        image_cells = row.find_all("td", attrs={"class": "andr alt"})
        if not (code_cells and image_cells):
            continue
        code = code_cells[0].text[2:].replace(" U+", "_")
        data = image_cells[0].img["src"]
        filename = os.path.join(to_folder, f"{code}.png".lower())
        with open(filename, "wb") as fh:
            fh.write(base64.decodebytes(data[prefixlen:].encode("utf-8")))
        print("Wrote to:", filename)

In [None]:
# Chart pages scraped by download_emoji, which expects the emoji images to be
# embedded in the page as base64 PNG data URIs.
emoji_url = "http://www.unicode.org/emoji/charts/full-emoji-list.html"
emoji_modifiers_url = "http://www.unicode.org/emoji/charts/full-emoji-modifiers.html"

# execution only required once, takes a while...
download_emoji(emoji_url)
# download_emoji(emoji_modifiers_url) # optional

In [None]:
def plot_emojis():
    """Place emoji images at given coordinates on a 0-700 by 0-700 axes.

    Reads two names that are not defined in the visible part of this notebook
    and must exist when the function is called: ``data`` (indexed with ``"A"``
    and ``"B"`` for the x and y positions) and ``emoji_png`` (file names
    inside ``emoji_images/``).
    """
    x_values, y_values = data["A"], data["B"]

    _, axis = plt.subplots(figsize=(7, 7))
    axis.set_xlim(0, 700)
    axis.set_ylim(0, 700)

    # One image annotation per (file name, x, y) triple; positions are truncated to int.
    for file_name, x_pos, y_pos in zip(emoji_png, x_values, y_values):
        image_box = matplotlib.offsetbox.OffsetImage(plt.imread(f"emoji_images/{file_name}"), zoom=0.5)
        anchor = (int(x_pos), int(y_pos))
        axis.add_artist(matplotlib.offsetbox.AnnotationBbox(image_box, anchor, frameon=False))

    plt.title("Relative usage")
    plt.xlabel("A")
    plt.ylabel("B")

In [None]:
# Count every keyword in the concatenated original tweet text.
# NOTE(review): str.count counts case-sensitive, non-overlapping substrings, so
# "sun" is also counted inside "sunny" and capitalised spellings are missed --
# confirm this is the intended definition of an occurrence.
occurence = [text_original.count(k) for k in keywords_no_alts]

In [None]:
# Convert to arrays so a boolean mask can select the frequent keywords.
occurence = np.array(occurence)
keywords_no_alts = np.array(keywords_no_alts)
# Only keywords counted more often than this threshold are plotted.
min_occurence = 5000
mask_occurence = occurence > min_occurence
occurence_masked = occurence[mask_occurence]
keywords_masked = keywords_no_alts[mask_occurence]
prop = matplotlib.font_manager.FontProperties()
plt.rcParams["axes.unicode_minus"] = False  # These two lines need to be set manually
plt.rcParams["font.family"] = prop.get_family()
fontsize = 13
a2.plotting.utils_plotting.set_font(fontsize)
# One panel per entry; each entry toggles the logarithmic scale passed as log=[False, log].
logs = [False]
fig, axs = plt.subplots(len(logs), 1, figsize=(8, 5), constrained_layout=True, squeeze=False)
# squeeze=False keeps axs two-dimensional, so column 0 holds one axes per panel.
for ax, log in zip(axs[:, 0], logs):
    plot = a2.plotting.histograms.plot_bar(
        bin_centers=range(len(occurence_masked)),
        hist=occurence_masked,
        width_bars=1,
        xlim=None,
        ylim=None,
        ax=ax,
        log=[False, log],
        linear_thresh=None,
        label_x=None,
        label_y=None,
        vertical=False,
        alpha=1,
        font_size=fontsize,
        replace_x_labels_at=None,
        replace_x_labels_with=None,
        replace_y_labels_at=None,
        replace_y_labels_with=None,
    )
    print(keywords_masked)
    a2.plotting.histograms.annotate_histogram(ax, plot, keywords_masked, as_label="x", fontsize=fontsize)
    ax.set_xlim([-0.5, len(occurence_masked) - 0.5])
    a2.plotting.axes_utils.set_axes(ax=ax, label_y="Number of keyword occurrences", fontsize=fontsize)
# savefig does not create missing directories, so make sure the target exists.
FOLDER_FIGURES.mkdir(parents=True, exist_ok=True)
# The file name carries the threshold, so it stays in sync with min_occurence.
fig.savefig(FOLDER_FIGURES / f"occurence_keywords_min{min_occurence}.pdf")

In [None]:
keywords_masked[0]