# Reduces tweet text of large dataset 
- Includes workarounds to stay within the local memory limit (32 GB) when processing the large dataset (~3M tweets from the years 2017-2020)
- Weather bots are removed by dropping tweets from sources whose activity comes mostly from weather bots
- Tweets are normalized such that
    - emojis including keywords are replaced by their string description
    - remaining emojis are removed
    - model is case sensitive, so cases remain
    - only tweets with keywords after normalization are retained

In [None]:
# allows update of external libraries without need to reload package
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pathlib
import re
import os
import xarray
import logging

logging.basicConfig(level=logging.INFO)

import torch
import torch.nn.functional

import a2.dataset.load_dataset
import a2.dataset.utils_dataset
import a2.preprocess
import a2.utils
import a2.plotting

In [None]:
# Absolute data folder (presumably on the compute cluster — confirm); it is
# immediately overridden by the relative path assigned on the next statement.
FOLDER_DATA = "/p/project/deepacf/maelstrom/ehlert1/data/tweets/"
# Repository-relative data folder; this is the value that takes effect.
FOLDER_DATA = "../data/tweets/"
# Path of the `gb_2020_...` dataset file inside FOLDER_DATA.
FILE_DATA = FOLDER_DATA + "gb_2020_rain_sun_vocab_emojisfixed_location_precipitation.nc"
# Folder used below to build the file names of the histogram figures.
figure_path = pathlib.Path("../figures/data/gb_2017-2020_rain_sun_vocab_emojis/")

In [None]:
header = "🏔️ OR 🏔️ OR ☀️ OR ☀️ OR 🌞 OR ⛅ OR ⛈️ OR ⛈️ OR 🌤️ OR 🌤️ OR 🌥️ OR 🌥️ OR 🌦️ OR 🌦️ OR 🌧️ OR 🌧️ OR 🌨️ OR 🌨️ OR 🌩️ OR 🌩️ OR ☔ OR ⛄ OR blizzard OR cloudburst OR downpour OR drizzle OR flash flood OR flood OR flood stage OR forecast OR freezing rain OR hail OR ice storm OR lightning OR precipitation OR rain OR rain gauge OR rain shadow OR rainbands OR rain shower OR snow OR snow shower OR snowstorm OR sun OR sunny OR thunder OR thunderstorm"
# Split the " OR "-separated `header` string into the list of individual keywords and emojis.
keywords = header.split(" OR ")

## Load Data

In [None]:
np.unique(ds_raw.source)

In [None]:
ds_raw = a2.dataset.load_dataset.load_tweets_dataset(
    FOLDER_DATA + "2017_2020_tweets_rain_sun_vocab_emojis_locations_bba_Tp_era5.nc",
    raw=True,
)

In [None]:
# Store the author ids as integers on the "index" dimension.
ds_raw["author_id"] = (["index"], ds_raw["author_id"].values.astype(int))

## Filter out bots based on source of tweet frequently used for weather bots

In [None]:
# Tweet sources (clients) regarded as mainly human-operated.
sources_not_bots = [
    "Twitter for iPhone",
    "Twitter for Android",
    "Instagram",
    "Twitter Web Client",
    "Twitter for iPad",
    "Tweetbot for iΟS",
    "Untappd",
    "Foursquare",
    "Tweetbot for Mac",
    "TweetCaster for Android",
    "Twitter for Windows Phone",
    "Echofon",
    "Hootsuite",
]
# Tweet sources regarded as weather bots. The entries are joined with "|" and
# used as a regular expression further down, so they are regex fragments:
# characters such as "." and "+" keep their regex meaning, and literal
# parentheses have to be escaped.
sources_bots = [
    "pywws",
    "Sandaysoft Cumulus",
    "MeteoWare Plus+",
    "dlvr.it",
    "World Cities",
    "SunTracker",
    "BordersWeatherWeb",
    "Wolfson College Solar Panels",
    "wezzoo",
    "Trendsmap Alerting",
    "Weather Clock",
    "Rude Weather App",
    "Bowvie Weather",
    # Raw string: "\(" in a normal string literal is an invalid escape sequence.
    r"Make \(formerly Integromat\)",
    "Duukkis",
    "Weather Station",
    "NPTOHC API",
    "Beep!Beep!",
]

In [None]:
%%time
# Restrict the raw dataset to tweets whose source matches one of the bot
# patterns and hand it to the project's group-by helper for the "source" field.
bot_source_pattern = "|".join(sources_bots)
group_by_source = a2.dataset.utils_dataset.print_tweet_groupby(
    ds_raw.where(ds_raw.source.str.contains(bot_source_pattern), drop=True),
    "source",
    n_groups=10,
)

In [None]:
%time
ds_no_bots = ds_raw.where(~ds_raw.source.str.contains("|".join(sources_bots)), drop=True)
ds_no_bots = a2.dataset.load_dataset.reset_index_coordinate(ds_no_bots)

In [None]:
# Write the bot-free dataset to the data folder.
path_no_bots = FOLDER_DATA + "2017_2020_tweets_rain_sun_vocab_emojis_locations_bba_Tp_era5_no_bots.nc"
a2.dataset.load_dataset.save_dataset(ds_no_bots, path_no_bots)

## Check for bot terms

In [None]:
# Terms searched for below to check whether bot tweets remain in the dataset.
bot_terms = [
    "Hum", "Press", "Barometer", "Pressure", "Humidity",
    "Baro", "Humid", "Hum.", "Pressure:",
]

In [None]:
# Show the tweets that still contain one of the bot terms between word
# boundaries (case-insensitive). The terms are plain text, so they are escaped
# before being joined: an unescaped "Hum." would also match unrelated words
# such as "hump".
bot_terms_pattern = r"\b(?:" + "|".join(re.escape(term) for term in bot_terms) + r")\b"
ds_no_bots.where(
    ds_no_bots.text.str.contains(bot_terms_pattern, flags=re.IGNORECASE),
    drop=True,
)

In [None]:
%%time
# Reload the bot-free dataset from the file written by the save cell above
# (same folder and file name), rather than a differently named file in the
# current working directory.
ds_no_bots = a2.dataset.load_dataset.load_tweets_dataset(
    FOLDER_DATA + "2017_2020_tweets_rain_sun_vocab_emojis_locations_bba_Tp_era5_no_bots.nc",
    raw=True,
)

In [None]:
%%time
# Run the group-by helper on the "source" field of the bot-free dataset.
group_by_source = a2.dataset.utils_dataset.print_tweet_groupby(ds_no_bots, "source", n_groups=10)

In [None]:
%%time
# NOTE: very expensive on the full dataset.
group_by_author = a2.dataset.utils_dataset.print_tweet_groupby(
    ds_no_bots,
    "author_id",
    n_groups=50,
)

In [None]:
%%time
# Same per-author grouping as above, executed with xarray's `use_flox` option switched on.
with xarray.set_options(use_flox=True):
    group_by_author_flox = a2.dataset.utils_dataset.print_tweet_groupby(
        ds_no_bots,
        "author_id",
        n_groups=50,
    )

## Reload dataset

In [None]:
%%time
# Load the bot-free dataset from the data folder.
path_no_bots = FOLDER_DATA + "2017_2020_tweets_rain_sun_vocab_emojis_locations_bba_Tp_era5_no_bots.nc"
ds_no_bots = a2.dataset.load_dataset.load_tweets_dataset(path_no_bots, raw=True)

In [None]:
# Overwrite `text` with a fresh copy of the original tweet text.
ds_no_bots["text"] = (["index"], ds_no_bots["text_original"].values.copy())

In [None]:
# Ask the normalization module for the emojis and the emoji dictionary that
# belong to the "keywords" scope, based on the keyword list defined above.
emojis, emojis_dic = a2.preprocess.normalize_text.get_emojis_and_dictionary("keywords", keywords=keywords)

In [None]:
# Keep only the tweets whose text contains at least one key of `emojis_dic`.
emoji_pattern = "|".join(emojis_dic.keys())
contains_emoji = ds_no_bots.text.str.contains(emoji_pattern)
ds_emojis = ds_no_bots.where(contains_emoji, drop=True)
ds_emojis

## Combine filtering and normalization

In [None]:
# Store a copy of the current text in `text_original` before normalizing.
ds_no_bots["text_original"] = (["index"], ds_no_bots["text"].values.copy())

In [None]:
# One-call variant: let the project helper normalize and filter the dataset.
# (The section "Thorough walkthrough" below performs comparable steps by hand.)
ds_no_bots_nf = a2.preprocess.normalize_text.normalize_filter_dataset(ds_no_bots)

In [None]:
ds_no_bots_nf

In [None]:
# Save the normalized and filtered dataset. The folder comes from FOLDER_DATA
# (as for the other files of this notebook) instead of a second hard-coded
# copy of the same path.
a2.dataset.load_dataset.save_dataset(
    ds_no_bots_nf,
    FOLDER_DATA + "2017_2020_tweets_rain_sun_vocab_emojis_locations_bba_Tp_era5_no_bots_normalized_filtered.nc",
)

## Thorough walkthrough of filtering and normalization

In [None]:
%%time
# Build a normalizer for the keyword list and log its emoji dictionary.
normalizer = a2.preprocess.normalize_text.Normalizer(keywords=keywords)
logging.info(normalizer.emojis_dic)

# Options forwarded to every `normalizer.normalize` call.
normalize_options = {
    "ignore_non_ascii": True,
    "remove_punctuations": "keep_stopwords",
    "replace_keyword_emojis": False,
}
# Normalize all (text, source) pairs through the project's parallelization helper.
normalized_text = a2.utils.utils.parallelize(
    function=normalizer.normalize,
    args=zip(ds_no_bots.text.values, ds_no_bots.source.values),
    kwargs_as_dict=normalize_options,
)

In [None]:
# Keep a copy of the current text before it is overwritten below.
ds_no_bots["text_original"] = (["index"], ds_no_bots.text.values.copy())
index_max = ds_no_bots.index.shape[0]
# 24 evenly spaced integer edges, i.e. the assignment below runs in 23 chunks.
indices = np.linspace(0, index_max, 24, dtype=int)

# Write the normalized text back chunk by chunk (presumably to limit peak
# memory — confirm). Assumes the index values run 0..N-1 so that labels and
# list positions coincide — TODO confirm.
for index_start, index_end in zip(indices[:-1], indices[1:]):
    # Both bounds are inclusive, so neighbouring chunks share their boundary
    # row; the `index_end + 1` slice below matches the inclusive upper bound.
    mask = (ds_no_bots.index >= index_start) & (ds_no_bots.index <= index_end)
    ds_no_bots["text"].loc[mask] = normalized_text[index_start : index_end + 1]

In [None]:
# Rebuild the emoji list/dictionary for the "keywords" scope, this time with
# the `add_space_name` option enabled.
emojis, emojis_dic = a2.preprocess.normalize_text.get_emojis_and_dictionary(
    scope="keywords", keywords=keywords, add_space_name=True
)

In [None]:
ds_no_bots

In [None]:
# Keep the tweets whose original (un-normalized) text contains one of the
# normalizer's emoji keys.
normalizer_emoji_pattern = "|".join(normalizer.emojis_dic.keys())
had_emoji = ds_no_bots.text_original.str.contains(normalizer_emoji_pattern)
ds_emojis = ds_no_bots.where(had_emoji, drop=True)

In [None]:
# Print, for 10 random Instagram tweets with emojis: the original text, the
# original text after `remove_instagram_atsign`, and the normalized text.
ds = ds_emojis.where(ds_emojis.source.str.contains("Instagram"), drop=True)
ds = a2.dataset.load_dataset.reset_index_coordinate(ds)
for index in a2.utils.utils.get_random_indices(10, ds.index.shape[0]):
    original_text = ds.text_original.values[index]
    print(f"index: {index}")
    print("-----------")
    print(original_text)
    print("___________")
    print(a2.preprocess.normalize_text.remove_instagram_atsign(original_text))
    print("___________")
    print(ds.text.values[index])
    print("___________")

In [None]:
# Tweets whose original text contains an "@" character.
ds_at_sign = ds_no_bots.where(ds_no_bots.text_original.str.contains("@"), drop=True)

In [None]:
# Print source, original text and normalized text for 20 random "@" tweets.
ds_at_sign = a2.dataset.load_dataset.reset_index_coordinate(ds_at_sign)
n_at_sign = ds_at_sign.index.shape[0]
for index in a2.utils.utils.get_random_indices(20, n_at_sign):
    print(f"index: {index}")
    print(ds_at_sign.source.values[index])
    print("-----------")
    print(ds_at_sign.text_original.values[index])
    print("---------------->>>")
    print(ds_at_sign.text.values[index])

## Include only tweets containing at least one keyword (including converted hashtags/emojis)

In [None]:
# Tweets whose normalized text matches none of the keywords (case-insensitive).
keyword_pattern = "|".join(keywords)
has_keyword = ds_no_bots.text.str.contains(keyword_pattern, flags=re.IGNORECASE)
ds_no_keywords = ds_no_bots.where(~has_keyword, drop=True)
print(f"{ds_no_keywords.index.shape[0]} Tweets have no keyword/emoji after normalizing!")

In [None]:
# Print normalized and original text of 10 random tweets without any keyword.
ds_no_keywords = a2.dataset.load_dataset.reset_index_coordinate(ds_no_keywords)
n_no_keywords = ds_no_keywords.index.shape[0]
for index in a2.utils.utils.get_random_indices(10, n_no_keywords):
    print("-----------")
    print(ds_no_keywords.text.values[index])
    print(ds_no_keywords.text_original.values[index])

In [None]:
# Keep only the tweets whose normalized text matches at least one keyword
# (case-insensitive).
keyword_pattern = "|".join(keywords)
has_keyword = ds_no_bots.text.str.contains(keyword_pattern, flags=re.IGNORECASE)
ds_keywords = ds_no_bots.where(has_keyword, drop=True)

In [None]:
# Reset the index coordinate of the keyword-only dataset with the project helper.
ds_keywords = a2.dataset.load_dataset.reset_index_coordinate(ds_keywords)

In [None]:
# Save the keyword-only dataset together with a description of the processing steps.
processing_notes = "normalized tweets, kept keywords in hashtags, converted emojis to text, only texts with at least on keyword remain, remove_punctuations=keep_stopwords"
a2.dataset.load_dataset.save_dataset(
    ds_keywords,
    "2017_2020_tweets_rain_sun_vocab_emojis_locations_bba_precipitation_no_bots_normalized.nc",
    add_attributes=processing_notes,
)

## Only keep tweets with unique text

In [None]:
# `indices`: position of the first occurrence of every distinct text;
# `indices_inverse`: for every tweet, the position of its text among the distinct texts.
_, indices, indices_inverse = np.unique(ds_keywords.text.values, return_index=True, return_inverse=True)

In [None]:
# `indices` holds array positions returned by np.unique, so select by position
# (isel) rather than by index label (sel); this stays correct even when the
# index labels are not exactly 0..N-1.
ds_unique = ds_keywords.isel(index=indices)

In [None]:
# Reset the index coordinate of the unique-text dataset with the project helper.
ds_unique = a2.dataset.load_dataset.reset_index_coordinate(ds_unique)

In [None]:
plt.close("all")
%matplotlib inline

# Count how often every keyword occurs (as a case-sensitive substring) in the
# concatenation of all unique tweet texts.
text = " ".join(ds_unique.text.values)
keywords_extended = ["☀"] + keywords
occurence = [text.count(keyword) for keyword in keywords_extended]

import matplotlib

# NOTE: a custom emoji font (Symbola via the mplcairo backend) used to be
# configured here; that setup is disabled, so the default font is used.
fig, axs = plt.subplots(2, 1, figsize=(20, 10), constrained_layout=True)
labels = [str(keyword) for keyword in keywords_extended]
bar_positions = np.arange(len(occurence))
# First panel: logarithmic count axis; second panel: linear count axis.
for ax, log in zip(axs, (True, False)):
    plot = ax.bar(bar_positions, occurence)
    # Write each keyword vertically just above its bar.
    for rect1, label in zip(plot, labels):
        anchor = (rect1.get_x() + rect1.get_width() / 2, rect1.get_height() + 5)
        ax.annotate(label, anchor, ha="center", va="bottom", fontsize=16, rotation=90)
    ax.tick_params(axis="x", labelrotation=90)
    ax.set_xlabel("keywords")
    ax.set_ylabel("counts")
    if log:
        ax.set_yscale("log")
fig.savefig("word_count_tokenized.pdf")

In [None]:
# Histogram of the `tp` values of the unique tweets, saved into the figure folder.
filename = figure_path / "tweets_reduced_2017-2020_tp_histogram.pdf"
tp_values = ds_unique.tp.values
a2.plotting.histograms.plot_histogram(
    tp_values,
    log=["symlog", "log"],
    linear_thresh=1e-9,
    n_bins=100,
    label_x="tp",
    filename=filename,
)

In [None]:
# 2D histogram of `tp` against the thresholded flag `tp > 1e-8`.
# Uses its own file name so it does not overwrite the 1D `tp` histogram,
# which is saved as "tweets_reduced_2017-2020_tp_histogram.pdf".
filename = figure_path / "tweets_reduced_2017-2020_tp_vs_tp_threshold_histogram_2d.pdf"
a2.plotting.histograms.plot_histogram_2d(
    ds_unique.tp.values,
    ds_unique.tp.values > 1e-8,
    log=["symlog", False],
    linear_thresh=1e-9,
    n_bins=[10, 2],
    label_x="tp",
    filename=filename,
);

In [None]:
# Drop the tweets with negative `tp`. Written as "not (tp < 0)", so rows for
# which the comparison is False for another reason (e.g. NaN) are kept as well.
ds_no_negative_tp = ds_unique.where(~(ds_unique.tp < 0), drop=True)

In [None]:
# 2D histogram of `tp` against the stored `raining` field.
# Uses its own file name so it does not overwrite the other `tp` histograms,
# which were previously all saved as "tweets_reduced_2017-2020_tp_histogram.pdf".
filename = figure_path / "tweets_reduced_2017-2020_tp_vs_raining_histogram_2d.pdf"
a2.plotting.histograms.plot_histogram_2d(
    ds_unique.tp.values,
    ds_unique.raining.values,
    log=["symlog", False],
    linear_thresh=1e-9,
    n_bins=[10, 2],
    label_x="tp",
    filename=filename,
);

In [None]:
# Recompute the `raining` field as an integer flag: 1 where tp exceeds 1e-8, else 0.
is_raining = ds_no_negative_tp.tp.values > 1e-8
ds_no_negative_tp["raining"] = (["index"], is_raining.astype(int))

In [None]:
ds_no_negative_tp.raining.plot.hist()

In [None]:
print(f"in total {ds_no_negative_tp.index.shape[0]} Tweets after reduction")

In [None]:
# Save the reduced dataset into the data folder. It used to be written to the
# current working directory, while the cell that reloads this file reads it
# from the data folder ("../data/tweets/").
a2.dataset.load_dataset.save_dataset(
    ds_no_negative_tp,
    FOLDER_DATA + "2017_2020_tweets_rain_sun_vocab_emojis_locations_bba_precipitation_no_bots_normalized_unique.nc",
    add_attributes=", only unique Tweet texts, remove tweets with negative TP",
)

In [None]:
# Reload the reduced dataset; the folder comes from FOLDER_DATA instead of a
# second hard-coded copy of the same path.
ds_no_negative_tp = a2.dataset.load_dataset.load_tweets_dataset(
    FOLDER_DATA + "2017_2020_tweets_rain_sun_vocab_emojis_locations_bba_precipitation_no_bots_normalized_unique.nc",
    raw=True,
)

In [None]:
# Boolean flag derived from `tp_cum`: True where its absolute value exceeds 1e-8.
tp_cum_magnitude = np.abs(ds_no_negative_tp["tp_cum"].values)
ds_no_negative_tp["raining_old"] = ("index", tp_cum_magnitude > 1e-8)

In [None]:
# Boolean flag derived from `tp`: True where its absolute value exceeds 1e-8.
tp_magnitude = np.abs(ds_no_negative_tp["tp"].values)
ds_no_negative_tp["raining"] = ("index", tp_magnitude > 1e-8)

In [None]:
# Compare the two rain flags (derived from `tp` and from `tp_cum`) in a 2x2 histogram.
x = ds_no_negative_tp.raining.values
y = ds_no_negative_tp.raining_old.values
a2.plotting.histograms.plot_histogram_2d(
    x,
    y,
    log=False,
    # Same bin edges on both axes: one bin for False (0), one for True (1).
    bins=[[0, 0.5, 1]] * 2,
    label_x="raining",
    label_y="raining_old",
)

In [None]:
# Pattern for one or more characters in the code point range U+2460..U+24FF;
# show the tweets whose original text contains such characters.
alpha_numerics = re.compile("[\U00002460-\U000024FF]+", re.UNICODE)
has_enclosed_chars = ds_no_negative_tp.text_original.str.contains(alpha_numerics)
ds_no_negative_tp.where(has_enclosed_chars, drop=True)

In [None]:
# Write the dataset with the updated rain flags back to the data folder; the
# folder comes from FOLDER_DATA instead of a second hard-coded copy of the same path.
a2.dataset.load_dataset.save_dataset(
    ds_no_negative_tp,
    FOLDER_DATA + "2017_2020_tweets_rain_sun_vocab_emojis_locations_bba_precipitation_no_bots_normalized_unique.nc",
)

In [None]:
# Boolean mask: True for tweets whose source is exactly "Instagram".
m = ds_no_negative_tp.source == "Instagram"

In [None]:
m

In [None]:
# Show the subset of the dataset selected by the boolean Instagram mask `m`.
ds_no_negative_tp.sel(index=m)

In [None]:
# Run the project's group-by helper on the `raining` field, requesting 20
# samples and the listed text/time fields for printing.
grouped_by_raining = a2.dataset.utils_dataset.print_tweet_groupby(
    ds_no_negative_tp,
    "raining",
    n_sample=20,
    fields_to_print=["text", "text_original", "created_at"],
)

## Dataset completely without emojis

In [None]:
%%time
# Load the bot-free dataset from the file this notebook writes into the data
# folder, rather than a differently named file in the current working directory.
ds_no_bots = a2.dataset.load_dataset.load_tweets_dataset(
    FOLDER_DATA + "2017_2020_tweets_rain_sun_vocab_emojis_locations_bba_Tp_era5_no_bots.nc",
    raw=True,
)

In [None]:
ds_no_bots