# Code used to prepare Exercises for Bootcamp (Just for reference)

In [None]:
# Automatically reload imported modules before each cell runs, so edits to external libraries take effect without restarting the kernel
%load_ext autoreload
%autoreload 2

In [None]:
import sys
import os
import pathlib
import datetime
import numpy as np
import xarray
import matplotlib
import a2.plotting

sys.path.append("../scripts")
import normalize_text_bootcamp
import dataset_bootcamp
import utils_bootcamp
import plotting

import re

In [None]:
# Root folder of the bootcamp data; the sub-folders below are derived from it.
# All folder constants end with "/" so file names can be appended directly.
FOLDER_DATA = "../../data/bootcamp2023/"
FOLDER_TWEETS = f"{FOLDER_DATA}tweets/"
FOLDER_PRECIPITATION = f"{FOLDER_DATA}precipitation/"

# Input netCDF file holding the tweets that the exports in this notebook start from.
FILE_TWEETS = (
    f"{FOLDER_TWEETS}2017_2020_tweets_rain_sun_vocab_emojis_locations_bba_Tp_era5_no_bots_normalized_filtered_weather_stations_fix_predicted_simpledeberta_radar.nc"
)


def load_tweets(folder):
    """Load a netCDF dataset into memory and reset its index coordinate.

    Args:
        folder: Despite its name, this is the path handed straight to
            ``xarray.load_dataset``; the callers in this notebook pass the
            path of a ``.nc`` file.

    Returns:
        The loaded dataset after
        ``dataset_bootcamp.reset_index_coordinate`` has been applied to it.
    """
    return dataset_bootcamp.reset_index_coordinate(xarray.load_dataset(folder))

In [None]:
# Load the full tweets dataset; load_tweets also resets its index coordinate.
ds = load_tweets(FILE_TWEETS)

In [None]:
# Show the dataset summary.
ds

In [None]:
# Sanity check of the relation between "tp_h" and "tp_h_mm" using exact
# floating-point comparison.
# NOTE(review): the first expression scales tp_h_mm by 1e3 while the second
# scales it by 1e-3, and a notebook cell only displays its last expression,
# so the first line has no visible effect -- confirm which factor was meant.
ds["tp_h"][ds["tp_h"] == ds["tp_h_mm"] * 1e3]
ds["tp_h_mm"][ds["tp_h"] != ds["tp_h_mm"] * 1e-3]

In [None]:
# Model precipitation: store "tp_h" scaled by 1e3 as "tp_h_mm" and label every
# entry with tp_h_mm >= 0.1 as raining (1), everything else as not raining (0).
ds["tp_h_mm"] = (["index"], ds["tp_h"].values * 1e3)
ds["raining"] = (["index"], (ds["tp_h_mm"].values >= 0.1).astype(int))

# Weather stations: same scaling and labelling.
# NOTE(review): the 0.1 threshold is applied to the unscaled "station_tp_mm",
# not to the scaled "tp_mm_station" -- confirm the unit of "station_tp_mm".
ds["tp_mm_station"] = (["index"], ds["station_tp_mm"].values * 1e3)
ds["raining_station"] = (["index"], (ds["station_tp_mm"].values >= 0.1).astype(int))

# Radar: "tp_mm_radar" is thresholded as is.
ds["raining_radar"] = (["index"], (ds["tp_mm_radar"].values >= 0.1).astype(int))

In [None]:
# Two-bin histogram of the "raining" label (derived from tp_h_mm above).
ds["raining"].plot.hist(bins=2);

In [None]:
# Two-bin histogram of the station-based label.
ds["raining_station"].plot.hist(bins=2);

In [None]:
# Two-bin histogram of the radar-based label.
ds["raining_radar"].plot.hist(bins=2);

In [None]:
# 2x2 annotated histogram comparing the radar label with the "raining" label.
a2.plotting.histograms.plot_histogram_2d("raining_radar", "raining", df=ds, n_bins=2, annotate=True);

In [None]:
# 2x2 annotated histogram comparing the station label with the "raining" label.
a2.plotting.histograms.plot_histogram_2d("raining_station", "raining", df=ds, n_bins=2, annotate=True);

In [None]:
# Alphabetical list of the variables currently in the dataset.
sorted(ds.keys())

In [None]:
# Remove variables that are not needed in the exported exercise datasets and
# reset the index coordinate afterwards.
# errors="ignore" skips names that are not present, so the cell can be re-run.
# "created_at_h" and "time_half" were commented out of this list, i.e. they
# are kept in the dataset.
ds = dataset_bootcamp.reset_index_coordinate(
    ds.drop_vars(
        [
            "withheld.copyright",
            "withheld.country_codes",
            "withheld.scope",
            "time",
            "reply_settings",
            "prediction",
            "prediction_probability_not_raining",
            "prediction_probability_raining",
            "created_at_rounded_5",
            "difference_tp",
            "inconsistent_rain",
            "station_tp_mm",
            "time_radar",
            "time_radar_int",
            "x_ngt",
            "x_ngt_rounded",
            "y_ngt",
            "y_ngt_rounded",
            "bounding_box",
        ],
        errors="ignore",
    )
)
ds

In [None]:
sorted(list(ds.keys()))

In [None]:
ds["text_original"] = (["index"], ds.text.values.copy())

In [None]:
def generate_datasets(ds, subfix):
    """Write the dataset and two time-restricted subsets of it to netCDF.

    Three files are written into ``FOLDER_TWEETS``:

    * ``tweets_2017_{subfix}.nc`` -- entries with ``created_at`` in 2017
    * ``tweets_2017_01_{subfix}.nc`` -- entries with ``created_at`` in January 2017
    * ``tweets_2017-2020_{subfix}.nc`` -- ``ds`` as passed in (the file name is
      fixed and does not depend on the time span actually covered by ``ds``)

    Args:
        ds: Dataset with a ``created_at`` variable that can be compared to
            ``np.datetime64`` values.
        subfix: Suffix inserted into the three file names.
    """

    def _save_period(start, end, label):
        # Half-open interval [start, end): an entry stamped exactly at `start`
        # belongs to the period (a strict `>` would silently drop it), while
        # one stamped exactly at `end` belongs to the following period.
        in_period = (ds.created_at >= np.datetime64(start)) & (ds.created_at < np.datetime64(end))
        ds.where(in_period, drop=True).to_netcdf(FOLDER_TWEETS + f"tweets_{label}_{subfix}.nc")

    _save_period("2017-01-01T00:00:00", "2018-01-01T00:00:00", "2017")
    _save_period("2017-01-01T00:00:00", "2017-02-01T00:00:00", "2017_01")
    ds.to_netcdf(FOLDER_TWEETS + f"tweets_2017-2020_{subfix}.nc")

In [None]:
# Disabled alternative: export based on normalize_text_dataset instead of
# normalize_filter_dataset (judging by the names: normalization without filtering).
# ds_norm = normalize_text_bootcamp.normalize_text_dataset(ds)
# generate_datasets(ds_norm, "era5_normed")

In [None]:
# Apply normalize_filter_dataset from normalize_text_bootcamp to the dataset.
ds_norm_filtered = normalize_text_bootcamp.normalize_filter_dataset(ds)

In [None]:
# Write the full, 2017 and January-2017 files with suffix "era5_normed_filtered".
generate_datasets(ds_norm_filtered, "era5_normed_filtered")

### Check small dataset

In [None]:
ds_tmp = load_tweets(FOLDER_TWEETS + "tweets_2017_01_era5_normed_filtered.nc")

In [None]:
sorted(list(ds_tmp.keys()))

### **Precipitation**

In [None]:
# Load the precipitation dataset (2017-2020 according to the file name) into memory.
ds_p = xarray.load_dataset(FOLDER_PRECIPITATION + "ds_prec_era5_uk_2017-2020.nc")

In [None]:
# Peek at the first tweets. `.sel` slices by index label, so slice(10) selects
# labels up to 10 (NOTE(review): label slices are usually end-inclusive -- confirm).
ds_tmp.sel(index=slice(10))

### Precipitation map at location of Tweets

In [None]:
# Plot precipitation maps from ds_p for the tweets with index labels 10 to 20,
# using each tweet's "time_half" value as the time key.
a2.plotting.weather_maps.plot_precipiation_map(ds_p, ds_tmp.sel(index=slice(10, 20)), key_time="time_half");

In [None]:
def split_dataset_by_year(
    ds,
    prefix="ds_precipitation_",
    years=(2017, 2021),
    folder=FOLDER_PRECIPITATION,
):
    """Write one netCDF file per calendar year of ``ds``.

    Each file is named ``{folder}{prefix}{year}.nc``.

    Args:
        ds: Dataset with a ``time`` coordinate that can be compared to
            ``np.datetime64`` values.
        prefix: File name prefix; the year and ``.nc`` are appended to it.
        years: Arguments forwarded to ``np.arange``, i.e. ``(start, stop)``
            with ``stop`` excluded; the default covers 2017 through 2020.
        folder: Output folder. It is concatenated with the file name as is,
            so it has to end with a path separator.
    """
    for year in np.arange(*years):
        year_start = np.datetime64(f"{year}-01-01T00:00:00")
        next_year_start = np.datetime64(f"{year + 1}-01-01T00:00:00")
        # Half-open interval [year_start, next_year_start): a time step exactly
        # at New Year 00:00 is assigned to the year it starts instead of being
        # dropped from every file, as happened with a strict `>`.
        in_year = (ds.time >= year_start) & (ds.time < next_year_start)
        ds.where(in_year, drop=True).to_netcdf(f"{folder}{prefix}{year}.nc")


# Write one file per year with the default settings (years 2017 to 2020,
# prefix "ds_precipitation_", folder FOLDER_PRECIPITATION), then show ds_p.
split_dataset_by_year(ds_p)
ds_p

In [None]:
# NOTE(review): despite the name `ds_17`, this reloads the full 2017-2020 file
# rather than the per-year file "ds_precipitation_2017.nc" written above --
# confirm which file was intended.
ds_17 = xarray.load_dataset(FOLDER_PRECIPITATION + "ds_prec_era5_uk_2017-2020.nc")

## Filtered (normalized) dataset version

In [None]:
# Load the normalized 2017-2020 tweets. Note that this section reads from
# "../../data/tweets/", not from the bootcamp folder FOLDER_TWEETS.
ds_tweets_full = load_tweets("../../data/tweets/tweets_2017-2020_normalized.nc")

In [None]:
# Apply normalize_filter_dataset to the full dataset.
ds_tweets_fn = normalize_text_bootcamp.normalize_filter_dataset(ds_tweets_full)

In [None]:
# Save the result next to the input file with a "_filtered" suffix.
ds_tweets_fn.to_netcdf("../../data/tweets/tweets_2017-2020_normalized_filtered.nc")

In [None]:
# Show the filtered dataset.
ds_tweets_fn

In [None]:
# Small dataset: repeat the same steps for the 2017-only file.

In [None]:
# Load the normalized 2017 tweets.
ds_tweets_17 = load_tweets("../../data/tweets/tweets_2017_normalized.nc")

In [None]:
# Apply normalize_filter_dataset to the 2017 dataset.
ds_tweets_17_fn = normalize_text_bootcamp.normalize_filter_dataset(ds_tweets_17)

In [None]:
# Save the result next to the input file with a "_filtered" suffix.
ds_tweets_17_fn.to_netcdf("../../data/tweets/tweets_2017_normalized_filtered.nc")

In [None]:
# Show the filtered 2017 dataset.
ds_tweets_17_fn

In [None]:
def drop_var_save(drop_variables=("text",)):
    """Remove variables from the saved tweet files, overwriting each file in place.

    The four normalized / filtered tweet files in ``../../data/tweets/`` are
    loaded one after another, the requested variables are removed, and the
    result is written back to the same path.

    Args:
        drop_variables: Iterable of variable names to remove. Names that are
            not present in a file are skipped.
    """
    # Lower-case local name so the module-level FOLDER_TWEETS (which points to
    # the bootcamp folder) is not shadowed.
    folder_tweets = "../../data/tweets/"
    filenames = (
        "tweets_2017-2020_normalized_filtered.nc",
        "tweets_2017-2020_normalized.nc",
        "tweets_2017_normalized.nc",
        "tweets_2017_normalized_filtered.nc",
    )
    for filename in filenames:
        path = folder_tweets + filename
        ds_tweets = load_tweets(path)
        # drop_vars returns a new dataset instead of modifying in place, so the
        # result has to be re-bound. Previously the return value was discarded
        # and every file was rewritten unchanged.
        ds_tweets = ds_tweets.drop_vars(list(drop_variables), errors="ignore")
        ds_tweets.to_netcdf(path)


# Run drop_var_save with its default arguments; every tweet file listed inside
# the function is loaded and written back to the same path.
drop_var_save()

## ERA5 dataset preprocess

In [None]:
# Plot the "tp" field of ds_p at the single time step 2020-01-01 02:30.
# Colours use a logarithmic scale from 1e-6 to 1e-3 with the "ocean_r" colormap;
# the axes are limited to x in [-9, 3] and y in [49, 61]
# (presumably longitude / latitude -- confirm against the dataset's coordinates).
ds_p.sel(time=np.datetime64("2020-01-01T02:30:00.000000000")).tp.plot(
    xlim=[-9, 3],
    ylim=[49, 61],
    norm=matplotlib.colors.LogNorm(vmax=1e-3, vmin=1e-6),
    cmap="ocean_r",
)