# Code used to prepare Exercises for Bootcamp (Just for reference)

In [None]:
# allows update of external libraries without need to reload package
%load_ext autoreload
%autoreload 2

In [None]:
import sys
import os
import pathlib
import datetime
import numpy as np
import xarray
import matplotlib
import a2.plotting

sys.path.append("../scripts")
import normalize_text_bootcamp
import dataset_bootcamp
import utils_bootcamp
import plotting

import re

In [None]:
# Root data folder (relative path). The trailing slash matters because all
# paths below are built by plain string concatenation.
FOLDER_DATA = "../../data/bootcamp2023/"
# Sub-folders for the tweets datasets and the precipitation datasets.
FOLDER_TWEETS = FOLDER_DATA + "tweets/"
FOLDER_PRECIPITATION = FOLDER_DATA + "precipitation/"

# Path of the tweets dataset file inside FOLDER_TWEETS.
FILE_TWEETS = (
    FOLDER_TWEETS
    + "2017_2020_tweets_rain_sun_vocab_emojis_locations_bba_Tp_era5_no_bots_normalized_filtered_weather_stations_fix_predicted_simpledeberta_radar.nc"
)


def load_tweets(folder):
    """Load a tweets dataset with xarray and reset its index coordinate.

    Parameters
    ----------
    folder : str
        Path passed unchanged to ``xarray.load_dataset``. Despite the
        parameter name, the callers in this notebook pass the path of a
        ``.nc`` file, not a directory.

    Returns
    -------
    The result of ``dataset_bootcamp.reset_index_coordinate`` applied to the
    loaded dataset.
    """
    loaded = xarray.load_dataset(folder)
    return dataset_bootcamp.reset_index_coordinate(loaded)


def save_tweets(ds, filename):
    """Reset the index coordinate of ``ds`` and write the result to ``filename``.

    Parameters
    ----------
    ds :
        Dataset to store; it is first passed through
        ``dataset_bootcamp.reset_index_coordinate``.
    filename : str
        Target path handed to ``to_netcdf``.
    """
    dataset_bootcamp.reset_index_coordinate(ds).to_netcdf(filename)

In [None]:
ds = load_tweets(FILE_TWEETS)

In [None]:
# Keep only strictly positive `tp_h` values; every other entry is set to 0.
# NaN entries also end up as 0, because `NaN > 0` evaluates to False.
tp = ds.tp_h.values
tp_cleaned = np.where(tp > 0, tp, np.zeros_like(tp))
ds["tp_h"] = (["index"], tp_cleaned)

In [None]:
# Threshold applied to `tp_h_mm` to decide whether a tweet counts as "raining".
tp_mm_rain_thresh = 6e-3

# `tp_h` scaled by 1e3 (presumably m -> mm; confirm the units of `tp_h`),
# then turned into a 0/1 flag. Comparisons with NaN are False, so NaN -> 0.
ds["tp_h_mm"] = (["index"], ds["tp_h"].values * 1e3)
ds["raining"] = (["index"], (ds["tp_h_mm"].values >= tp_mm_rain_thresh).astype(int))

# NOTE(review): `station_tp_mm` is multiplied by 1e3 here although its name
# suggests it is already in mm, and the flag below thresholds the *unscaled*
# values at 0.1 -- confirm which units are intended for `tp_mm_station`.
ds["tp_mm_station"] = (["index"], ds["station_tp_mm"].values * 1e3)
ds["raining_station"] = (["index"], (ds["station_tp_mm"].values >= 0.1).astype(int))

# Radar-based flag, using the same 0.1 threshold as the station flag.
ds["raining_radar"] = (["index"], (ds["tp_mm_radar"].values >= 0.1).astype(int))

In [None]:
a2.plotting.histograms.plot_histogram(
    ds.tp_h.values[~np.isnan(ds.tp_h.values)], log=["symlog", "log"], symlog_linear_threshold=1e-8
)

In [None]:
ds["raining"].plot.hist(bins=2);

In [None]:
ds["raining_station"].plot.hist(bins=2);

In [None]:
ds["raining_radar"].plot.hist(bins=2);

In [None]:
a2.plotting.histograms.plot_histogram_2d("raining_radar", "raining", df=ds, n_bins=2, annotate=True);

In [None]:
a2.plotting.analysis.plot_confusion_matrix(
    truth=ds.raining.values, prediction=ds.raining_station.values, normalize="all"
);

In [None]:
a2.plotting.histograms.plot_histogram_2d("raining_station", "raining", df=ds, n_bins=2, annotate=True);

In [None]:
# Alphabetically sorted keys of the dataset.
sorted(ds.keys())

In [None]:
# Drop the variables that are not shipped with the exercise datasets, then
# reset the index coordinate. `errors="ignore"` makes this cell tolerant of
# variables that are already missing, so it can be re-run safely.
ds = dataset_bootcamp.reset_index_coordinate(
    ds.drop_vars(
        [
            "withheld.copyright",
            "withheld.country_codes",
            "withheld.scope",
            "time",
            "reply_settings",
            "prediction",
            "prediction_probability_not_raining",
            "prediction_probability_raining",
            # "created_at_h",
            "created_at_rounded_5",
            "difference_tp",
            "inconsistent_rain",
            "station_tp_mm",
            "time_radar",
            "time_radar_int",
            # "time_half",
            "x_ngt",
            "x_ngt_rounded",
            "y_ngt",
            "y_ngt_rounded",
            "bounding_box",
        ],
        errors="ignore",
    )
)
ds

In [None]:
# Alphabetically sorted keys of the dataset.
sorted(ds.keys())

In [None]:
# Store a copy of the current `text` values under `text_original`
# (presumably so the raw tweet text is still available after the text
# normalization applied further down -- confirm).
ds["text_original"] = (["index"], ds.text.values.copy())

In [None]:
def generate_datasets(ds, subfix):
    """Write the full tweets dataset plus a 2017 and a January-2017 subset.

    Three files are written to ``FOLDER_TWEETS`` via ``save_tweets``:
    ``tweets_2017_01_{subfix}.nc``, ``tweets_2017_{subfix}.nc`` and
    ``tweets_2017-2020_{subfix}.nc`` (the latter is ``ds`` unfiltered).

    Parameters
    ----------
    ds :
        Tweets dataset with a ``created_at`` datetime variable.
    subfix : str
        Suffix inserted into the three output file names.
    """

    def _created_between(start, end):
        # Half-open interval [start, end): a tweet stamped exactly at `start`
        # belongs to the period, one stamped exactly at `end` to the next.
        in_period = (ds.created_at >= np.datetime64(start)) & (ds.created_at < np.datetime64(end))
        return ds.where(in_period, drop=True)

    ds_17 = _created_between("2017-01-01T00:00:00", "2018-01-01T00:00:00")
    ds_17_01 = _created_between("2017-01-01T00:00:00", "2017-02-01T00:00:00")

    save_tweets(ds_17_01, FOLDER_TWEETS + f"tweets_2017_01_{subfix}.nc")
    save_tweets(ds_17, FOLDER_TWEETS + f"tweets_2017_{subfix}.nc")
    save_tweets(ds, FOLDER_TWEETS + f"tweets_2017-2020_{subfix}.nc")

In [None]:
# ds_norm = normalize_text_bootcamp.normalize_text_dataset(ds)
# generate_datasets(ds_norm, "era5_normed")

In [None]:
ds_norm_filtered = normalize_text_bootcamp.normalize_filter_dataset(
    ds,
    remove_punctuations="all",
    replace_keyword_emojis=False,
)

In [None]:
generate_datasets(ds_norm_filtered, "era5_normed_filtered")

### Check small dataset

In [None]:
ds_tmp = load_tweets(FOLDER_TWEETS + "tweets_2017_01_era5_normed_filtered.nc")

In [None]:
# Alphabetically sorted keys of the small January-2017 dataset.
sorted(ds_tmp.keys())

### **Precipitation**

In [None]:
ds_prec = xarray.load_dataset(FOLDER_PRECIPITATION + "ds_prec_era5_uk_2017-2020_decum.nc")

In [None]:
ds_prec

In [None]:
a2.plotting.histograms.plot_histogram(ds_prec.tp_h.values[~np.isnan(ds_prec.tp_h.values)], log=["symlog", "log"])

In [None]:
# Smallest strictly positive, non-NaN `tp_h` value in the precipitation dataset.
ds_prec["tp_h"].values[np.logical_and(~np.isnan(ds_prec.tp_h.values), ds_prec.tp_h.values > 0)].min()

In [None]:
# Set negative `tp_h` values to zero. NaN entries are left untouched because
# `NaN < 0` evaluates to False.
tp = ds_prec.tp_h.values.copy()
tp[tp < 0] = 0
# Re-attach the array using tp_h's own dimension names. Passing
# `ds_prec.coords` as dims only works while the coordinates happen to iterate
# in exactly the dimension order of `tp_h` and no extra coordinates exist.
ds_prec["tp_h"] = (ds_prec["tp_h"].dims, tp)

In [None]:
a2.plotting.histograms.plot_histogram(
    ds_prec.tp_h.values[~np.isnan(ds_prec.tp_h.values)], log=["symlog", "log"], symlog_linear_threshold=1e-8
)

### Precipitation map at location of Tweets

In [None]:
a2.plotting.weather_maps.plot_precipiation_map(ds_prec, ds_tmp.sel(index=slice(10, 20)));

In [None]:
def split_dataset_by_year(
    ds,
    prefix="ds_precipitation_",
    years=(2017, 2021),
    folder=FOLDER_PRECIPITATION,
    key_time="time_half",
):
    """Write one netCDF file per calendar year of ``ds``.

    Parameters
    ----------
    ds :
        Dataset containing the datetime variable/coordinate ``key_time``.
    prefix : str
        File name prefix; each file is named ``{folder}{prefix}{year}.nc``.
    years : sequence
        Arguments forwarded to ``np.arange``; with the default
        ``(2017, 2021)`` the years 2017, 2018, 2019 and 2020 are written.
    folder : str
        Output folder, concatenated directly with the file name (so it must
        end with a path separator).
    key_time : str
        Name of the time variable used to select each year.
    """
    for year in np.arange(*years):
        # Half-open interval [Jan 1 of `year`, Jan 1 of `year + 1`), so a
        # sample stamped exactly at midnight on Jan 1 lands in its own year
        # instead of being dropped from every file.
        in_year = (ds[key_time] >= np.datetime64(f"{year}-01-01T00:00:00")) & (
            ds[key_time] < np.datetime64(f"{year + 1}-01-01T00:00:00")
        )
        ds.where(in_year, drop=True).to_netcdf(f"{folder}{prefix}{year}.nc")


split_dataset_by_year(ds_prec)
ds_prec

In [None]:
ds_17 = xarray.load_dataset(FOLDER_PRECIPITATION + "ds_precipitation_2017.nc")

In [None]:
np.nanmin(ds_17["tp_h"].values)