# Code used to prepare the exercises for the bootcamp (for reference only)

In [None]:
# allows update of external libraries without need to reload package
%load_ext autoreload
%autoreload 2

In [None]:
import sys
import os
import pathlib
import datetime
import numpy as np
import xarray
import matplotlib

sys.path.append("../scripts")
import normalize_text_bootcamp
import dataset_bootcamp
import utils_bootcamp
import plotting

import re

In [None]:
# NOTE(review): FOLDER_TO_TWEETS is assigned three times in a row; only the last
# assignment is in effect when FILE_TWEETS is built below. The first two values are
# paths to .nc files rather than folders -- presumably leftovers from earlier runs;
# confirm before removing them.
FOLDER_TO_TWEETS = "/p/project/training2223/a2/data/tweets/tweets_2017_normalized.nc"
FOLDER_TO_TWEETS = "../../data/tweets/tweets_2017_normalized.nc"
FOLDER_TO_TWEETS = "/p/project/training2223/a2/base_data/"
# Path of the tweet dataset, built by plain string concatenation, so the folder
# value above has to end with "/".
FILE_TWEETS = FOLDER_TO_TWEETS + "2017_2020_tweets_rain_sun_vocab_emojis_locations_bba_Tp_era5_no_bots.nc"


def load_tweets(folder):
    """Load a dataset with xarray and reset its index coordinate.

    Args:
        folder: Path handed to ``xarray.load_dataset``. Despite the name, the
            callers in this notebook pass the path of a ``.nc`` file.

    Returns:
        The result of ``dataset_bootcamp.reset_index_coordinate`` applied to
        the loaded dataset.
    """
    loaded = xarray.load_dataset(folder)
    return dataset_bootcamp.reset_index_coordinate(loaded)

In [None]:
ds = load_tweets(FILE_TWEETS)

In [None]:
ds

In [None]:
# Variables removed from the dataset before further processing.
unused_variables = [
    "withheld.copyright",
    "withheld.country_codes",
    "withheld.scope",
    "time",
]
# `drop_vars` is the non-deprecated replacement for `Dataset.drop` when removing
# variables by name; it returns a new dataset, which is then re-indexed.
ds = dataset_bootcamp.reset_index_coordinate(ds.drop_vars(unused_variables))
ds

In [None]:
# Store a copy of the current text values as "text_original" (along the "index"
# dimension) before the normalization call below.
ds["text_original"] = (["index"], ds.text.values.copy())
# NOTE(review): presumably normalize_text_dataset rewrites ds.text -- confirm in
# normalize_text_bootcamp.
ds_norm = normalize_text_bootcamp.normalize_text_dataset(ds)

In [None]:
ds_norm

In [None]:
def generate_datasets(ds, subfix):
    """Write three netCDF files derived from ``ds`` into ``FOLDER_TO_TWEETS``.

    Files written, in this order (names are built from ``subfix``):
      * ``tweets_2017_{subfix}.nc``      -- entries whose ``created_at`` lies in 2017
      * ``tweets_2017_01_{subfix}.nc``   -- entries whose ``created_at`` lies in January 2017
      * ``tweets_2017-2020_{subfix}.nc`` -- ``ds`` as passed in (no time filter applied)

    The time windows are half-open, ``[start, end)``: an entry stamped exactly
    2017-01-01T00:00:00 is included, one stamped exactly at the end bound is not.

    Args:
        ds: Dataset with a ``created_at`` variable comparable to ``np.datetime64``.
        subfix: String inserted into the output file names.
    """
    window_start = np.datetime64("2017-01-01T00:00:00")
    windows = (
        ("tweets_2017", np.datetime64("2018-01-01T00:00:00")),
        ("tweets_2017_01", np.datetime64("2017-02-01T00:00:00")),
    )
    for stem, window_end in windows:
        in_window = (ds.created_at >= window_start) & (ds.created_at < window_end)
        ds.where(in_window, drop=True).to_netcdf(FOLDER_TO_TWEETS + f"{stem}_{subfix}.nc")
    ds.to_netcdf(FOLDER_TO_TWEETS + f"tweets_2017-2020_{subfix}.nc")


generate_datasets(ds_norm, "era5_normed")

In [None]:
ds_norm_filtered = normalize_text_bootcamp.normalize_filter_dataset(ds)

In [None]:
generate_datasets(ds_norm_filtered, "era5_normed_filtered")

### **Precipitation**

In [None]:
ds_p = xarray.load_dataset("/p/project/training2223/a2/data/precipitation/ds_prec_era5_uk_2017-2020.nc")

In [None]:
ds_p

In [None]:
def split_dataset_by_year(
    ds,
    prefix="ds_precipitation_",
    years=(2017, 2021),
    folder="/p/project/training2223/a2/data/precipitation/",
):
    """Write one netCDF file per calendar year of ``ds``.

    Each file holds the entries whose ``time`` lies in the half-open window
    ``[y-01-01T00:00:00, (y+1)-01-01T00:00:00)``, so a sample stamped exactly at
    midnight on 1 January belongs to the year it starts and no sample is lost
    between consecutive files.

    Args:
        ds: Dataset with a ``time`` coordinate comparable to ``np.datetime64``.
        prefix: File name prefix; the year and ``.nc`` are appended to it.
        years: Arguments unpacked into ``np.arange`` (start year, exclusive stop
            year, optional step). An immutable tuple is used as the default to
            avoid a shared mutable default argument.
        folder: Output location, joined with the file name by plain string
            concatenation (so it has to end with a path separator).
    """
    for y in np.arange(*years):
        year_start = np.datetime64(f"{y}-01-01T00:00:00")
        year_end = np.datetime64(f"{y + 1}-01-01T00:00:00")
        ds_y = ds.where((ds.time >= year_start) & (ds.time < year_end), drop=True)
        ds_y.to_netcdf(f"{folder}{prefix}{y}.nc")


split_dataset_by_year(ds_p)
ds_p

In [None]:
# NOTE(review): unfinished cell -- `xa` is not defined in the visible cells, so
# running this line raises NameError. Presumably an xarray call was intended;
# complete or delete it.
ds_17 = xa

## Filtered (normalized) dataset version

In [None]:
ds_tweets_full = load_tweets("../../data/tweets/tweets_2017-2020_normalized.nc")

In [None]:
ds_tweets_fn = normalize_text_bootcamp.normalize_filter_dataset(ds_tweets_full)

In [None]:
ds_tweets_fn.to_netcdf("../../data/tweets/tweets_2017-2020_normalized_filtered.nc")

In [None]:
ds_tweets_fn

In [None]:
# small dataset

In [None]:
ds_tweets_17 = load_tweets("../../data/tweets/tweets_2017_normalized.nc")

In [None]:
ds_tweets_17_fn = normalize_text_bootcamp.normalize_filter_dataset(ds_tweets_17)

In [None]:
ds_tweets_17_fn.to_netcdf("../../data/tweets/tweets_2017_normalized_filtered.nc")

In [None]:
ds_tweets_17_fn

In [None]:
def drop_var_save(drop_variables=("text",)):
    """Remove variables from the stored tweet datasets and overwrite the files.

    Every file listed below is loaded with ``load_tweets``, stripped of those
    names in ``drop_variables`` that it actually contains, and written back to
    the same path.

    Args:
        drop_variables: Iterable of variable names to remove. Names missing
            from a given file are skipped for that file.
    """
    folder_tweets = "../../data/tweets/"
    file_names = (
        "tweets_2017-2020_normalized_filtered.nc",
        "tweets_2017-2020_normalized.nc",
        "tweets_2017_normalized.nc",
        "tweets_2017_normalized_filtered.nc",
    )
    for file_name in file_names:
        path = folder_tweets + file_name
        ds_tweets = load_tweets(path)
        present = [var for var in drop_variables if var in ds_tweets.variables]
        if present:
            # drop_vars returns a new dataset instead of modifying in place, so
            # the result has to be rebound; otherwise the file is rewritten unchanged.
            ds_tweets = ds_tweets.drop_vars(present)
        ds_tweets.to_netcdf(path)


drop_var_save()

## ERA5 dataset preprocessing

In [None]:
ds_tp = xarray.load_dataset(
    "/home/kristian/Downloads/adaptor.mars.internal-1663932222.0837457-20356-1-fc1bb587-1cee-4ba4-8686-658e3cd9bca8.nc"
)

In [None]:
ds_tp

In [None]:
ds_tp.tp.shape

In [None]:
ds_tp.time.values

In [None]:
# Shift every time stamp by 30 minutes and leave out the last one, in a single
# expression; the final bare name displays the result in the notebook.
time_half = (ds_tp.time.values + np.timedelta64(datetime.timedelta(minutes=30)))[:-1]
time_half

In [None]:
# New dataset whose "tp" holds the differences between consecutive entries of
# ds_tp.tp along its first axis (labelled "time" here). Differencing yields one
# entry fewer than the input, matching time_half, which drops the last time stamp.
# NOTE(review): presumably ds_tp.tp is an accumulated quantity -- confirm.
ds = xarray.Dataset(
    coords={
        "longitude": ds_tp.longitude.values,
        "latitude": ds_tp.latitude.values,
        "time": time_half,
    },
    data_vars={
        "tp": (["time", "latitude", "longitude"], np.diff(ds_tp.tp.values, axis=0)),
    },
)

In [None]:
# ds.to_netcdf('../../data/precipitation/ds_prec_era5_uk_2017-2020.nc')

In [None]:
ds.sel(time=np.datetime64("2020-01-01T02:30:00.000000000")).tp.plot()

In [None]:
ds_old = xarray.load_dataset("../../data/precipitation/ds_precipitation_2020.nc")

In [None]:
# Plot tp of the old dataset at 2020-01-01T02:30, cropped to x in [-9, 3] and
# y in [49, 61], using a logarithmic colour scale between 1e-6 and 1e-3.
ds_old.sel(time=np.datetime64("2020-01-01T02:30:00.000000000")).tp.plot(
    xlim=[-9, 3],
    ylim=[49, 61],
    norm=matplotlib.colors.LogNorm(vmax=1e-3, vmin=1e-6),
    cmap="ocean_r",
)

In [None]:
# Same plot for the new dataset (same time stamp, axis limits, colour scale and
# colour map) so both figures can be compared side by side.
ds.sel(time=np.datetime64("2020-01-01T02:30:00.000000000")).tp.plot(
    xlim=[-9, 3],
    ylim=[49, 61],
    norm=matplotlib.colors.LogNorm(vmax=1e-3, vmin=1e-6),
    cmap="ocean_r",
)

In [None]:
# Reduce the old dataset to latitude 49..61, longitude -9..3 and time stamps up to
# 2020-12-31T22:30:00 (all bounds inclusive); entries outside are dropped.
# NOTE(review): the time condition reads `ds.time` (the new dataset), while every
# other condition reads `ds_old`. Confirm whether this is intentional (e.g. to
# restrict to time stamps shared by both datasets) or a copy-paste slip.
ds_old_red = ds_old.where(
    (ds_old.latitude <= 61)
    & (ds_old.latitude >= 49)
    & (ds_old.longitude >= -9)
    & (ds_old.longitude <= 3)
    & (ds.time <= np.datetime64("2020-12-31T22:30:00")),
    drop=True,
)

In [None]:
# ds_old_red.to_netcdf('../../data/precipitation/ds_prec_uk_2020.nc')

In [None]:
ds_old_red

In [None]:
ds

In [None]:
# Keep the time steps from 2020-01-01T00:30:00 (inclusive) up to
# 2021-01-01T00:30:00 (exclusive); everything else is dropped.
ds_2020 = ds.where(
    (ds.time >= np.datetime64("2020-01-01T00:30:00")) & (ds.time < np.datetime64("2021-01-01T00:30:00")),
    drop=True,
)

In [None]:
# ds_2020.to_netcdf('../../data/precipitation/ds_prec_era5_uk_2020.nc')

In [None]:
# Compare new and old tp values element by element in a 2D histogram.
# The mask marks the entries where the new 2020 dataset is not NaN.
mask = ~np.isnan(ds_2020.tp.values)
x = ds_2020.tp.values[mask]
# NOTE(review): indexing the old array with a mask built from the new one requires
# both tp arrays to have the same shape -- confirm the two selections line up.
y = ds_old_red.tp.values[mask]
plotting.histograms.plot_histogram_2d(
    x,
    y,
    label_x="new tp",
    label_y="old tp",
    log="symlog",
    linear_thresh=1e-8,
    norm="log",
    cmap="tab20c",
)

In [None]:
np.sum(x > 0) / np.sum(y > 0)