# Train simple DeBERTa model

- Training the [DeBERTa small](https://huggingface.co/microsoft/deberta-v3-small/tree/main) model to classify Tweets as "raining" / "not raining"
- Results:
    - Best model trained on all Tweets (2017-2020, ~1.3 million) achieves an F1-score of 0.66 for the minority class
    - Best model trained on a small subset of Tweets (2020) achieves an F1-score of 0.64 for the minority class
- `MLflow` logging included, with logging results saved on 'cloud.mantik.ai'
- Visualize results as a confusion matrix, ROC curve and plot of the classifier's certainty
- Plot weather maps around location of Tweet to check for clear discrepancies between weather forecast and information given in Tweets
- The environment is set up as an IPython kernel based on the singularity image described in `singularity_images/README_singularity_image.md`

In [None]:
# Enable IPython's autoreload extension (mode 2) so that changes to imported
# libraries are picked up without restarting the kernel or re-importing them.
%load_ext autoreload
%autoreload 2

In [None]:
import sys

LIBRARY_PATH = "/p/home/jusers/ehlert1/juwels/a2/src/"
sys.path.append(LIBRARY_PATH)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pathlib
import re
import os
import logging

logging.basicConfig(level=logging.DEBUG)

import xarray
import torch
import torch.nn.functional
import datasets

import sklearn.metrics
import sklearn.model_selection

import transformers

import ray
import ray.tune
import ray.tune.integration.mlflow
import a2.plotting
import a2.dataset
import a2.utils
import a2.training
import a2.training.dataset_hugging
import a2.training.training_hugging
import a2.training.evaluate_hugging
import mantik
import mlflow

## Set up environment and check for GPU availability

In [None]:
# Explicitly set DISABLE_MLFLOW_INTEGRATION to "False", i.e. MLflow logging is
# not disabled (presumably read by the Hugging Face MLflow callback -- confirm).
os.environ["DISABLE_MLFLOW_INTEGRATION"] = "False"
# Echo the variable from a subshell to check that child processes see it.
!echo $DISABLE_MLFLOW_INTEGRATION

In [None]:
# List the name of every CUDA device visible to this process; an empty list
# means no GPU is available. `torch.cuda.device(i)` is only a context manager
# for selecting a device, so displaying it reveals nothing about the hardware.
gpu_names = [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())]
gpu_names

In [None]:
# Release GPU memory cached by PyTorch; may be required after a training run
# was interrupted (by a bug or by user input) without freeing its memory.
torch.cuda.empty_cache()

In [None]:
# Number of target classes (the "raining" label built below is binary: 0 or 1).
# NOTE(review): not referenced by any other cell visible in this notebook.
num_labels = 2

In [None]:
# Root folders for input data and model output; every folder path ends with a
# trailing slash because file names are appended to it directly.
FOLDER_DATA = "/p/scratch/deepacf/maelstrom/maelstrom_data/ap2/data/tweets/"
FOLDER_WEATHER_DATA = "/p/scratch/deepacf/maelstrom/maelstrom_data/ap2/data/precipitation/"
FOLDER_MODEL_OUTPUT = "/p/scratch/deepacf/maelstrom/maelstrom_data/ap2/models_output/"

# NetCDF file holding the prepared Tweets.
FILE_DATA = os.path.join(
    FOLDER_DATA,
    "2017_2020_tweets_rain_sun_vocab_emojis_locations_bba_Tp_era5_no_bots_normalized_filtered.nc",
)
print(f"Tweet file: {FILE_DATA}")

In [None]:
# Local folder containing the model repository downloaded from Hugging Face
# (see link at the top of the notebook).
model_nm = "/p/project/deepacf/maelstrom/ehlert1/deberta-v3-small"

## Loading prepared dataset

In [None]:
# Load the prepared Tweets from FILE_DATA into `ds_raw`
# (NOTE(review): exact effect of `raw=True` is defined by the a2 helper -- confirm).
ds_raw = a2.dataset.load_dataset.load_tweets_dataset(FILE_DATA, raw=True)

# Number of Tweets = length of the "index" coordinate.
print(f"loaded {ds_raw.index.shape[0]} tweets")

In [None]:
# Store a copy of the normalized Tweet text as the "text" variable along "index".
ds_raw["text"] = (["index"], ds_raw.text_normalized.values.copy())

In [None]:
# Binary target label: 1 where `tp_h` exceeds 1e-8, otherwise 0
# (presumably `tp_h` is hourly total precipitation -- confirm units).
ds_raw["raining"] = (["index"], (ds_raw.tp_h.values > 1e-8).astype(int))

## Build training/test set

In [None]:
# Split the positions 0..N-1 of all Tweets into 80 % training and 20 % test
# indices. The split is shuffled with a fixed seed (reproducible) and
# stratified on the "raining" label so both parts keep the class balance.
indices_train, indices_test = sklearn.model_selection.train_test_split(
    np.arange(ds_raw.index.shape[0]),
    stratify=ds_raw["raining"].values,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [None]:
# Render matplotlib figures inline in the notebook.
%matplotlib inline
# Class distribution of the "raining" label in the test split
# (trailing semicolon suppresses the textual cell output).
ds_raw.sel(index=indices_test)["raining"].plot.hist();

In [None]:
# Class distribution of the "raining" label in the training split.
ds_raw.sel(index=indices_train)["raining"].plot.hist();

In [None]:
# Dataset builder for the model stored at `model_nm`; its `tokenizer`
# attribute is passed to the trainer in the training cell.
dataset_object = a2.training.dataset_hugging.DatasetHuggingFace(model_nm)

In [None]:
# Build the dataset handed to the trainer from the Tweets and the
# train/test index split.
dataset = dataset_object.build(ds_raw, indices_train, indices_test)

## Training

In [None]:
# Sub-folder for this experiment; it is appended to FOLDER_MODEL_OUTPUT when
# the trainer is created.
FOLDER_OUTPUT = "output_rainprediction_simpledeberta_large_dataset_finetuning/"

# Hyper-parameters of the classifier, left at the defaults of the a2 class.
hyper_parameters = a2.training.training_hugging.HyperParametersDebertaClassifier()
print(hyper_parameters)

In [None]:
# Factory object that creates the trainer for the model stored at `model_nm`.
trainer_object = a2.training.training_hugging.HuggingFaceTrainerClass(model_nm)

In [None]:
# Train the classifier and evaluate it on the test split, logging parameters,
# metrics and figures to a single MLflow run.

# Forwarded to `get_trainer(hyper_tuning=...)`. The name was used here without
# being defined anywhere in the notebook, which raised a NameError.
# NOTE(review): False is assumed to select a plain training run without
# hyper-parameter search -- confirm against `get_trainer`.
HYPER_TUNING = False

mlflow.end_run()  # close a run that an earlier, interrupted execution may have left open
a2.training.tracking.initialize_mantik()
with mlflow.start_run(run_name="era5 whole dataset"):
    # NOTE(review): second initialization kept from the original; presumably
    # redundant with the call above -- confirm before removing.
    a2.training.tracking.initialize_mantik()
    mlflow.log_param(
        "data_description",
        "tweets 2017-2020, keywords emojis as description, keywords only, larage dataset",
    )
    trainer = trainer_object.get_trainer(
        dataset,
        hyper_parameters,
        tokenizer=dataset_object.tokenizer,
        folder_output=FOLDER_MODEL_OUTPUT + FOLDER_OUTPUT,
        hyper_tuning=HYPER_TUNING,
        fp16=True,
    )
    trainer.train()

    # Predict on the test split only (`train=False`).
    test_ds = dataset_object.build(ds_raw, indices_train, indices_test, train=False)
    predictions, prediction_probabilities = a2.training.evaluate_hugging.predict_dataset(test_ds, trainer)

    # Attach predictions and probabilities to the test subset of the Tweets.
    ds_test = a2.training.evaluate_hugging.build_ds_test(
        ds=ds_raw,
        indices_test=indices_test,
        predictions=predictions,
        prediction_probabilities=prediction_probabilities,
    )
    truth = ds_test.raining.values
    probability_raining = ds_test["prediction_probability_raining"].values

    # The original read `params["epochs"]`, but `params` is never defined in
    # this notebook. NOTE(review): assumes the hyper-parameter object exposes
    # the number of epochs as `epochs` -- confirm.
    a2.training.tracking.log_metric_classification_report(truth, predictions, step=hyper_parameters.epochs)

    a2.plotting.analysis.plot_prediction_certainty(
        truth=truth,
        prediction_probabilities=probability_raining,
        filename="plot_2d_predictions_truth.pdf",
    )
    mlflow.log_artifact("plot_2d_predictions_truth.pdf")

    # A ROC curve needs scores, not hard labels; pass the probability of the
    # "raining" class, consistent with the evaluation of the loaded model below.
    a2.plotting.analysis.plot_roc(truth, probability_raining, filename="roc.pdf")
    mlflow.log_artifact("roc.pdf")

In [None]:
# Distribution of the predicted probability of "raining" on the test split.
ds_test.prediction_probability_raining.plot.hist(bins=100);

## Evaluate loaded model

In [None]:
# Checkpoint folder of a previously trained model that is evaluated below.
FOLDER_MODEL_TO_LOAD = "/p/scratch/deepacf/maelstrom/maelstrom_data/ap2/models_output/output_rainprediction_simpledeberta_large_dataset/checkpoint-7617"

In [None]:
# Predict the test split with the model stored in FOLDER_MODEL_TO_LOAD; the
# helper returns the true labels, predicted labels and class probabilities.
truth, predictions, prediction_probabilities = a2.training.evaluate_hugging.make_predictions_loaded_model(
    ds_raw, indices_test, FOLDER_MODEL_TO_LOAD
)

In [None]:
# Compare predicted labels with the truth; the returned report is printed below.
report = a2.plotting.analysis.check_prediction(truth, predictions)
# Column 1 of `prediction_probabilities` is used as the probability of the
# positive ("raining") class for the certainty plot and the ROC curve.
a2.plotting.analysis.plot_prediction_certainty(truth=truth, prediction_probabilities=prediction_probabilities[:, 1])
a2.plotting.analysis.plot_roc(truth, prediction_probabilities[:, 1])
print(report)

## Precipitation map analysis

In [None]:
# Load the precipitation dataset into memory; it is plotted as weather maps
# around the Tweets below.
ds_p = xarray.load_dataset(FOLDER_WEATHER_DATA + "ds_prec_era5_uk_2017-2020.nc")

In [None]:
# Rebuild the test dataset with the predictions and probabilities of the
# loaded model (overwrites `ds_test` from the training cell, if it was run).
ds_test = a2.training.evaluate_hugging.build_ds_test(
    ds=ds_raw,
    indices_test=indices_test,
    predictions=predictions,
    prediction_probabilities=prediction_probabilities,
)

In [None]:
# Class distribution of the "raining" label in the test dataset.
ds_test.raining.plot.hist()

In [None]:
# Inspect weather maps around a random sample of Tweets that are labelled
# "raining", are predicted as "raining" (probability > 0.5) and whose `tp_h`
# lies in the narrow band 1e-7 < tp_h < 2e-6.
ds = ds_test

# Overall performance on the full test set at a 0.5 probability threshold.
print(
    sklearn.metrics.classification_report(
        ds.raining.values,
        ds.prediction_probability_raining > 0.5,
        target_names=["not raining", "raining"],
    )
)
a2.plotting.analysis.plot_prediction_certainty(
    truth=ds["raining"].values,
    prediction_probabilities=ds["prediction_probability_raining"].values,
)

ds_selected = ds.where(
    (ds.raining == 1) & (ds.prediction_probability_raining > 0.5) & (ds.tp_h > 1e-7) & (ds.tp_h < 2e-6),
    drop=True,
)
# Re-number the index so that positions drawn below are valid index labels.
ds_selected = a2.dataset.load_dataset.reset_index_coordinate(ds_selected)
n_selected = ds_selected.index.shape[0]
print(f"found {n_selected} tweets in total")

# Sampling 40 Tweets without replacement raises a ValueError when fewer than
# 40 Tweets match the selection, so cap the sample size at what is available.
n_maps = min(40, n_selected)
indices = np.random.choice(
    n_selected,
    n_maps,
    replace=False,
)
print(indices)
if n_maps > 0:  # nothing to plot when no Tweet matches the selection
    a2.plotting.weather_maps.plot_precipiation_map(
        ds_p,
        ds_selected.sel(index=indices),
        n_time=2,
        delta_time=1,
        delta_time_units="h",
        delta_longitude=1.2,
        delta_latitude=1.2,
        # filename="precipitation_maps_around_tweets.png",
        add_time_before_plot=pd.Timedelta("30min"),
        print_additional=[
            "bounding_box_area",
            "prediction_probability_raining",
            "tp_h",
        ],
    )

In [None]:
# 2D histogram of `tp_h` (x, symlog scale with linear threshold 1e-9) against
# the predicted probability of "raining" (y, linear scale).
a2.plotting.histograms.plot_histogram_2d(
    ds_test.tp_h.values,
    ds_test.prediction_probability_raining.values,
    log=["symlog", False],
    linear_thresh=1e-9,
    xlim=[-1, 1],
)

In [None]:
# Same 2D histogram with a logarithmic x-axis restricted to tp_h in [1e-7, 1]
# and 120 bins.
a2.plotting.histograms.plot_histogram_2d(
    ds_test.tp_h.values,
    ds_test.prediction_probability_raining.values,
    log=["log", False],
    xlim=[1e-7, 1],
    n_bins=120,
)