# Train DeBERTa model to assign relevance for rain prediction to Tweets

- Fine-tuning the pre-trained model [DeBERTa small](https://huggingface.co/microsoft/deberta-v3-small/tree/main)
- Results:
    - Best model trained on all Tweets (2017-2020, ~1.3 million) gives an f1-score of 0.66 for the minority class
    - Best model trained on a small subset of Tweets (2020) gives an f1-score of 0.64 for the minority class
- `MLflow` logging included, with logging results saved on 'cloud.mantik.ai'
- Visualize results in the form of a confusion matrix, ROC curve and certainty of the classifier
- Plot weather maps around location of Tweet to check for clear discrepancies between weather forecast and information given in Tweets
- Setting up environment as ipython kernel based on singularity image outlined in `singularity_images/README_singularity_image.md`

In [None]:
!pwd

In [None]:
# allows update of external libraries without need to reload package
%load_ext autoreload
%autoreload 2

In [None]:
import sys

LIBRARY_PATH = "/p/project/deepacf/maelstrom/ehlert1/a2/src/"
sys.path.append(LIBRARY_PATH)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pathlib
import re
import os
import logging

logging.basicConfig(level=logging.INFO)

import xarray
import torch
import torch.nn.functional
import datasets

import sklearn.metrics
import sklearn.model_selection

import transformers

import ray
import ray.tune
import ray.tune.integration.mlflow

import a2.utils.file_handling
import a2.preprocess.normalize_text
import a2.training.dataset_hugging
import a2.training.training_hugging
import a2.training.evaluate_hugging
import mantik
import mlflow

## Setup environment and check for gpu availability

In [None]:
# Set DISABLE_MLFLOW_INTEGRATION to "False" (i.e. the MLflow integration is not disabled) and echo it for verification
# NOTE(review): presumably read by the `transformers` MLflow callback -- confirm
os.environ["DISABLE_MLFLOW_INTEGRATION"] = "False"
!echo $DISABLE_MLFLOW_INTEGRATION

In [None]:
# List the names of all visible CUDA devices; an empty list means no GPU is available.
# (`torch.cuda.device(i)` only builds context-manager objects whose repr carries no device information.)
[torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())]

In [None]:
torch.cuda.empty_cache()  # empties gpu memory, may be required after interrupting training due to bugs/user input

In [None]:
# Root folder that contains all Tweet datasets
FOLDER_TWEETS = "/p/scratch/deepacf/maelstrom/maelstrom_data/ap2/data/tweets/"

# Dataset used in this notebook. Alternative datasets are kept as commented-out options
# (instead of being assigned and immediately overwritten) and can be swapped in.
# FILE_DATA = (
#     FOLDER_TWEETS + "2017_2020_tweets_rain_sun_vocab_emojis_locations_bba_Tp_era5_no_bots_normalized_filtered.nc"
# )
FILE_DATA = (
    FOLDER_TWEETS
    + "2017_2020_tweets_rain_sun_vocab_emojis_locations_bba_Tp_era5_no_bots_normalized_filtered_weather_stations_fix_predicted_simpledeberta_radar.nc"
)
# FILE_DATA = FOLDER_TWEETS + "2020_tweets_rain_sun_vocab_emojis_locations_bba_Tp_era5_no_bots_normalized_filtered_weather_stations_fix_predicted_simpledeberta_radar.nc"
# FILE_DATA = FOLDER_TWEETS + "2017_2020_tweets_keywords_near_station.nc"

# Folder with precipitation data and folder that receives the output of trained models
FOLDER_WEATHER_DATA = "/p/scratch/deepacf/maelstrom/maelstrom_data/ap2/data/precipitation/"
FOLDER_MODEL_OUTPUT = "/p/scratch/deepacf/maelstrom/maelstrom_data/ap2/models_output/"
print(f"Tweet file: {FILE_DATA}")

In [None]:
# Collect all JSON files stored under "tweets_no_keywords" and load them into a single xarray dataset.
# These Tweets are labelled as not relevant further below.
all_files = a2.utils.file_handling.get_all_files(FOLDER_TWEETS + "tweets_no_keywords/*.json")
ds_irr = a2.dataset.load_dataset.load_tweets_dataframe_from_jsons(all_files).to_xarray()
print(f"loaded {ds_irr.index.shape[0]} tweets")

In [None]:
ds_irr

In [None]:
# Passed to `get_trainer` as `hyper_tuning` when the trainer is created
HYPER_TUNING = False

# Folder of the pre-trained model; passed to `HuggingFaceTrainerClass` and `DatasetHuggingFace`
# NOTE(review): the path points to "deberta-v3-base" while the notebook header mentions DeBERTa small -- confirm
FOLDER_MODEL = "/p/scratch/deepacf/maelstrom/maelstrom_data/ap2/models_output/deberta-v3-base"

# Number of target classes of the classifier (binary classification)
num_labels = 2

## Loading prepared dataset

In [None]:
# Load the prepared Tweet dataset selected via FILE_DATA
# NOTE(review): the effect of `raw=True` is defined by `load_tweets_dataset` -- check there
ds_rev = a2.dataset.load_dataset.load_tweets_dataset(FILE_DATA, raw=True)
print(f"loaded {ds_rev.index.shape[0]} tweets")

In [None]:
# Take as many Tweets from `ds_rev` as there are Tweets in `ds_irr`, so both classes have the same size.
# Positional `isel` is used on purpose: label-based `sel` with a slice includes the stop label
# and would therefore select one Tweet more than `ds_irr` contains.
ds_rev_sel = ds_rev.isel(index=slice(0, ds_irr.index.shape[0]))

In [None]:
def merge_datasets_along_index(ds_top, ds_bottom):
    """Merge two datasets after moving the index of ``ds_bottom`` behind ``ds_top``.

    A copy of ``ds_bottom`` gets consecutive integer ``index`` labels that start
    at the number of entries of ``ds_top``; the copy is then merged with
    ``ds_top`` via ``xarray.merge``. Both inputs are left unmodified.

    NOTE(review): labels only stay disjoint if ``ds_top`` is indexed with
    values below its own length (e.g. 0..n-1) -- confirm for new callers.
    """
    n_top = ds_top.index.shape[0]
    n_bottom = ds_bottom.index.shape[0]
    shifted = ds_bottom.copy()
    shifted["index"] = range(n_top, n_top + n_bottom)
    return xarray.merge([ds_top, shifted])

In [None]:
# Label every Tweet of the selected prepared dataset as relevant (True)
# and every Tweet loaded from "tweets_no_keywords" as not relevant (False)
ds_rev_sel["relevant"] = (["index"], np.ones(ds_rev_sel.index.shape[0], dtype=bool))
ds_irr["relevant"] = (["index"], np.zeros(ds_irr.index.shape[0], dtype=bool))

In [None]:
# Combine relevant and not relevant Tweets into one dataset (not relevant Tweets are re-indexed behind the relevant ones)
ds_raw = merge_datasets_along_index(ds_rev_sel, ds_irr)

In [None]:
# Histogram of the label to check the balance between both classes
ds_raw["relevant"].plot.hist()

In [None]:
# Normalize and filter the text of the merged Tweets.
# The `key_text_*` arguments name the columns of the original, normalized and backup text;
# see `normalize_filter_dataset` for the exact meaning of the remaining options.
# NOTE(review): the result is only inspected in the following cell; all later cells
# (train/test split, dataset building, evaluation) continue with `ds_raw` -- confirm this is intended.
ds_normalized_filtered = a2.preprocess.normalize_text.normalize_filter_dataset(
    ds_raw,
    keywords=None,
    reset_index=True,
    key_text_original="text",
    key_text_normalized="text_normalized",
    key_text_backup="text_original",
    ignore_non_ascii=False,
    replace_keyword_emojis=False,
    remove_punctuations="keep_basic_punctuations",
    reduce_punctuations=True,
    use_lower_case=False,
    do_split_punctuation_text=True,
    remove_sun_confusing_terms=True,
    only_text_containing_keywords=False,
    maximum_bounding_box_area=None,
    only_unique_text=True,
    processes=-1,
    keep_emojis="all",
)

In [None]:
# Print a sample of the normalized Tweets, additionally showing the "text_normalized" field
a2.dataset.utils_dataset.print_tweet_sample(ds_normalized_filtered, additional_fields=["text_normalized"])

In [None]:
# Name of the dataset variable used as classification target
key_label = "relevant"
# key_label = "raining"
# Name of the dataset variable that holds the text given to the model
key_inputs = "text_normalized"

In [None]:
# Overwrite the "text" variable with a copy of the model input column (`key_inputs`)
# NOTE(review): `ds_raw` is the merged dataset, not `ds_normalized_filtered`; confirm that
# "text_normalized" is available for the Tweets that were loaded from "tweets_no_keywords"
ds_raw["text"] = (["index"], ds_raw[key_inputs].values.copy())

In [None]:
# Object that later provides the trainer (`get_trainer`) for the model in FOLDER_MODEL with `num_labels` classes
trainer_object = a2.training.training_hugging.HuggingFaceTrainerClass(FOLDER_MODEL, num_labels=num_labels)

## Build training/test set

In [None]:
# Split the positions 0..n-1 of all Tweets into 80 % training and 20 % test indices.
# The split is shuffled with a fixed seed (reproducible) and stratified by the label,
# so both sets keep the class ratio of the full dataset.
indices_train, indices_test = sklearn.model_selection.train_test_split(
    np.arange(ds_raw["index"].shape[0]),
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=ds_raw[key_label].values,
)

In [None]:
# Label distribution of the test set
# (`sel` looks the split indices up as index labels, i.e. assumes labels 0..n-1)
%matplotlib inline
ds_raw.sel(index=indices_test)[key_label].plot.hist();

In [None]:
# Label distribution of the training set
ds_raw.sel(index=indices_train)[key_label].plot.hist();

In [None]:
# Builder for the Hugging Face datasets; it also exposes the `tokenizer` handed to the trainer below
# NOTE(review): presumably the tokenizer is loaded from FOLDER_MODEL -- confirm in `DatasetHuggingFace`
dataset_object = a2.training.dataset_hugging.DatasetHuggingFace(FOLDER_MODEL)

In [None]:
# Build the dataset for training from the train/test indices, using the configured input and label variables
dataset = dataset_object.build(ds_raw, indices_train, indices_test, key_inputs=key_inputs, key_label=key_label)

## Training

In [None]:
# Hyper-parameters of the DeBERTa classifier (defaults of `HyperParametersDebertaClassifier`)
hyper_parameters = a2.training.training_hugging.HyperParametersDebertaClassifier()
print(hyper_parameters)
# FOLDER_OUTPUT = "output_rainprediction_simpledeberta_large_dataset_finetuning/"
# Sub-folder, appended to FOLDER_MODEL_OUTPUT, that receives the training output; named after the label
FOLDER_OUTPUT = f"output_{key_label}_modelling/"

In [None]:
# NOTE(review): re-creates `trainer_object` with the same arguments as the cell preceding the
# train/test split; one of the two cells is redundant
trainer_object = a2.training.training_hugging.HuggingFaceTrainerClass(FOLDER_MODEL, num_labels=num_labels)

In [None]:
# Build the dataset with `train=False`; it is used for the predictions after training
# NOTE(review): unlike for the training dataset, `key_inputs`/`key_label` are not passed here,
# so the defaults of `build` apply -- confirm that they match
test_ds = dataset_object.build(ds_raw, indices_train, indices_test, train=False)

# test_ds.drop_columns({'label': 'labels'})

In [None]:
# Create the trainer for the built dataset and start training.
# Output is written to FOLDER_MODEL_OUTPUT + FOLDER_OUTPUT; `fp16=True` requests half-precision training.
trainer = trainer_object.get_trainer(
    dataset,
    hyper_parameters,
    tokenizer=dataset_object.tokenizer,
    folder_output=FOLDER_MODEL_OUTPUT + FOLDER_OUTPUT,
    hyper_tuning=HYPER_TUNING,
    fp16=True,
)
trainer.train()

In [None]:
# Predict the test dataset with the freshly trained model
# trainer.predict(test_ds)
(
    predictions,
    prediction_probabilities,
) = a2.training.evaluate_hugging.predict_dataset(test_ds, trainer)

# Dataset of the test Tweets built from `ds_raw`, the test indices and the predictions
ds_test = a2.training.evaluate_hugging.build_ds_test(
    ds=ds_raw,
    indices_test=indices_test,
    predictions=predictions,
    prediction_probabilities=prediction_probabilities,
)
# Ground-truth labels of the test Tweets
truth = ds_test[key_label].values

# Disabled: logging of metrics and plots to MLflow
# a2.training.tracking.log_metric_classification_report(truth, predictions, step=params["epochs"])

# a2.plotting.analysis.plot_prediction_certainty(
#     truth=ds_test["raining"].values,
#     prediction_probabilities=ds_test["prediction_probability_raining"].values,
#     filename="plot_2d_predictions_truth.pdf",
# )
# mlflow.log_artifact("plot_2d_predictions_truth.pdf")

# a2.plotting.analysis.plot_roc(ds_test.raining.values, predictions, filename="roc.pdf")
# mlflow.log_artifact("roc.pdf")

## Evaluate loaded model

In [None]:
# Checkpoint that is loaded for evaluation. The rain-prediction checkpoint is kept as a
# commented-out alternative instead of being assigned and immediately overwritten.
# FOLDER_MODEL_TO_LOAD = "/p/scratch/deepacf/maelstrom/maelstrom_data/ap2/models_output/output_rainprediction_simpledeberta_large_dataset/checkpoint-7617"
FOLDER_MODEL_TO_LOAD = (
    "/p/scratch/deepacf/maelstrom/maelstrom_data/ap2/models_output/output_relevant_modelling/checkpoint-3000"
)

In [None]:
# Predict the test Tweets of `ds_raw` with the model stored in FOLDER_MODEL_TO_LOAD
(truth, predictions, prediction_probabilities,) = a2.training.evaluate_hugging.make_predictions_loaded_model(
    ds_raw, indices_test, FOLDER_MODEL_TO_LOAD, key_inputs=key_inputs, key_label=key_label
)

In [None]:
# Compare predictions with the truth and plot prediction certainty and ROC curve;
# the plots use column 1 of the predicted probabilities
report = a2.plotting.analysis.check_prediction(truth, predictions)
a2.plotting.analysis.plot_prediction_certainty(truth=truth, prediction_probabilities=prediction_probabilities[:, 1])
a2.plotting.analysis.plot_roc(truth, prediction_probabilities[:, 1])
print(report)

## Make predictions on custom Tweets 

In [None]:
# Keep only Tweets whose stored rain-prediction probability lies strictly between 0.3 and 0.7
ds_uncertain = ds_rev.where(
    (ds_rev["prediction_probability_raining"] > 0.3) & (ds_rev["prediction_probability_raining"] < 0.7), drop=True
)

In [None]:
# Set the label to True for every selected Tweet (all of them are taken from `ds_rev`)
ds_uncertain[key_label] = (["index"], np.ones(ds_uncertain.index.shape[0], dtype=bool))

In [None]:
# Predict all Tweets (`slice(None)`) of the uncertain subset with the loaded model
(truth, predictions, prediction_probabilities,) = a2.training.evaluate_hugging.make_predictions_loaded_model(
    ds_uncertain, slice(None), FOLDER_MODEL_TO_LOAD, key_inputs=key_inputs, key_label=key_label
)

In [None]:
# Compare predictions on the uncertain subset with the truth (all True, see above)
# and plot prediction certainty and ROC curve from column 1 of the probabilities
report = a2.plotting.analysis.check_prediction(truth, predictions)
a2.plotting.analysis.plot_prediction_certainty(truth=truth, prediction_probabilities=prediction_probabilities[:, 1])
a2.plotting.analysis.plot_roc(truth, prediction_probabilities[:, 1])
print(report)

In [None]:
prediction_probabilities[:, 1]

In [None]:
# Store column 1 of the predicted probabilities as relevance probability of each Tweet
ds_uncertain["prediction_probability_relevant"] = (["index"], prediction_probabilities[:, 1])

In [None]:
# Keep Tweets with a relevance probability below 0.7
# NOTE(review): the variable name suggests Tweets predicted as irrelevant -- confirm that 0.7 is the intended threshold
ds_irrev_pred = ds_uncertain.where(ds_uncertain["prediction_probability_relevant"] < 0.7, drop=True)

In [None]:
# Print a sample of these Tweets together with normalized text and relevance probability
a2.dataset.utils_dataset.print_tweet_sample(
    ds_irrev_pred, additional_fields=["text_normalized", "prediction_probability_relevant"]
)

## Make rain predictions on "certain" dataset

### First make relevance prediction on whole dataset

In [None]:
# Load environment variables from the given file
# NOTE(review): presumably settings/credentials needed by the tracker below -- confirm
from dotenv import load_dotenv

load_dotenv("/p/project/deepacf/maelstrom/ehlert1/a2/cluster/research/env_dev.sh")

In [None]:
# Set the label "relevant" to True for every Tweet of the full prepared dataset
ds_rev["relevant"] = (["index"], np.ones(ds_rev.index.shape[0], dtype=bool))

In [None]:
# Tracker used further below to create the experiment, start the run and log artifacts
tracker = a2.training.tracking.Tracker()

In [None]:
# Predict relevance for all Tweets (`slice(None)`) of the full prepared dataset with the loaded model
(truth, predictions, prediction_probabilities,) = a2.training.evaluate_hugging.make_predictions_loaded_model(
    ds_rev, slice(None), FOLDER_MODEL_TO_LOAD, key_inputs=key_inputs, key_label=key_label
)

In [None]:
# Store predicted class and column 1 of the predicted probabilities in the dataset
ds_rev["relevant_prediction"] = (["index"], predictions)
ds_rev["relevant_prediction_probabilities"] = (["index"], prediction_probabilities[:, 1])

In [None]:
ds_rev

In [None]:
# Switch to the checkpoint used for the "raining" predictions on the station Tweets below
FOLDER_MODEL_TO_LOAD = "/p/scratch/deepacf/maelstrom/maelstrom_data/ap2/models/trained_models/deberta-v3-small/output_rainprediction_simpledeberta_large_dataset_finetuning/checkpoint-2500"

In [None]:
# Keep Tweets with "station_distance_km" < 2 and a non-NaN "station_tp_mm"
# (units as suggested by the variable names: kilometres and millimetres -- confirm)
ds_station = ds_rev.where(
    (ds_rev["station_distance_km"] < 2) & (~a2.dataset.utils_dataset.is_nan(ds_rev, "station_tp_mm")), drop=True
)
# A Tweet counts as "raining" according to the station if the station value is larger than zero
ds_station["raining_station"] = (["index"], ds_station["station_tp_mm"].values > 0)

In [None]:
# Predict "raining" for all station Tweets with the checkpoint in FOLDER_MODEL_TO_LOAD;
# `truth` is taken from the "raining" variable of the dataset
(truth, predictions, prediction_probabilities,) = a2.training.evaluate_hugging.make_predictions_loaded_model(
    ds_station, slice(None), FOLDER_MODEL_TO_LOAD, key_inputs="text_normalized", key_label="raining"
)

In [None]:
# Show the id of the evaluation experiment
# NOTE(review): `experiment_id` is only assigned in a later cell (`tracker.create_experiment`),
# so this cell raises a NameError when the notebook is run top to bottom
experiment_id.experiment_id

In [None]:
ds_station["relevant_prediction_probabilities"].plot.hist()

In [None]:
# Value passed as `spacing_x`/`spacing_y` to the 2D histogram. It is defined in this cell because
# the cell used `spacing` before any earlier cell assigned it (NameError on a top-to-bottom run);
# 0.12 is the value used by the other histogram cells of this notebook.
spacing = 0.12

# NOTE(review): presumably closes a run that is still active from a previous execution -- confirm
tracker.end_run()
experiment_id = tracker.create_experiment("maelstrom-a2-eval")
with tracker.start_run(experiment_id=experiment_id.experiment_id, run_name="evaluate_raining"):
    # Subsets of the station Tweets; computed once and reused for truth and prediction of each report
    is_relevant = ds_station["relevant_prediction"] == 1
    is_snow = ds_station["text_normalized"].str.contains("[sS]now")
    ds_station_relevant = ds_station.where(is_relevant, drop=True)
    ds_station_relevant_no_snow = ds_station.where((~is_snow) & is_relevant, drop=True)
    ds_station_irrelevant = ds_station.where(ds_station["relevant_prediction"] == 0, drop=True)

    # All station Tweets: stored rain probability > 0.7 vs. station measurement
    filename_check_prediction_plot = "check_prediction_raining_stations_predprob0p7.pdf"
    report = a2.plotting.analysis.check_prediction(
        truth=ds_station["raining_station"].values,
        prediction=ds_station["prediction_probability_raining"].values > 0.7,
        filename=filename_check_prediction_plot,
    )
    tracker.log_artifact(filename_check_prediction_plot)

    # Only Tweets predicted as relevant
    filename_check_prediction_plot = "check_prediction_raining_stations_predprob0p7_relevant.pdf"
    report = a2.plotting.analysis.check_prediction(
        truth=ds_station_relevant["raining_station"].values,
        prediction=ds_station_relevant["prediction_probability_raining"].values > 0.7,
        filename=filename_check_prediction_plot,
    )
    tracker.log_artifact(filename_check_prediction_plot)

    # Only Tweets predicted as relevant whose normalized text does not match "[sS]now"
    filename_check_prediction_plot = "check_prediction_raining_stations_noSnow_predprob0p7_relevant.pdf"
    report = a2.plotting.analysis.check_prediction(
        truth=ds_station_relevant_no_snow["raining_station"].values,
        prediction=ds_station_relevant_no_snow["prediction_probability_raining"].values > 0.7,
        filename=filename_check_prediction_plot,
    )
    tracker.log_artifact(filename_check_prediction_plot)

    # Only Tweets predicted as irrelevant
    filename_check_prediction_plot = "check_prediction_raining_stations_predprob0p7_irrelevant.pdf"
    report = a2.plotting.analysis.check_prediction(
        truth=ds_station_irrelevant["raining_station"].values,
        prediction=ds_station_irrelevant["prediction_probability_raining"].values > 0.7,
        filename=filename_check_prediction_plot,
    )
    tracker.log_artifact(filename_check_prediction_plot)

    # Predictions of the most recently loaded model (`truth`/`predictions` from the preceding prediction cell)
    filename_check_prediction_plot = "check_prediction.pdf"
    report = a2.plotting.analysis.check_prediction(truth, predictions, filename=filename_check_prediction_plot)
    tracker.log_artifact(filename_check_prediction_plot)
    a2.plotting.analysis.plot_prediction_certainty(truth=truth, prediction_probabilities=prediction_probabilities[:, 1])
    a2.plotting.analysis.plot_roc(truth, prediction_probabilities[:, 1])

    # 2D histogram of relevance probability vs. rain probability, one panel per station label
    filename_check_prediction_plot = "relevant_prediction_2d.pdf"
    a2.plotting.histograms.plot_histogram_2d(
        x="relevant_prediction_probabilities",
        y="prediction_probability_raining",
        ds=ds_station,
        facet_column="raining_station",
        n_bins=[10, 10],
        spacing_x=spacing,
        spacing_y=spacing,
        font_size=10,
        overplot_values=False,
        filename=filename_check_prediction_plot,
    )
    tracker.log_artifact(filename_check_prediction_plot)

# Only the report of the last `check_prediction` call is printed
print(report)

In [None]:
# Print 20 sample Tweets that are predicted as relevant and as raining (probability > 0.7)
# although the station value marks them as not raining
a2.dataset.utils_dataset.print_tweet_sample(
    ds_station.where(
        (ds_station["relevant_prediction"] == 1)
        & (ds_station["raining_station"] == 0)
        & (ds_station["prediction_probability_raining"] > 0.7),
        drop=True,
    ),
    additional_fields=[
        "text_normalized",
        "raining",
        "prediction",
        "raining_station",
        "relevant_prediction_probabilities",
    ],
    n_sample=20,
)

In [None]:
# Store column 1 of the current `prediction_probabilities` as rain probability of the "checkpoint-2500" model
# NOTE(review): relies on `prediction_probabilities` still holding the station predictions -- cell-order dependent
ds_station["prediction_probability_raining_checkpoint2500"] = (["index"], prediction_probabilities[:, 1])

In [None]:
# 2D histogram comparing the rain probabilities of the checkpoint-2500 model (x)
# with the probabilities stored in the dataset (y), one row per station label
spacing = 0.12
a2.plotting.histograms.plot_histogram_2d(
    x="prediction_probability_raining_checkpoint2500",
    y="prediction_probability_raining",
    ds=ds_station,
    # facet_column='raining',
    facet_row="raining_station",
    n_bins=[40, 40],
    # xlim=[0, 3],
    # ylim=[0,1],
    spacing_x=spacing,
    spacing_y=spacing,
    font_size=10,
    filename="/tmp/test.pdf",
)

In [None]:
# Consistency check: Tweets with a station value > 0 that are nevertheless marked as not raining;
# expected to be empty because "raining_station" was derived as `station_tp_mm > 0`
ds_station.where((ds_station["station_tp_mm"] > 0) & (ds_station["raining_station"] == 0), drop=True)

In [None]:
# 2D histogram of relevance probability (x) vs. stored rain probability (y), one column per station label
spacing = 0.12
a2.plotting.histograms.plot_histogram_2d(
    x="relevant_prediction_probabilities",
    y="prediction_probability_raining",
    ds=ds_station,
    facet_column="raining_station",
    n_bins=[10, 10],
    # xlim=[0, 3],
    # ylim=[0,1],
    spacing_x=spacing,
    spacing_y=spacing,
    font_size=10,
    # norm='log',
    overplot_values=False,
    filename="/tmp/test.pdf",
)

In [None]:
# Report for the `truth`/`predictions` currently held in the notebook namespace
# (an unfinished line that re-derived them from `ds_station` was left commented out here)

report = a2.plotting.analysis.check_prediction(truth, predictions)
# a2.plotting.analysis.plot_prediction_certainty(truth=truth, prediction_probabilities=prediction_probabilities)
# a2.plotting.analysis.plot_roc(truth, prediction_probabilities)
print(report)

## Precipitation map analysis

In [None]:
# Load the precipitation dataset that is plotted as maps around the Tweets below
ds_p = xarray.load_dataset(FOLDER_WEATHER_DATA + "ds_prec_era5_uk_2017-2020.nc")

In [None]:
# Build the dataset of test Tweets from `ds_raw`, the test indices and the predictions
# NOTE(review): when run top to bottom, `predictions`/`prediction_probabilities` were last assigned by the
# prediction on `ds_station`, not on the test indices of `ds_raw` -- re-run the matching prediction cell first.
# The following cells also read "raining", "prediction_probability_raining" and "tp_h" from this dataset.
ds_test = a2.training.evaluate_hugging.build_ds_test(
    ds=ds_raw,
    indices_test=indices_test,
    predictions=predictions,
    prediction_probabilities=prediction_probabilities,
)

In [None]:
ds_test.raining.plot.hist()

In [None]:
ds = ds_test
# Tweets labelled as raining and predicted as raining (probability > 0.5)
# whose "tp_h" value lies strictly between 1e-7 and 2e-6
ds_selected = ds.where(
    (ds.raining == 1) & (ds.prediction_probability_raining > 0.5) & (ds.tp_h > 1e-7) & (ds.tp_h < 2e-6),
    drop=True,
)
# Classification report of the whole test dataset for a probability threshold of 0.5
print(
    sklearn.metrics.classification_report(
        ds.raining.values,
        ds.prediction_probability_raining.values > 0.5,
        target_names=["not raining", "raining"],
    )
)
a2.plotting.analysis.plot_prediction_certainty(
    truth=ds["raining"].values,
    prediction_probabilities=ds["prediction_probability_raining"].values,
)

ds_selected = a2.dataset.load_dataset.reset_index_coordinate(ds_selected)
n_selected = ds_selected.index.shape[0]
print(f"found {n_selected} tweets in total")
# Draw at most 40 distinct Tweets at random; the sample size is capped by the number of selected
# Tweets because sampling without replacement raises a ValueError for a sample larger than the population
indices = np.random.choice(
    n_selected,
    min(40, n_selected),
    replace=False,
)
print(indices)
# Plot precipitation maps around the location and time of the sampled Tweets
a2.plotting.weather_maps.plot_precipiation_map(
    ds_p,
    ds_selected.sel(index=indices),
    n_time=2,
    delta_time=1,
    delta_time_units="h",
    delta_longitude=1.2,
    delta_latitude=1.2,
    # filename="precipitation_maps_around_tweets.png",
    add_time_before_plot=pd.Timedelta("30min"),
    print_additional=[
        "bounding_box_area",
        "prediction_probability_raining",
        "tp_h",
    ],
)

In [None]:
# 2D histogram of "tp_h" (x) vs. predicted rain probability (y) of the test Tweets
# NOTE(review): `log=["symlog", False]` presumably selects a symmetric-log x axis and a linear y axis -- confirm
a2.plotting.histograms.plot_histogram_2d(
    ds_test.tp_h.values,
    ds_test.prediction_probability_raining.values,
    log=["symlog", False],
    linear_thresh=1e-9,
    xlim=[-1, 1],
)

In [None]:
# Same histogram restricted to x values between 1e-7 and 1 with 120 bins
# NOTE(review): `log=["log", False]` presumably selects a logarithmic x axis and a linear y axis -- confirm
a2.plotting.histograms.plot_histogram_2d(
    ds_test.tp_h.values,
    ds_test.prediction_probability_raining.values,
    log=["log", False],
    xlim=[1e-7, 1],
    n_bins=120,
)