# Cluster Tweets based on their embeddings 

- Retrieve embeddings from finetuned DeBERTa model for Tweets
- Cluster Tweets with `sklearn.manifold.TSNE` in both 2D and 3D
- Visualize results
- Results:
    - Clustering provides only limited additional information.
    - Small clusters can be identified by hand with topics not related to rain classification (e.g. "holidays")
    - However, clustering algorithm results vary based on parameters and random initialisation.

In [None]:
# Load IPython's autoreload extension (mode 2) so that edits to imported
# libraries are picked up without restarting the kernel or re-importing.
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import plotly.express
import matplotlib.pyplot as plt
import sklearn.manifold
import gc
import sys
import guppy
import tqdm
import memory_profiler
import torch
import openTSNE
import xarray

import a2.training.training_hugging
import a2.training.evaluate_hugging
import a2.training.dataset_hugging
import a2.dataset
import a2.utils

In [None]:
# Fine-tuned checkpoint; passed as `folder_model` in the cells below.
FOLDER_MODEL_PRETRAINED = "../../models/model_weights/output_rainprediction_simpledeberta/era5/checkpoint-7617/"
# Base model identifier; passed as `folder_tokenizer` in the cells below.
FOLDER_MODEL = "microsoft/deberta-v3-small"
# Alternative (single month) input file, currently disabled:
# FILE_TWEETS = "../../../maelstrom_bootcamp/Applications/AP2/bootcamp2022_data/tweets/tweets_2017_01_era5_normed_filtered.nc"
# NOTE(review): the paths below are absolute and machine-specific.
FOLDER_TWEETS = "/home/kristian/Projects/a2/data/tweets/"
FILE_TWEETS = FOLDER_TWEETS + "tweets_2017_era5_normed_filtered_predicted_simpledeberta.nc"
FOLDER_EMBEDDINGS = "/home/kristian/Projects/a2/data/embeddings/cls_token/"
# Precomputed embeddings, loaded with np.load in the openTSNE section.
FILE_EMBEDDINGS = FOLDER_EMBEDDINGS + "cls_tokenstweets_2017_era5_normed_filtered.nc.npy"
# Sanity check that the tweets file exists on this machine.
!ls $FILE_TWEETS

In [None]:
# Load the tweets dataset and derive a binary label along "index":
# 1 where `tp_h` exceeds the 1e-8 threshold, 0 otherwise.
ds = a2.dataset.load_dataset.load_tweets_dataset(FILE_TWEETS)
ds["raining"] = (["index"], np.array(ds.tp_h.values > 1e-8, dtype=int))  # above numerical noise

## Get cls token embeddings

In [None]:
def get_cls_representation(
    ds: xarray.Dataset,
    folder_tokenizer: str,
    folder_model: str,
    key_label: str = "raining",
    key_inputs: str = "text_normalized",
):
    """Return the last-layer embedding of the first (CLS) token for every text in ``ds``.

    The model is taken from a trainer built by ``HuggingFaceTrainerClass`` on a
    stratified 80/20 split of ``ds``. It is put into eval mode and evaluated
    without gradients on consecutive chunks of ``ds[key_inputs]``.

    Parameters
    ----------
    ds:
        Dataset holding the texts under ``key_inputs`` and the labels under
        ``key_label``. The number of texts is read from ``ds["index"]``.
    folder_tokenizer:
        Location/name handed to ``DatasetHuggingFace`` to obtain the tokenizer.
    folder_model:
        Location/name handed to ``HuggingFaceTrainerClass`` to obtain the model.
    key_label:
        Variable used to stratify the split and as the label of the built dataset.
    key_inputs:
        Variable containing the texts to embed.

    Returns
    -------
    numpy.ndarray
        One row per text, in dataset order; each row is the position-0 vector
        of the model's final hidden state.
    """
    (
        indices_train,
        indices_validate,
    ) = a2.training.training_hugging.split_training_set(ds, key_stratify=key_label, test_size=0.2)

    dataset_spawner = a2.training.dataset_hugging.DatasetHuggingFace(folder_tokenizer)
    dataset = dataset_spawner.build(
        ds,
        indices_train=indices_train,
        indices_validate=indices_validate,
        train=False,
        key_inputs=key_inputs,
        key_label=key_label,
    )

    trainer_spawner = a2.training.training_hugging.HuggingFaceTrainerClass(folder_model, num_labels=2)
    trainer = trainer_spawner.get_trainer(
        dataset,
        tokenizer=dataset_spawner.tokenizer,
        evaluate=True,
        mantik=False,
        fp16=False,
    )

    model = trainer.model.eval()
    tokenizer = dataset_spawner.tokenizer

    # Convert the text column to a Python list once instead of once per batch.
    texts = ds[key_inputs].values.tolist()

    def _embed(batch_texts):
        """Tokenize ``batch_texts`` and return their first-token embeddings as an array."""
        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True)
        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=True)
        first_token = outputs.hidden_states[-1][:, 0, :]
        # Copy the slice: a plain view (and a NumPy array made from it) would
        # keep the whole hidden-state tensor of this batch alive.
        return first_token.detach().clone().numpy()

    n_texts = ds["index"].shape[0]
    # Number of chunks (not texts per chunk): chunks hold roughly 1000-2000
    # texts, and there is always at least one chunk.
    n_batches = max(n_texts // 1000, 1)

    cls_representations = []
    for indices_batch in tqdm.tqdm(np.array_split(np.arange(n_texts), n_batches)):
        i_start, i_end = indices_batch[0], indices_batch[-1]
        cls_representations.append(_embed(texts[i_start : i_end + 1]))
        # Release the per-batch tensors before the next forward pass.
        gc.collect()
    return np.concatenate(cls_representations)

In [None]:
# Scratch check of the batching scheme used in get_cls_representation:
# slicing by the first/last value of each chunk reproduces the chunk itself
# (the chunks of np.arange(100) are contiguous runs).
ar = np.arange(100)
for x in np.array_split(ar, 9):
    print(ar[x[0] : x[-1] + 1])

In [None]:
# Edge case: asking for more sections (100) than elements (20) yields empty
# trailing chunks, so the last chunk is empty and `x[0]` raises IndexError.
x = np.array_split(np.arange(20), 100)[-1]
x[0], x[-1]

In [None]:
n_sample = 10000
# Take exactly the first `n_sample` rows by position. Label-based
# `ds.sel(index=slice(n_sample))` includes the end label and can therefore
# return n_sample + 1 rows (or a different subset if labels are not 0..N-1).
ds_test = ds.isel(index=slice(n_sample))
# Every row of the subset is treated as a validation row.
indices_validate = np.arange(ds_test.index.shape[0])

# Run the fine-tuned checkpoint (tokenizer from the base model) on the subset.
(truth, predictions, prediction_probabilities,) = a2.training.evaluate_hugging.make_predictions_loaded_model(
    ds_test,
    indices_validate=indices_validate,
    folder_model=FOLDER_MODEL_PRETRAINED,
    folder_tokenizer=FOLDER_MODEL,
    key_inputs="text_normalized",
    fp16=False,
)
# Rebuild ds_test from the predictions; later cells read
# `prediction_probability_raining` from it.
ds_test = a2.training.evaluate_hugging.build_ds_test(ds_test, indices_validate, predictions, prediction_probabilities)

In [None]:
# Embed the texts of the test subset: model from the fine-tuned checkpoint,
# tokenizer from the base model name.
cls_tokens = get_cls_representation(
    ds_test,
    folder_tokenizer=FOLDER_MODEL,
    folder_model=FOLDER_MODEL_PRETRAINED,
    key_label="raining",
    key_inputs="text_normalized",
)

In [None]:
# Display the shape of the embedding array (one row per embedded text).
cls_tokens.shape

## Clustering

In [None]:
# Persist the embeddings and the matching test dataset in the current working
# directory; both file names carry `n_sample`.
np.save(f"ds_test_{n_sample}_tokens.npy", cls_tokens)
ds_test.to_netcdf(f"ds_test_{n_sample}.nc")

In [None]:
# t-SNE configuration for the 2D projection (n_components is not set here).
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` —
# confirm against the installed version.
tsne = sklearn.manifold.TSNE(init="pca", learning_rate="auto", n_iter=5000, perplexity=200, n_jobs=14)

In [None]:
# Project the embeddings of the test subset; no random_state is set on `tsne`,
# so repeated runs are not guaranteed to give identical projections.
projections = tsne.fit_transform(cls_tokens)

In [None]:
def get_values(ds: xarray.Dataset, key: str, end: int = n_sample + 1):
    """Return the leading ``end`` entries of the values stored under ``ds[key]``.

    The default of ``end`` is computed once, when this definition is executed,
    from the global ``n_sample`` at that moment.
    """
    values = ds[key].values
    leading = slice(None, end)
    return values[leading]


# Scatter plot of the 2D projection, one facet per value of `raining`.
# x/y come from `projections` by position, so the rows of the data frame must
# be in the same order as the rows of `cls_tokens`.
hover_keys = ["text_normalized", "raining", "prediction_probability_raining"]
fig = plotly.express.scatter(
    data_frame=ds_test.to_dataframe(),
    x=projections.T[0],
    y=projections.T[1],
    # Colouring is disabled: `difference_prediction` is only added to ds_test
    # in the next cell, so that cell has to run first to enable this.
    # color="difference_prediction",
    hover_data=hover_keys,
    facet_col="raining",
    color_continuous_scale="Aggrnyl",
)
fig.show()

In [None]:
# Absolute difference between the predicted probability of rain and the binary
# `raining` label, stored along "index"; used to colour the 3D plot.
ds_test["difference_prediction"] = (
    ["index"],
    np.abs(ds_test.prediction_probability_raining.values - ds_test.raining.values),
)

## 2017 Tweets
(downloaded from juwels)

In [None]:
# Reload the full dataset (same file and same `raining` definition as at the
# top of the notebook), replacing any earlier `ds`.
ds = a2.dataset.load_dataset.load_tweets_dataset(FILE_TWEETS)
ds["raining"] = (["index"], np.array(ds.tp_h.values > 1e-8, dtype=int))

In [None]:
# Load precomputed projections from disk; this overwrites the `projections`
# computed for the test subset above.
# NOTE(review): absolute, machine-specific path; the file name suggests
# random init and perplexity 50 — confirm how the file was produced.
projections = np.load(
    "/home/kristian/Projects/a2/data/clustering/projections_initrandom_perplexity50tweets_2017_era5_normed_filtered.nc.npy"
)

In [None]:
# Plot a random subset of the precomputed projections, one facet per value of
# `raining`, coloured by the predicted probability of rain.
hover_keys = ["text_normalized", "raining"]
mask = a2.utils.utils.get_random_indices(10000, ds.index.shape[0])
fig = plotly.express.scatter(
    # `mask` indexes `projections` by position, so the dataset rows are picked
    # by position as well (isel). A label-based `sel` would only line up with
    # the points if the index labels happen to be 0..N-1.
    data_frame=ds.isel(index=mask).to_dataframe(),
    x=projections[mask].T[0],
    y=projections[mask].T[1],
    color="prediction_probability_raining",
    hover_data=hover_keys,
    facet_col="raining",
    color_continuous_scale="Aggrnyl",
)
fig.show()

## Clustering in 3D

In [None]:
# t-SNE configuration for a 3-component projection; perplexity, iteration
# count and random_state are left at the library defaults.
tsne_3d = sklearn.manifold.TSNE(n_components=3, init="pca", learning_rate="auto")

In [None]:
# Project whatever `cls_tokens` currently holds; the plot below pairs the
# result with ds_test, so `cls_tokens` must be the test-subset embeddings.
projections_3d = tsne_3d.fit_transform(cls_tokens)

In [None]:
def get_values(ds: xarray.Dataset, key: str, end: int = n_sample + 1):
    """Return the leading ``end`` entries of the values stored under ``ds[key]``.

    The default of ``end`` is computed once, when this definition is executed,
    from the global ``n_sample`` at that moment.
    """
    values = ds[key].values
    leading = slice(None, end)
    return values[leading]


# 3D scatter of the three t-SNE components, coloured by
# `difference_prediction`, which must already have been added to ds_test.
# x/y/z come from `projections_3d` by position, so the data-frame rows must be
# in the same order as the embedded texts.
hover_keys = ["text_normalized", "raining", "prediction_probability_raining"]
fig = plotly.express.scatter_3d(
    data_frame=ds_test.to_dataframe(),
    x=projections_3d.T[0],
    y=projections_3d.T[1],
    z=projections_3d.T[2],
    color="difference_prediction",
    hover_data=hover_keys,
    size_max=1,
)
fig.show()

In [None]:
# Display (without assigning) the dataset restricted to entries for which the
# helper does not flag "geo.coordinates.coordinates" as NaN; `drop=True`
# removes the entries that fail the condition.
ds.where(
    ~a2.dataset.utils_dataset.is_nan(ds, "geo.coordinates.coordinates"),
    drop=True,
)

In [None]:
# Inspect the `latitude_rounded` variable of the dataset.
ds.latitude_rounded

## Using openTSNE
Alternative t-SNE implementation (`openTSNE` instead of `sklearn.manifold.TSNE`)

In [None]:
# Load precomputed embeddings from FILE_EMBEDDINGS; this replaces the
# test-subset `cls_tokens` computed earlier in the notebook.
cls_tokens = np.load(FILE_EMBEDDINGS)

In [None]:
# Parameters of the openTSNE run; they are also used in the exported file name.
n_sample = 10000
exaggeration = 1
perplexity = 50
# Random subset of rows, reused for both the embeddings and the dataset.
# presumably positions in [0, ds.index.shape[0]) — verify against get_random_indices
mask = a2.utils.utils.get_random_indices(n_sample, ds.index.shape[0])

In [None]:
# Fit openTSNE on the sampled embeddings with a fixed seed. This rebinds both
# `tsne` and `projections` from the earlier sections.
tsne = openTSNE.TSNE(
    n_jobs=14,
    random_state=42,
    verbose=True,
    perplexity=perplexity,
    exaggeration=exaggeration,
)
# The result is indexed like a two-column array in the plotting cell.
projections = tsne.fit(cls_tokens[mask])

In [None]:
# Plot the openTSNE projection of the sampled tweets, one facet per value of
# `raining`, coloured by the predicted probability of rain, then export it.
hover_keys = ["text_normalized", "raining"]
fig = plotly.express.scatter(
    # The embeddings were sampled by position (`cls_tokens[mask]`), so the
    # dataset rows are picked by position as well (isel). A label-based `sel`
    # would only line up if the index labels happen to be 0..N-1.
    data_frame=ds.isel(index=mask).to_dataframe(),
    x=projections.T[0],
    y=projections.T[1],
    color="prediction_probability_raining",
    hover_data=hover_keys,
    facet_col="raining",
    color_continuous_scale="Aggrnyl",
    width=1000,
    height=600,
)
fig.show()
# The file name records the sample size and the t-SNE parameters.
fig.write_image(f"tsne_{n_sample}_perplex{perplexity}_exaggeration{exaggeration}.pdf")