Notebook details:
* handles the whole dataset or a slice of it (random choice of rows)
* text preprocessing: remove all urls, remove empty chunk rows
* text is divided into chunks to avoid truncation by the BERT model
* no parameter tuning of any of the models
* dimensionality reduction is done with UMAP
* clustering with HDBSCAN
* labeling is done by counting the most common words in each cluster after lemmatization
* results and parameters for each run are saved in folder
--------

In [None]:
from pathlib import Path
import pandas as pd
from pandas import DataFrame
from typing import List
import numpy as np
import torch
from transformers import BertTokenizer, BertTokenizerFast, AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer, util
from umap import UMAP
from hdbscan import HDBSCAN
import spacy
from sklearn.feature_extraction.text import CountVectorizer
import plotly.express as px
from datetime import datetime
import json
import webbrowser

from tools.text.filtering import remove_urls
from preprocessing import read_raw_data_to_df, prepare_df, add_text_chunks_to_df
from cluster_labeling import c_tf_idf, extract_top_n_words_per_topic, extract_cluster_sizes
from visualisation import create_topic_cluster_scatter, custom_scatter_layout
from save_results import create_experiment_folder, save_plot, save_dataframe, save_parameters


### Create df from csv/tsv file

In [None]:
# TODO: remove when running from a .py file; __file__ is defined in .py scripts but not in .ipynb notebooks.
# Until then it is set by hand so that the Path(__file__) lookups in later cells resolve.
__file__ = '/home/dorota/projects/python/investigations/dorota_lia/text_classification/src/UMAP_HBDSCAN.ipynb'

In [None]:
# Raw input file; the same name is reused later for the embeddings dump,
# the experiment folder and the saved run parameters.
file_name = 'vnnforum_small.tsv'
# NOTE(review): where read_raw_data_to_df looks for the file is not visible here — confirm in preprocessing.
df = read_raw_data_to_df(file_name)

In [None]:
# Column holding the raw text, and the number of rows to keep for this run.
# NOTE(review): the notebook header mentions random row selection, URL removal and
# dropping of empty rows; presumably prepare_df is where that happens — confirm.
col_containing_text = 'text'
n_rows = 1000

df = prepare_df(df, col_containing_text, n_rows)

### Save text in chunks short enough for model to handle and add to df

In [None]:
# Name of the column passed to the chunking helper as the id column.
id_col = 'article_id'

# BERT tokenizer handed to the chunking helper.
# NOTE(review): presumably used to measure chunk length in tokens — confirm in preprocessing.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

df = add_text_chunks_to_df(
    df=df,
    tokenizer=tokenizer,
    id_column_name=id_col,
)

### Calculate embeddings, add to df and save in datawarehouse

In [None]:
# Sentence-embedding model used to encode every text chunk.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode all chunks in one call; the result is stored row by row below,
# so it is expected to line up with the rows of df.
chunk_list = df["text_chunk"].tolist()
embeddings = model.encode(
    chunk_list,
    show_progress_bar=True,
    normalize_embeddings=True,
)

# Keep each chunk's vector next to its text.
df['chunk_embedding'] = list(embeddings)

In [None]:
# Dump the chunked text together with its embeddings into a 'datawarehouse'
# folder located one level above this notebook's directory, creating it if needed.
datawarehouse_folder = Path(__file__).parents[1] / 'datawarehouse'
datawarehouse_folder.mkdir(parents=True, exist_ok=True)
# Build the target with pathlib instead of interpolating the folder into a string;
# the resulting file name is unchanged.
df.to_csv(
    datawarehouse_folder / f'{file_name}_chunked_embeddings.tsv',
    sep="\t",
    index=False,
)

---
---

### ALT Read all data into df and extract embeddings array instead of the steps above

In [None]:
# # provide data to run and save results if script run from here
# file_name = 'vnnforum_small.tsv'
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# model = SentenceTransformer('all-MiniLM-L6-v2')

# __file__ = '/home/dorota/projects/python/investigations/dorota_lia/text_classification/src/UMAP_HBDSCAN.ipynb' #TODO remove when working from .py as __file__ is defined in .py but not .ipynb

In [None]:
# datawarehouse_folder = Path(__file__).parents[1] / 'datawarehouse'
# file_name = 'vnnforum_small.tsv'
# df = pd.read_csv(f'{datawarehouse_folder}/{file_name}_chunked_embeddings.tsv', sep="\t")

# chunk_list = df["text_chunk"].to_list()
# embeddings = df['chunk_embedding'].apply(lambda x: np.fromstring(x[1:-1], sep=' ')).tolist()
# embeddings = np.array(embeddings)

---
---

### Create folder for current experiment

In [None]:
# Folder that receives the plot, dataframe and parameters saved later in this run.
# NOTE(review): the naming scheme of the folder is decided inside create_experiment_folder.
base_path = Path(__file__).parents[1]
exp_folder_path = create_experiment_folder(base_path, file_name)

### Reduce embeddings dimensionality with UMAP and cluster with HDBSCAN, add cluster label to df

In [None]:
# UMAP settings for the reduction step that feeds the clustering;
# the same dict is written out with the run parameters at the end.
umap_params = dict(
    n_neighbors=20,
    n_components=8,
    min_dist=0.05,
    metric='cosine',
)

# Project the chunk embeddings down to n_components dimensions.
umap_embeddings = UMAP(**umap_params).fit_transform(embeddings)

In [None]:
# HDBSCAN settings; the same dict is written out with the run parameters at the end.
hbdscan_params = dict(
    min_cluster_size=20,
    metric='euclidean',
    # min_samples=40,
    gen_min_span_tree=True,
    prediction_data=True,
    cluster_selection_method='eom',
)

# Cluster the UMAP-reduced embeddings.
cluster = HDBSCAN(**hbdscan_params).fit(umap_embeddings)

In [None]:
# Attach the cluster assignment of every chunk to its row.
# NOTE(review): per the hdbscan documentation, points treated as noise get label -1 —
# confirm the downstream labeling/plotting handles that group as intended.
df['cluster_label'] = cluster.labels_

### Calculate most frequent words for each cluster

In [None]:
# Build one row per cluster whose 'text_chunk' is all chunks of that cluster
# joined with single spaces.
clustered_docs_df = (
    df
    .groupby(['cluster_label'], as_index=False)
    .agg({'text_chunk': ' '.join})
)

In [None]:
# Lemmatize the joined text of every cluster and store it as a new column.
# Only token lemmas are read, so the dependency parser and NER are disabled:
# they are the memory-hungry parts of the pipeline and their output is never used here.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# Each document is the concatenation of a whole cluster and can exceed spaCy's default
# length limit, which would raise a ValueError. With parser and NER off it is safe
# to raise the limit to fit the longest cluster text.
nlp.max_length = max(
    nlp.max_length,
    max((len(text) for text in clustered_docs_df['text_chunk']), default=0) + 1,
)

# nlp.pipe yields the docs in input order, so the list lines up with the rows.
clustered_docs_df['lemmatized_text_chunk'] = [
    ' '.join(token.lemma_ for token in doc)
    for doc in nlp.pipe(clustered_docs_df['text_chunk'])
]
# NOTE: words like jews, jewish are not lemmatized to jew, which should be further addressed with SpaCy

In [None]:
# Score the words of every cluster (one lemmatized document per cluster) and
# keep the 10 top-ranked words per cluster.
tf_idf, count = c_tf_idf(
    clustered_docs_df['lemmatized_text_chunk'].values,
    m=len(chunk_list),
)
top_n_words = extract_top_n_words_per_topic(tf_idf, count, clustered_docs_df, n=10)

# Cluster sizes, extended with the list of top words for each cluster label.
cluster_words_df = extract_cluster_sizes(df)
cluster_words_df['top_words'] = [
    [word for word, _ in top_n_words[label]]  # drop the score, keep the word
    for label in cluster_words_df['cluster_label']
]
print(cluster_words_df)

# *1) merging cluster_words_df into df here results in scattered clusters in the visualization step

### Prepare for visualization in 2D and gather all in df

In [None]:
# Reuse the parameters of the initial reduction for the 2D projection,
# overriding only n_components (= 2 dimensions for plotting).
umap_params_2D = {**umap_params, 'n_components': 2}

# The 2D projection is fitted on the original embeddings, not on the 8D reduction.
umap_embeddings_2D = UMAP(**umap_params_2D).fit_transform(embeddings)

# Plot coordinates of every chunk.
df['umap_x'], df['umap_y'] = umap_embeddings_2D[:, 0], umap_embeddings_2D[:, 1]

# Add the per-cluster top words; the merge is done only after the coordinates
# have been assigned. *1)
df = df.merge(cluster_words_df, on='cluster_label').drop(columns='cluster_size')

### Visualize topic clusters and save plot

In [None]:
# Scatter plot of the chunks with 'cluster_label' as the category.
fig = create_topic_cluster_scatter(df=df, category='cluster_label')

# Apply the project's custom layout with the plot title and axis titles.
fig = custom_scatter_layout(
    fig=fig,
    plot_title='VNN blogg grouped by cluster',
    x_title='umap_x',
    y_title='umap_y',
)
fig.show()

# Store the figure in this run's experiment folder.
save_plot(fig, exp_folder_path)


In [None]:
# Open the saved figure in the browser.
# NOTE(review): the file name is assumed to match what save_plot writes — confirm in save_results.
cluster_plot_path = exp_folder_path / 'fig_clustered_text_data.html'
# webbrowser.open expects a URL; passing a bare file name is documented as neither
# supported nor portable, so hand it an absolute file:// URI instead.
webbrowser.open(cluster_plot_path.resolve().as_uri(), new=2)  # new=2: open in a new tab if possible

### Save df with all data

In [None]:
# Persist the final dataframe in this run's experiment folder. At this point df holds the
# chunks, their embeddings, cluster labels, 2D coordinates and the top words per cluster.
save_dataframe(df, exp_folder_path)

### Save parameters

In [None]:
# Everything needed to describe this run: input file, tokenizer and embedding model
# (as their string representations) and the UMAP / HDBSCAN settings used above.
params = dict(
    raw_data_file=file_name,
    tokenizer_for_creating_chunks=str(tokenizer),
    embeddings_model=str(model),
    umap_params=umap_params,
    hdbscan_params=hbdscan_params,
)

# Store the parameters next to the other results of this run.
save_parameters(params, exp_folder_path)