# Imports

In [1]:
# regex
import re

# pandas + numpy
import numpy as np
import pandas as pd

# setting pandas options
pd.set_option('display.max_colwidth', 200)


# storing and loading models
import pickle

# to set types for functions
from typing import Tuple

# Plotting
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots


# gpu debug
import torch

# setting device to use GPU for NLP backend if you have GPU available
device = "cuda" if torch.cuda.is_available() else "cpu"


# SBERT
from sentence_transformers import SentenceTransformer

# UMAP
from umap import UMAP

#HDBSCAN
from hdbscan import HDBSCAN

# topic finding
from sklearn.feature_extraction.text import TfidfVectorizer

# Loading model from pickle if possible, to avoid downloading it again
# Reuse a locally pickled model when one exists, to avoid downloading it again.
# NOTE(review): unpickling runs arbitrary code from the file; only keep
# model-*.pkl files that this notebook wrote itself.
model_path = f'model-{device}.pkl'

try:
    # the context manager closes the handle even when unpickling fails
    with open(model_path, 'rb') as model_file:
        model = pickle.load(model_file)

    model_load = True

# `Exception` (instead of a bare except) keeps the "any load failure -> rebuild"
# fallback while letting KeyboardInterrupt / SystemExit propagate.
except Exception:
    model = SentenceTransformer('all-mpnet-base-v2', device=device)

    # cache the freshly built model for the next run
    with open(model_path, 'wb') as model_file:
        pickle.dump(model, model_file)

    model_load = False

print(f"""
GPUs detected:          {torch.cuda.device_count()}
Using GPU:              {torch.cuda.is_available()}
Device:                 {device}
Got model from pickle:  {model_load}
""")

  from .autonotebook import tqdm as notebook_tqdm



GPUs detected:          0
Using GPU:              False
Device:                 cpu
Got model from pickle:  True



# Function

In [2]:
def tfidf_most_relevant_word(input: list, num_words=5) -> list:
  """
  Find the highest-scoring TF-IDF terms for each cluster.

  A separate TfidfVectorizer (English stop words removed) is fitted on the
  titles of every cluster; terms are ranked by their TF-IDF weight summed over
  all titles of that cluster.

  NOTE: this function is defined a second time further down in the notebook
  (section "Topic Modeling"); when all cells run in order, that later
  definition is the one in effect.

  Args:
      input (list): One entry per cluster id, each entry being the collection
          of title strings that belong to that cluster.
      num_words (int, optional): Maximum number of terms per cluster. Defaults to 5.

  Returns:
      list: One array of terms per entry of `input`, ordered from highest to
          lowest summed weight. An array is shorter than `num_words` when the
          cluster has fewer distinct terms, and empty when the vectorizer
          finds no usable term at all.
  """

  most_relevant_words = []

  for corpus in input:

    vectorizer = TfidfVectorizer(stop_words='english')

    try:
      X = vectorizer.fit_transform(corpus)
    except ValueError as err:
      # scikit-learn refuses to fit when no term survives tokenisation and
      # stop-word removal. Such a cluster simply has no topic words and must
      # not abort the whole run. Any other ValueError is still raised.
      if "empty vocabulary" not in str(err):
        raise
      most_relevant_words.append(np.array([], dtype=object))
      continue

    # column indices sorted by summed TF-IDF weight, largest first
    importance = np.argsort(np.asarray(X.sum(axis=0)).ravel())[::-1]
    tfidf_feature_names = np.array(vectorizer.get_feature_names_out())
    most_relevant_words.append(tfidf_feature_names[importance[:num_words]])

  return most_relevant_words

## Cleaning

In [3]:
def string_cleaner(input: str) -> str:
    """
    Normalise a string: lower-case it and strip punctuation.

    Every character that is neither a word character (letter, digit,
    underscore) nor whitespace is dropped. Whitespace itself is left
    untouched, so a symbol that stood between two spaces leaves both
    spaces behind.

    Args:
        input (str): Raw string.

    Returns:
        str: Lower-cased string without punctuation.
    """
    lowered = input.lower()

    # keep only word characters and whitespace
    return re.sub(r'[^\w\s]', '', lowered)

## Topic Modeling

In [4]:
def tfidf_most_relevant_word(input: list, num_words=5) -> list:
  """
  Find the highest-scoring TF-IDF terms for each cluster.

  A separate TfidfVectorizer (English stop words removed) is fitted on the
  titles of every cluster; terms are ranked by their TF-IDF weight summed over
  all titles of that cluster.

  NOTE: a function with the same name is also defined earlier in the notebook
  (section "Function"); this later definition replaces it when the cells run
  in order.

  Args:
      input (list): One entry per cluster id, each entry being the collection
          of title strings that belong to that cluster.
      num_words (int, optional): Maximum number of terms per cluster. Defaults to 5.

  Returns:
      list: One array of terms per entry of `input`, ordered from highest to
          lowest summed weight. An array is shorter than `num_words` when the
          cluster has fewer distinct terms, and empty when the vectorizer
          finds no usable term at all.
  """

  most_relevant_words = []

  for corpus in input:

    vectorizer = TfidfVectorizer(stop_words='english')

    try:
      X = vectorizer.fit_transform(corpus)
    except ValueError as err:
      # scikit-learn refuses to fit when no term survives tokenisation and
      # stop-word removal. Such a cluster simply has no topic words and must
      # not abort the whole run. Any other ValueError is still raised.
      if "empty vocabulary" not in str(err):
        raise
      most_relevant_words.append(np.array([], dtype=object))
      continue

    # column indices sorted by summed TF-IDF weight, largest first
    importance = np.argsort(np.asarray(X.sum(axis=0)).ravel())[::-1]
    tfidf_feature_names = np.array(vectorizer.get_feature_names_out())
    most_relevant_words.append(tfidf_feature_names[importance[:num_words]])

  return most_relevant_words



def topic_by_clusterId(result: pd.DataFrame) -> dict:
  """
  Build a lookup from cluster id to that cluster's topic words.

  Args:
      result (pd.DataFrame): Frame that has at least the columns "titles" and
          "cluster_label".

  Returns:
      dict: Cluster id -> topic words, as produced by tfidf_most_relevant_word
          for the titles of that cluster.
  """

  # one row per cluster id (the index), holding the list of its titles
  titles_per_cluster = (
      result[["titles", "cluster_label"]]
      .groupby("cluster_label")
      .agg(list)
  )

  # topics come back in the same order as the grouped rows
  topics = tfidf_most_relevant_word(titles_per_cluster["titles"])

  return dict(zip(titles_per_cluster.index, topics))

## Plotting Functions

In [5]:
# when you actually cast the type here, it works with how pandas casts types and you don't have to worry about copying series
def result_df_maker(embeddings: np.ndarray, cluster_labels: np.ndarray, titles: np.ndarray) -> pd.DataFrame:
  """
  Assemble the plotting frame: coordinates, title, cluster label and topic string.

  Args:
      embeddings (np.ndarray): 2D array with two columns, used as x and y.
      cluster_labels (np.ndarray): Cluster label of every row.
      titles (np.ndarray): Title of every row.

  Returns:
      pd.DataFrame: Columns "x", "y", "titles", "cluster_label" and "topics",
          where "topics" is the space-joined topic words of the row's cluster.
  """
  result = pd.DataFrame(embeddings, columns=['x', 'y'])
  result["titles"] = titles
  result["cluster_label"] = cluster_labels

  # cluster id -> topic words
  topic_dict = topic_by_clusterId(result)

  # look up the words of each row's cluster and flatten them into one string
  result["topics"] = result["cluster_label"].apply(
      lambda label: " ".join(topic_dict[label])
  )

  return result

def result_splitter(result: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
  """
  Split the result frame into clustered rows and outlier rows.

  Rows whose "cluster_label" is -1 are treated as outliers; every other row
  counts as clustered. Row order and index values are kept.

  Args:
      result (pd.DataFrame): Frame with a "cluster_label" column.

  Returns:
      Tuple[pd.DataFrame, pd.DataFrame]: (clustered, outliers), both with the
          same columns as `result`.
  """

  is_outlier = result.cluster_label == -1

  clustered = result.loc[result.cluster_label != -1, :]
  outliers = result.loc[is_outlier, :]
  return clustered, outliers

# the cavalry is not here, but it's fine! Why? I am here!
def result_tracer(clustered: pd.DataFrame, outliers: pd.DataFrame) -> Tuple[go.Scattergl, go.Scattergl]:
  """
  Build one scatter trace for the clustered points and one for the outliers.

  Args:
      clustered (pd.DataFrame): Rows to colour by cluster label; needs the
          columns "x", "y", "cluster_label" and "topics".
      outliers (pd.DataFrame): Rows drawn in grey; needs the columns "x" and "y".

  Returns:
      Tuple[go.Scattergl, go.Scattergl]: (clustered trace, outlier trace).
  """

  # per-point hover payload: column 0 = topic string, column 1 = cluster id
  hover_payload = np.column_stack([clustered.topics, clustered.cluster_label])

  # clustered points are coloured by their label on the Rainbow scale
  cluster_marker = {
    "size": 2,
    "color": clustered.cluster_label,
    "colorscale": "Rainbow",
  }

  # outliers are smaller and uniformly grey
  outlier_marker = {"size": 1, "color": "grey"}

  trace_cluster = go.Scattergl(
    x=clustered.x,
    y=clustered.y,
    mode="markers",
    name="Clustered",
    marker=cluster_marker,
    # hover shows the topics and the id of the point's cluster
    hovertemplate="<b>Topics:</b> %{customdata[0]} <br><b>Cluster Id:</b> %{customdata[1]}<extra></extra>",
    customdata=hover_payload,
  )

  trace_outlier = go.Scattergl(
    x=outliers.x,
    y=outliers.y,
    mode="markers",
    name="Outliers",
    marker=outlier_marker,
    # outliers carry no per-point data, only a fixed label
    hovertemplate="Outlier<extra></extra>",
  )

  return trace_cluster, trace_outlier

def result_tracer_wrapper(uembs: np.ndarray, cluster_labels: np.ndarray, titles: np.ndarray) -> Tuple[go.Scattergl, go.Scattergl]:
  """
  Go from raw arrays to the two scatter traces in one call.

  Chains result_df_maker -> result_splitter -> result_tracer.

  Args:
      uembs (np.ndarray): 2D array of embeddings.
      cluster_labels (np.ndarray): array of cluster labels.
      titles (np.ndarray): array of titles.

  Returns:
      Tuple[go.Scattergl, go.Scattergl]: (clustered trace, outlier trace).
  """

  frame = result_df_maker(uembs, cluster_labels, titles)
  clustered_rows, outlier_rows = result_splitter(frame)
  return result_tracer(clustered_rows, outlier_rows)

In [6]:
def subplotter(trace_nested_list: list, titles: list, base_size=1000) -> go.Figure:
    """
    Function to make a figure with a grid of subplots, one per (row, column) cell.

    Args:
        trace_nested_list (list): list of rows; each row is a list of columns;
            each column is a list of traces drawn in that cell. The number of
            columns is taken from the first row.
        titles (list): Titles for the subplots, passed to make_subplots as
            subplot_titles.
        base_size (int, optional): Width and height in pixels of a single
            subplot. Defaults to 1000.

    Returns:
        go.Figure: Figure with subplots, axes hidden.

    Raises:
        IndexError: If trace_nested_list is empty (the first row is read to
            determine the column count).
    """

    row_count = len(trace_nested_list)
    col_count = len(trace_nested_list[0])

    fig = make_subplots(
        rows=row_count,
        cols=col_count,
        subplot_titles=(titles),
        vertical_spacing=0.02,
        horizontal_spacing=0.02
    )

    for i, row in enumerate(trace_nested_list):
        for j, col in enumerate(row):

            # adding both outliers and clustered
            for trace in col:
                # plotly rows/cols are 1-based; use the cell's own column
                # (previously every trace was forced into column 1)
                fig.add_trace(trace, row=i+1, col=j+1)

    # figure settings
    fig.update_xaxes(visible=False)
    fig.update_yaxes(visible=False)

    # overall size scales with the grid dimensions
    fig.update_layout(width=base_size*col_count, height=base_size*row_count, plot_bgcolor='rgba(250,250,250,1)')

    return fig

## Saving / Showing Plots

In [7]:
def fig_show_save(fig: go.Figure, filename: str, show=True):
  """
  Function to save a figure under figures/ and optionally show it.

  Args:
      fig (go.Figure): fig to be saved and shown
      filename (str): filename to save the figure, without extension
      show (bool, optional): Option to disable showing of figure (in case too big for notebook). Defaults to True.
  """
  # local import: pathlib is not among the notebook's top-level imports
  from pathlib import Path

  # the writers below do not create the target directory themselves, so make
  # sure it exists instead of failing on a fresh checkout
  Path("figures").mkdir(parents=True, exist_ok=True)

  # writing both interactive .html and static image .png
  fig.write_html(f"figures/{filename}.html")
  fig.write_image(f"figures/{filename}.png")

  if show:
    fig.show()

# Data Part

In [10]:
# full table as stored on disk
df_whole = pd.read_csv("data/USvideos.csv")

# only the title column is used; copy so later column assignments leave df_whole untouched
df = df_whole[["title"]].copy()

# on CPU continue with a random 5% of the rows
# (no seed is passed, so the selected rows differ between runs)
if device == "cpu":
    df = df.sample(frac=0.05)

df

Unnamed: 0,title
37281,Officials Warn Largest Hawaii Volcano Eruption Is ‘Imminent’ | NBC Nightly News
30905,ZAYN - Let Me (Official Video)
8197,Amazon Echo Spot review
1425,Your Amazing Molecular Machines
19314,Talking about my ghost with the PSYCHIC TWINS
...,...
33101,Deep Sea Fishing Battle | Dude Perfect
25888,THE KATE BOSWORTH MAKEUP LOOK TUTORIAL
39221,Erika Costell - Chitty Bang ft. Jake Paul (Official Music Video)
33900,I Learned to Solve the 2x2x2 Rubik's Cube Blindfolded


## Cleaning

In [12]:
# lower-case every title and strip punctuation (see string_cleaner)
df["title_clean"] = df["title"].apply(string_cleaner)

# preview the cleaned titles next to the originals
df.head(5)

Unnamed: 0,title,title_clean
37281,Officials Warn Largest Hawaii Volcano Eruption Is ‘Imminent’ | NBC Nightly News,officials warn largest hawaii volcano eruption is imminent nbc nightly news
30905,ZAYN - Let Me (Official Video),zayn let me official video
8197,Amazon Echo Spot review,amazon echo spot review
1425,Your Amazing Molecular Machines,your amazing molecular machines
19314,Talking about my ghost with the PSYCHIC TWINS,talking about my ghost with the psychic twins


In [13]:
# encode every cleaned title with the sentence-transformer model loaded above
embs = model.encode(df["title_clean"].to_numpy())

In [16]:
# sanity check of the encoder output: shape, container type, dtype and the first vector
print(f"""
{embs.shape}
{type(embs)}
{embs.dtype}
{embs[0]}
""")


(2047, 768)
<class 'numpy.ndarray'>
float32
[ 2.62138601e-02  3.64924036e-02 -2.11461224e-02 -7.44758500e-03
  5.11637814e-02  5.43605089e-02 -4.24651336e-03  3.88900191e-02
  3.64230536e-02  1.02379927e-02 -1.77153351e-03 -4.13685180e-02
 -3.77502292e-02  6.76589161e-02 -2.40567178e-02  5.13126608e-03
  1.11519378e-02 -3.30180004e-02  5.34363389e-02  2.41408148e-03
 -7.80649390e-03  4.33169166e-03  1.24374740e-02  4.26485017e-02
 -1.29683651e-02 -4.40440439e-02  1.17934709e-02  1.73471938e-03
  4.06646691e-02 -3.36070582e-02 -8.20032973e-03 -2.14481372e-02
  2.56753005e-02 -1.61409546e-02  1.52730217e-06 -3.18372771e-02
 -5.95334452e-03 -6.47358643e-03 -1.50202645e-03 -1.16389664e-03
 -1.13387786e-01 -8.85904580e-02 -1.86826326e-02  3.65023017e-02
 -2.29447316e-02 -4.46038023e-02  2.30399854e-02 -2.92045921e-02
  1.14875436e-02  4.20489125e-02 -1.51303345e-02 -6.18694983e-02
  9.49067771e-02 -9.01160575e-03  1.22749005e-02 -1.19149568e-03
  4.16443199e-02 -3.20385993e-02 -8.66920203e

In [18]:
# list() splits the 2D array into one 1D vector per row, so each cell holds that row's embedding
df["embs"] = list(embs)

In [21]:
# preview the frame with the new "embs" column
df.head(3)

Unnamed: 0,title,title_clean,embs
37281,Officials Warn Largest Hawaii Volcano Eruption Is ‘Imminent’ | NBC Nightly News,officials warn largest hawaii volcano eruption is imminent nbc nightly news,"[0.02621386, 0.036492404, -0.021146122, -0.007447585, 0.05116378, 0.05436051, -0.0042465134, 0.03889002, 0.036423054, 0.010237993, -0.0017715335, -0.041368518, -0.03775023, 0.067658916, -0.0240567..."
30905,ZAYN - Let Me (Official Video),zayn let me official video,"[0.018647306, 0.030979455, -0.0071687493, -0.016334603, -0.0033036354, 0.0038727506, -0.04967529, 0.0010558967, -0.07953582, 0.016318323, 0.031500656, -0.0063248854, 0.019711064, 0.1052882, 0.0028..."
8197,Amazon Echo Spot review,amazon echo spot review,"[-0.024403362, -0.09118484, -0.008784633, -0.008044011, -0.011623585, -0.032357514, 0.06294571, -0.030349351, -0.047861796, -0.026293226, -0.038138174, -0.0025045017, -0.00791396, 0.077410355, 0.0..."


## UMAP

In [20]:
# The sentence embeddings have too many dimensions,
# so reduce them to 2 dimensions with UMAP.
# NOTE(review): no random_state is passed, so the layout presumably differs between runs — confirm
uembs = UMAP(n_neighbors=20, min_dist=0.1).fit_transform(embs)

In [22]:
# sanity check of the UMAP output: shape, container type, dtype and the first point
print(f"""
{uembs.shape}
{type(uembs)}
{uembs.dtype}
{uembs[0]}
""")


(2047, 2)
<class 'numpy.ndarray'>
float32
[5.7860503 0.756128 ]



In [23]:
# one reduced coordinate vector per row, stored next to the full embedding
df["uembs"] = list(uembs)

# preview the frame with the new "uembs" column
df.head(3)

Unnamed: 0,title,title_clean,embs,uembs
37281,Officials Warn Largest Hawaii Volcano Eruption Is ‘Imminent’ | NBC Nightly News,officials warn largest hawaii volcano eruption is imminent nbc nightly news,"[0.02621386, 0.036492404, -0.021146122, -0.007447585, 0.05116378, 0.05436051, -0.0042465134, 0.03889002, 0.036423054, 0.010237993, -0.0017715335, -0.041368518, -0.03775023, 0.067658916, -0.0240567...","[5.7860503, 0.756128]"
30905,ZAYN - Let Me (Official Video),zayn let me official video,"[0.018647306, 0.030979455, -0.0071687493, -0.016334603, -0.0033036354, 0.0038727506, -0.04967529, 0.0010558967, -0.07953582, 0.016318323, 0.031500656, -0.0063248854, 0.019711064, 0.1052882, 0.0028...","[13.547275, -1.8228431]"
8197,Amazon Echo Spot review,amazon echo spot review,"[-0.024403362, -0.09118484, -0.008784633, -0.008044011, -0.011623585, -0.032357514, 0.06294571, -0.030349351, -0.047861796, -0.026293226, -0.038138174, -0.0025045017, -0.00791396, 0.077410355, 0.0...","[6.421835, -0.31812882]"


In [24]:
# scatter of the reduced embeddings: first UMAP column on x, second on y
fig = px.scatter(x=uembs[:,0], y=uembs[:,1])

# fixed 800x800 canvas, marker size 2
fig.update_layout(width=800, height=800)
fig.update_traces(marker=dict(size=2))

# plotting to show how the embeddings look when only dimensionality reduction is used
fig_show_save(fig, "umap-scatter")

In [28]:
# cluster the 2D UMAP coordinates
clusters_2d = HDBSCAN(min_cluster_size=10, cluster_selection_method="leaf").fit(uembs)

# Label -1 marks outliers and is not a cluster. Remove it explicitly instead
# of subtracting one, which undercounts when no row was labelled -1.
cluster_ids = set(clusters_2d.labels_.tolist()) - {-1}
outlier_count = clusters_2d.labels_.tolist().count(-1)

print(f"""
    Number of clusters: {len(cluster_ids)}
    Number of rows as outliers: {outlier_count}
""")


    Number of clusters: 48
    Number of rows as outliers: 1040



In [29]:
# attach each row's HDBSCAN label (-1 = outlier) to the frame
df["cluster_label"] = clusters_2d.labels_

In [31]:
# preview the frame with the new "cluster_label" column
df.head(3)

Unnamed: 0,title,title_clean,embs,uembs,cluster_label
37281,Officials Warn Largest Hawaii Volcano Eruption Is ‘Imminent’ | NBC Nightly News,officials warn largest hawaii volcano eruption is imminent nbc nightly news,"[0.02621386, 0.036492404, -0.021146122, -0.007447585, 0.05116378, 0.05436051, -0.0042465134, 0.03889002, 0.036423054, 0.010237993, -0.0017715335, -0.041368518, -0.03775023, 0.067658916, -0.0240567...","[5.7860503, 0.756128]",0
30905,ZAYN - Let Me (Official Video),zayn let me official video,"[0.018647306, 0.030979455, -0.0071687493, -0.016334603, -0.0033036354, 0.0038727506, -0.04967529, 0.0010558967, -0.07953582, 0.016318323, 0.031500656, -0.0063248854, 0.019711064, 0.1052882, 0.0028...","[13.547275, -1.8228431]",47
8197,Amazon Echo Spot review,amazon echo spot review,"[-0.024403362, -0.09118484, -0.008784633, -0.008044011, -0.011623585, -0.032357514, 0.06294571, -0.030349351, -0.047861796, -0.026293226, -0.038138174, -0.0025045017, -0.00791396, 0.077410355, 0.0...","[6.421835, -0.31812882]",-1


In [32]:
# build the clustered / outlier traces from the 2D embeddings, their labels and the cleaned titles
trace_cluster_2d, trace_outlier_2d = result_tracer_wrapper(uembs, clusters_2d.labels_, df["title_clean"].to_numpy())


# subplotter expects rows -> columns -> traces; this is the single cell (row 1, column 1)
col11 = [trace_cluster_2d, trace_outlier_2d]


# the only row, holding that one column
row1 = [col11]


# full grid: one row
trace_list = [row1]

# one title for the single subplot
fig = subplotter(trace_list, ["Topics by HDBSCAN Cluster", ])

# saved as figures/topics-by-hdbscan-clusters.html and .png, then shown
fig_show_save(fig, "topics-by-hdbscan-clusters")