In [None]:
!pip install datamapplot -q
!pip install scikit-learn -q
!pip install pyarrow -q
!pip install sentence_transformers -q
!pip install numpy==1.26.4 -q
!pip install pandas -q
!pip install --upgrade scipy
!pip install nltk -q

In [None]:
import numpy as np
import pandas as pd
import datamapplot
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.colors as mcolors
import re
import json
from nltk.corpus import stopwords
import nltk
from sentence_transformers import SentenceTransformer
import time

### Step 1: Generating Sentence Embeddings with Sentence Transformers

This script uses the `SentenceTransformers` library to generate sentence embeddings from any type of text. Here, we apply it to interview transcripts, enabling us to analyze how interviews relate based on their content. The text is transformed into numerical embeddings that capture semantic meaning, making them ideal for tasks like similarity detection or data clustering.

In this script, the default `SentenceTransformer` model is **paraphrase-multilingual-MiniLM-L12-v2**, which is trained on many different languages. However, you can easily change it to suit your needs. Here is a list of different [pre-trained models](https://sbert.net/docs/sentence_transformer/pretrained_models.html).


In [None]:

# Step 1: Load the CSV or XLSX file (it must contain a 'Text' column)
df = pd.read_csv('Text-File.csv')  # Your CSV file name
# df = pd.read_excel('TEXT.xlsx')  # Alternative: load an Excel workbook instead

# Step 2: Prepare the text data
# Cast every cell to str so the encoder only ever receives strings: empty cells
# are read by pandas as float NaN and numeric cells as numbers. Note that an
# empty cell becomes the literal text 'nan', exactly as it does in the
# `hover_text` built by the visualization cells.
text_data = df['Text'].astype(str).tolist()

# Step 3: Load the embedding model
# model = SentenceTransformer('all-MiniLM-L6-v2')  # English model
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')  # Multilingual model

# Step 4: Generate embeddings (with a progress bar) and time the run
start_time = time.time()

embeddings = model.encode(text_data, show_progress_bar=True, batch_size=32)

end_time = time.time()
elapsed_time = end_time - start_time

print(f"Embeddings generated in {elapsed_time:.2f} seconds.")

### Step 2: Visualizing Sentence Embeddings with DataMapPlot

Once the sentence embeddings are generated, the next step is to visualize them. This script uses [DataMapPlot](https://datamapplot.readthedocs.io/en/latest/) to create an interactive plot, showing how the embeddings are clustered and organized in a 2D space.

### Visualization 1: DataMapPlot without Labels

This first visualization is created without any labels. It shows how different clusters are formed based on the semantic similarity between interviews. The goal here is to explore the clusters and start identifying patterns in the data, without the influence of labels. Once we have a clear view of these clusters, we can move on to annotating them for further analysis.

In [None]:

# Ensure embeddings are in NumPy array format
embeddings = np.array(embeddings)

# Number of clusters
n_clusters = 10  # You can adjust this as needed

# Step 1: Create a data map using t-SNE
n_samples = embeddings.shape[0]

# KMeans cannot build more clusters than there are samples, so cap the
# requested count for small datasets (e.g. a handful of interviews).
n_clusters = min(n_clusters, n_samples)

# t-SNE needs 0 < perplexity < n_samples. The heuristic (n_samples - 1) // 3
# evaluates to 0 for 2-3 samples, so enforce a floor of 1.
perplexity = max(1, min(30, (n_samples - 1) // 3))

tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
data_map = tsne.fit_transform(embeddings)

# Step 2: Perform KMeans clustering on the 2D map (not on the raw embeddings)
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
labels_int = kmeans.fit_predict(data_map)

# Assign generic cluster names ("Cluster 1" ... "Cluster n")
label_topic_map = {label: f"Cluster {label + 1}" for label in range(n_clusters)}
labels_topic = np.array([label_topic_map.get(label, "Unknown") for label in labels_int])

# Step 3: Prepare hover text
hover_text = df['Text'].astype(str).tolist()

# Step 4: Create a color palette
color_palette = list(mcolors.TABLEAU_COLORS.values())

# Step 5: Generate marker colors (palette is reused cyclically if it runs out)
unique_labels = np.unique(labels_topic)
color_mapping = {label: color_palette[i % len(color_palette)] for i, label in enumerate(unique_labels)}
marker_color_array = [color_mapping[label] for label in labels_topic]

# Step 6: Set marker sizes from the text length
# Cast to str first so non-string cells do not break the `.str` accessor;
# this mirrors how `hover_text` is built above.
marker_size_array = df['Text'].astype(str).str.len().values.astype(np.float32)
min_size, max_size = 5, 15
# Normalize marker sizes between min_size and max_size
if marker_size_array.max() != marker_size_array.min():
    marker_size_array = min_size + (max_size - min_size) * (
        (marker_size_array - marker_size_array.min()) / (marker_size_array.max() - marker_size_array.min())
    )
else:
    # All texts have the same length: avoid dividing by zero, use the midpoint
    marker_size_array = np.full_like(marker_size_array, (min_size + max_size) / 2)

# Step 7: Set point radius min and max pixels
point_radius_min_pixels = 2
point_radius_max_pixels = 10

# Create the interactive plot
try:
    plot = datamapplot.create_interactive_plot(
        data_map,
        labels_topic,  # Use the generic cluster names
        hover_text=hover_text,
        font_family="Merriweather",
        title="Interviews",
        sub_title="Interactive plot with Generic Cluster Names",
        enable_search=True,
        darkmode=True,
        marker_color_array=marker_color_array,
        marker_size_array=marker_size_array,
        point_radius_min_pixels=point_radius_min_pixels,
        point_radius_max_pixels=point_radius_max_pixels,
        point_line_width=0,
        cluster_boundary_polygons=False,  # Disable if not needed
        cluster_boundary_line_width=2,
    )

    # Save the plot to an HTML file.
    # The name avoids ':' and spaces: ':' is not allowed in Windows file names.
    plot.save("Interviews_Clusters_Generic_Names.html")
    print("Plot with generic cluster names saved successfully.")
except Exception as e:
    # Best effort: report the problem without aborting the rest of the notebook
    print(f"Error creating or saving the plot: {e}")


### Visualization 2: DataMapPlot with User Annotation Labels

In [None]:

def get_user_defined_names(n_clusters):
    """
    Prompt the user to type a name for each cluster, without any suggestions.

    Args:
        n_clusters: Number of clusters to ask a name for.

    Returns:
        A list with one name per cluster, in cluster order. Leading and
        trailing whitespace is stripped from each answer; a blank answer
        falls back to the generic name "Cluster <i>" (1-based).
    """
    print(f"\nPlease provide names for each of the {n_clusters} clusters.")
    cluster_names = []
    for i in range(n_clusters):
        default_name = f"Cluster {i + 1}"
        user_input = input(f"Enter name for Cluster {i + 1}: ").strip()
        # An empty string is falsy, so blank answers pick the default name
        cluster_names.append(user_input or default_name)
    return cluster_names

def save_cluster_names(cluster_names, filename="cluster_names.json"):
    """
    Write ``cluster_names`` to ``filename`` as JSON indented by four spaces,
    then print a confirmation message. The file is opened in write mode, so
    any existing content is replaced.
    """
    with open(filename, mode='w') as json_file:
        json.dump(cluster_names, json_file, indent=4)

    confirmation = f"\nCluster names saved to {filename}."
    print(confirmation)

# Ensure embeddings are in NumPy array format
embeddings = np.array(embeddings)

# Number of clusters
n_clusters = 10  # Adjust as needed

# Step 1: Create a data map using t-SNE
n_samples = embeddings.shape[0]

# KMeans cannot build more clusters than there are samples, so cap the
# requested count; this also keeps the number of name prompts in step 3 correct.
n_clusters = min(n_clusters, n_samples)

# t-SNE needs 0 < perplexity < n_samples. The heuristic (n_samples - 1) // 3
# evaluates to 0 for 2-3 samples, so enforce a floor of 1.
perplexity = max(1, min(30, (n_samples - 1) // 3))

tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
data_map = tsne.fit_transform(embeddings)

# Step 2: Perform KMeans clustering on the 2D map (not on the raw embeddings)
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
labels_int = kmeans.fit_predict(data_map)

# Step 3: Get user-defined cluster names without suggestions
cluster_names = get_user_defined_names(n_clusters)

label_topic_map = {label: cluster_names[label] for label in range(n_clusters)}
labels_topic = np.array([label_topic_map.get(label, "Unknown") for label in labels_int])

# Step 4: Prepare hover text
hover_text = df['Text'].astype(str).tolist()

# Step 5: Create a color palette
color_palette = list(mcolors.TABLEAU_COLORS.values())

# Step 6: Generate marker colors
# Colors are keyed by name, so clusters given the same name share one color.
unique_labels = np.unique(labels_topic)
color_mapping = {label: color_palette[i % len(color_palette)] for i, label in enumerate(unique_labels)}
marker_color_array = [color_mapping[label] for label in labels_topic]

# Step 7: Set marker sizes from the text length
# Cast to str first so non-string cells do not break the `.str` accessor;
# this mirrors how `hover_text` is built above.
marker_size_array = df['Text'].astype(str).str.len().values.astype(np.float32)
min_size, max_size = 5, 15
# Normalize marker sizes between min_size and max_size
if marker_size_array.max() != marker_size_array.min():
    marker_size_array = min_size + (max_size - min_size) * (
        (marker_size_array - marker_size_array.min()) / (marker_size_array.max() - marker_size_array.min())
    )
else:
    # All texts have the same length: avoid dividing by zero, use the midpoint
    marker_size_array = np.full_like(marker_size_array, (min_size + max_size) / 2)

# Step 8: Set point radius min and max pixels
point_radius_min_pixels = 2
point_radius_max_pixels = 10

# Step 9: Create the interactive plot
try:
    plot = datamapplot.create_interactive_plot(
        data_map,
        labels_topic,  # Use the user-defined cluster names
        hover_text=hover_text,
        font_family="Merriweather",
        title="Interview Data Map",
        sub_title="Interactive plot with User-Defined Cluster Names",
        enable_search=True,
        darkmode=True,
        marker_color_array=marker_color_array,
        marker_size_array=marker_size_array,
        point_radius_min_pixels=point_radius_min_pixels,
        point_radius_max_pixels=point_radius_max_pixels,
        point_line_width=0,
        cluster_boundary_polygons=False,  # Disable if not needed
        cluster_boundary_line_width=2,
    )

    # Save the plot to an HTML file
    plot_filename = "Clusters_User_Defined_Names.html"
    plot.save(plot_filename)
    print(f"Plot with user-defined cluster names saved successfully as {plot_filename}.")

except Exception as e:
    # Best effort: report the problem without aborting the rest of the notebook
    print(f"Error creating or saving the plot: {e}")


### Visualization 3: Automating Labels of Clusters (TF-IDF)

In this example, we demonstrate how machine learning can be used to automatically label the different clusters. Here, we use TF-IDF (Term Frequency-Inverse Document Frequency), a statistical method that evaluates how frequently a word appears in a document versus how commonly it appears across all documents. This is one of the simplest ways to name clusters, making it efficient for use without requiring significant computational power.

For more advanced methods, you could consider using [BERTopic](https://maartengr.github.io/BERTopic/index.html), or even combining BERTopic with an open-source generative language model, which can be run through tools like [Ollama](https://ollama.com/), to generate more nuanced automated cluster labels.


In [None]:
# Fetch the NLTK stop-word corpus (quiet=True suppresses the download log)
nltk.download('stopwords', quiet=True)

# Stop-word lists keyed by language name; `get_cluster_topic` looks up its
# `language` argument in this dict.
stop_words = {
    language: stopwords.words(language)
    for language in ('english', 'danish', 'german')
}

def get_cluster_topic(cluster_texts, language='english', n_terms=5):
    """
    Return the highest-scoring TF-IDF n-grams for the texts of one cluster.

    Args:
        cluster_texts: Iterable of documents. Entries that are not strings or
            that contain only whitespace are dropped.
        language: Key into the module-level ``stop_words`` dict that selects
            the stop-word list handed to the vectorizer.
        n_terms: Maximum number of terms to return. ``None`` returns every
            term (the slice is then unbounded).

    Returns:
        A list of at most ``n_terms`` bigrams/trigrams, ordered from the
        highest to the lowest TF-IDF score summed over the cluster's
        documents. Returns ``[]`` when no usable text remains, when the
        vocabulary is empty, or when the vectorizer raises.

    Raises:
        KeyError: If ``language`` is not a key of ``stop_words`` (the lookup
            happens before the ``try`` block).
    """
    cluster_texts = [text for text in cluster_texts if isinstance(text, str) and text.strip()]
    if not cluster_texts:
        return []

    vectorizer = TfidfVectorizer(
        stop_words=stop_words[language],
        max_features=1000,
        ngram_range=(2, 3)  # Use bigrams and trigrams
    )
    try:
        X = vectorizer.fit_transform(cluster_texts)
        if X.shape[1] == 0:
            return []

        tf_idf_sum = X.sum(axis=0).A1  # Sum TF-IDF scores across all documents
        terms = vectorizer.get_feature_names_out()

        # Sort descending by score and honour `n_terms`, which was previously
        # accepted but ignored (every term was returned).
        top_indices = tf_idf_sum.argsort()[::-1][:n_terms]
        return [terms[i] for i in top_indices]
    except Exception as e:
        # Best effort: e.g. texts too short to yield any bigram make the
        # vectorizer raise; report it and let the caller fall back to a default.
        print(f"Error in get_cluster_topic: {e}")
        return []

# Ensure embeddings are in NumPy array format
embeddings = np.array(embeddings)

# Number of samples
n_samples = embeddings.shape[0]

# Adjust perplexity based on the number of samples.
# t-SNE needs 0 < perplexity < n_samples. The heuristic (n_samples - 1) // 3
# evaluates to 0 for 2-3 samples, so enforce a floor of 1.
perplexity = max(1, min(30, (n_samples - 1) // 3))

# Step 1: Create a data map using t-SNE
tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
data_map = tsne.fit_transform(embeddings)

# Step 2: Cluster at several resolutions to build the label layers.
# Each entry triggers an independent KMeans fit on the 2D map, so the layers
# are not guaranteed to nest inside each other like a true hierarchy.
n_clusters_list = [2, 5, 10]  # Adjust these numbers for your desired hierarchy levels
labels_layers = []

for requested_clusters in n_clusters_list:
    # KMeans cannot build more clusters than there are samples
    n_clusters = min(requested_clusters, n_samples)

    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels_int = kmeans.fit_predict(data_map)

    # Topic names already taken within this layer (reset for every layer)
    used_topics = set()

    # Generate topic names for each cluster
    label_topic_map = {}
    for label in range(n_clusters):
        indices = np.where(labels_int == label)[0]
        if len(indices) == 0:
            label_topic_map[label] = f"{label}: No data"
            continue
        cluster_texts = df['Text'].iloc[indices].astype(str).tolist()
        top_terms = get_cluster_topic(cluster_texts, language='english', n_terms=5)

        # Select the first unused term as the topic name
        topic_name = None
        for term in top_terms:
            if term not in used_topics:
                topic_name = term
                used_topics.add(term)
                break

        if topic_name is None:
            # All terms have been used (or none were found); default to the
            # highest scoring term with cluster label, or a generic name
            topic_name = f"{top_terms[0]} {label}" if top_terms else f"Cluster {label}"

        # Prefixing the cluster index keeps every label in the layer unique
        label_topic_map[label] = f"{label}: {topic_name}"

    # Convert integer labels to topic names
    labels_topic = np.array([label_topic_map.get(label, f"{label}: Unknown") for label in labels_int])
    labels_layers.append(labels_topic)

# Step 3: Prepare hover text
hover_text = df['Text'].astype(str).tolist()

# Step 4: Create a color palette
color_palette = list(mcolors.TABLEAU_COLORS.values())

# Step 5: Generate marker colors using the last layer of labels
labels = labels_layers[-1]  # Use the last layer for coloring

# Create a color mapping (palette is reused cyclically if it runs out)
unique_labels = np.unique(labels)
color_mapping = {label: color_palette[i % len(color_palette)] for i, label in enumerate(unique_labels)}

# Generate marker colors
marker_color_array = [color_mapping[label] for label in labels]

# Step 6: Set marker sizes from the text length
# Cast to str first so non-string cells do not break the `.str` accessor;
# this mirrors how `hover_text` is built above.
marker_size_array = df['Text'].astype(str).str.len().values.astype(np.float32)
min_size, max_size = 5, 15
# Normalize marker sizes between min_size and max_size
if marker_size_array.max() != marker_size_array.min():
    marker_size_array = min_size + (max_size - min_size) * (
        (marker_size_array - marker_size_array.min()) / (marker_size_array.max() - marker_size_array.min())
    )
else:
    # All texts have the same length: avoid dividing by zero, use the midpoint
    marker_size_array = np.full_like(marker_size_array, (min_size + max_size) / 2)

# Step 7: Set point radius min and max pixels
point_radius_min_pixels = 2
point_radius_max_pixels = 10

# Create the interactive plot
# NOTE(review): the layers are passed coarsest-first (2, 5, 10 clusters).
# datamapplot's documentation appears to recommend finest-first ordering for
# label layers - confirm that this order renders as intended.
try:
    plot = datamapplot.create_interactive_plot(
        data_map,
        *labels_layers,  # Use the labels with topic names
        hover_text=hover_text,
        font_family="Merriweather",
        title="Interviews",
        sub_title="Interactive plot of Interviews",
        enable_search=True,
        darkmode=True,
        marker_color_array=marker_color_array,
        marker_size_array=marker_size_array,
        point_radius_min_pixels=point_radius_min_pixels,
        point_radius_max_pixels=point_radius_max_pixels,
        point_line_width=0,
        cluster_boundary_polygons=False,
        cluster_boundary_line_width=2,
    )

    # Save the plot to an HTML file
    plot.save("Interviews_TF_IDF.html")
    print("Plot saved successfully.")
except Exception as e:
    # Best effort: report the problem without aborting the rest of the notebook
    print(f"Error creating or displaying the plot: {e}")
