# Cluster Free Text Comments

In [1]:
# 02_cluster_comments.ipynb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas_survey_toolkit import nlp
from pandas_survey_toolkit.vis import cluster_heatmap_plot

# Sample survey: twenty open-ended product comments touching on battery,
# screen, camera, software, build quality, price and customer service.
sample_comments = [
    "Battery life is excellent, lasts all day",
    "The battery doesn't last long enough for me",
    "Battery performance is outstanding, very impressed",
    "Screen resolution is incredible, so sharp and clear",
    "Love the high-resolution display, colors are vibrant",
    "The screen is too reflective in bright light",
    "Camera quality is excellent for the price range",
    "Photos taken in low light are grainy and poor quality",
    "Camera autofocus is slow and often misses the shot",
    "The software is intuitive and easy to use",
    "User interface is confusing and not user-friendly",
    "Software keeps crashing when I open multiple apps",
    "Build quality feels premium and solid",
    "The device feels flimsy and cheaply made",
    "Very durable, survived several drops without damage",
    "Excellent value for money considering the features",
    "Overpriced for what you get compared to competitors",
    "Worth every penny, exceeded my expectations",
    "Customer service was unhelpful when I had issues",
    "Great customer support, quick and helpful responses",
]

# Column spec: respondent ids count up from 1, one per comment.
data = {
    'respondent_id': range(1, len(sample_comments) + 1),
    'comments': sample_comments,
}

# Assemble the survey frame from the column spec.
df = pd.DataFrame(data)

# Show the untouched input before any clustering is applied.
print("Original data:")
display(df)


  from tqdm.autonotebook import tqdm, trange
  torch.utils._pytree._register_pytree_node(


Original data:


Unnamed: 0,respondent_id,comments
0,1,"Battery life is excellent, lasts all day"
1,2,The battery doesn't last long enough for me
2,3,"Battery performance is outstanding, very impre..."
3,4,"Screen resolution is incredible, so sharp and ..."
4,5,"Love the high-resolution display, colors are v..."
5,6,The screen is too reflective in bright light
6,7,Camera quality is excellent for the price range
7,8,Photos taken in low light are grainy and poor ...
8,9,Camera autofocus is slow and often misses the ...
9,10,The software is intuitive and easy to use


In [2]:

# Run the toolkit's comment clustering on the free-text column. The keyword
# values are the tuning knobs used for this small 20-row sample.
df_clustered = df.cluster_comments(
    input_column='comments',
    min_cluster_size=3,
    n_neighbors=5,
    cluster_selection_epsilon=0.5,
)

# List each comment with its assigned cluster, grouped by cluster id.
print("\nComment clusters:")
display(
    df_clustered
    .loc[:, ['comments', 'cluster', 'cluster_probability']]
    .sort_values(by='cluster')
)

# Tally how many comments landed in each cluster (largest first).
cluster_counts = (
    df_clustered['cluster']
    .value_counts()
    .rename_axis('cluster')
    .reset_index(name='count')
)
print("\nComments per cluster:")
display(cluster_counts)

  torch.utils._pytree._register_pytree_node(



Comment clusters:


Unnamed: 0,comments,cluster,cluster_probability
19,"Great customer support, quick and helpful resp...",-1.0,0.0
18,Customer service was unhelpful when I had issues,0.0,0.667263
0,"Battery life is excellent, lasts all day",0.0,1.0
2,"Battery performance is outstanding, very impre...",0.0,1.0
1,The battery doesn't last long enough for me,0.0,1.0
11,Software keeps crashing when I open multiple apps,1.0,1.0
10,User interface is confusing and not user-friendly,1.0,1.0
9,The software is intuitive and easy to use,1.0,1.0
12,Build quality feels premium and solid,2.0,1.0
13,The device feels flimsy and cheaply made,2.0,0.83056



Comments per cluster:


Unnamed: 0,cluster,count
0,3.0,6
1,2.0,6
2,0.0,4
3,1.0,3
4,-1.0,1


In [3]:
# Peek at the first five rows of the clustered frame, including the columns
# that clustering added alongside the original survey columns.
df_clustered.head(n=5)

Unnamed: 0,respondent_id,comments,sentence_embedding,umap_x,umap_y,cluster,cluster_probability
0,1,"Battery life is excellent, lasts all day","[-0.038631026, 0.044625234, -0.028667396, -0.0...",11.356366,4.052678,0.0,1.0
1,2,The battery doesn't last long enough for me,"[-0.0007719228, -0.0042446144, 0.011075384, -0...",11.582358,3.586939,0.0,1.0
2,3,"Battery performance is outstanding, very impre...","[-0.008022247, 0.09049879, -0.0867905, -0.0022...",11.752824,4.191682,0.0,1.0
3,4,"Screen resolution is incredible, so sharp and ...","[-0.014808243, -0.03135826, 0.035538964, -0.05...",13.497684,2.565261,3.0,1.0
4,5,"Love the high-resolution display, colors are v...","[-0.029058423, 0.026945723, 0.040125024, -0.05...",13.295995,2.067326,3.0,1.0


You can see on the datamapplot below that similar comments sit closer together. By varying `cluster_selection_epsilon` you can tweak the number of clusters (clustering works better on much larger datasets).

In [4]:
import datamapplot

# Cluster ids are floats, so casting to str yields labels like "0.0" and "-1.0".
# Cluster -1 is the group with probability 0 in the clustering output, i.e.
# comments that were not assigned to any cluster (presumably the noise label;
# confirm against the toolkit docs).
cluster_labels = df_clustered['cluster'].astype(str).to_numpy()

# datamapplot only greys out points whose label equals `noise_label`
# (default "Unlabelled"), so tell it which label marks unassigned comments;
# otherwise "-1.0" is drawn and labelled as if it were a real cluster.
# Plain arrays are passed for the coordinates, labels and hover text so the
# three inputs line up by position.
datamapplot.create_interactive_plot(
    df_clustered[['umap_x', 'umap_y']].to_numpy(),
    cluster_labels,
    hover_text=df_clustered['comments'].to_numpy(),
    noise_label='-1.0',
)