# Cluster Likert Questions

In [1]:
# 03_cluster_likert_questions.ipynb
import pandas as pd
import numpy as np
from pandas_survey_toolkit import nlp
from pandas_survey_toolkit.vis import cluster_heatmap_plot

# Simulated product-satisfaction survey: 20 respondents x 8 Likert questions.
# Respondents fall into three groups by id so the answers carry some structure:
#   ids 1-7   -> generally positive (neutral-to-agree labels only)
#   ids 8-14  -> generally negative (disagree-to-neutral labels only)
#   ids 15-20 -> mixed (any label, uniformly at random)

# Column names of the eight survey questions
questions = [
    'q1_ease_of_use',
    'q2_product_quality',
    'q3_value_for_money',
    'q4_customer_service',
    'q5_would_recommend',
    'q6_meets_expectations',
    'q7_better_than_competitors',
    'q8_overall_satisfaction',
]

# Answer labels, ordered from most negative to most positive
likert_options = [
    'Strongly Disagree',
    'Disagree',
    'Neither Agree nor Disagree',
    'Agree',
    'Strongly Agree',
]


def _draw_response(respondent_no):
    """Draw one Likert label for a respondent (1-based id).

    The respondent's group decides which labels are eligible and how they
    are weighted. Draws use NumPy's global random state, so the result
    depends on the seed set before the first call and on the call order.
    """
    if respondent_no <= 7:
        # Positive group: the three upper labels, weighted towards 'Agree'
        return np.random.choice(likert_options[2:], p=[0.2, 0.5, 0.3])
    if respondent_no <= 14:
        # Negative group: the three lower labels, weighted towards 'Disagree'
        return np.random.choice(likert_options[:3], p=[0.3, 0.5, 0.2])
    # Mixed group: every label equally likely
    return np.random.choice(likert_options)


# Fixed seed so the generated answers are the same on every run
np.random.seed(42)
data = {'respondent_id': range(1, 21)}

# Fill one question at a time, respondent by respondent; this order fixes
# which random draw lands in which cell.
for q in questions:
    data[q] = [_draw_response(respondent_no) for respondent_no in data['respondent_id']]

# One row per respondent, one column per question
df = pd.DataFrame(data)

# Sanity-check the generated answers by rendering only the first five rows
survey_preview = df.head()
print("Original survey data:")
display(survey_preview)

# Collapse the five answer labels onto a three-point scale:
# any disagreement -> -1, neutral -> 0, any agreement -> 1.
# Keys are lowercase while the generated answers are title-cased.
# NOTE(review): the recorded encoding log maps e.g. "Agree -> 1", so the encoder
# appears to match labels case-insensitively — confirm against the library.
custom_mapping = {
    **dict.fromkeys(['strongly disagree', 'disagree'], -1),
    'neither agree nor disagree': 0,
    **dict.fromkeys(['agree', 'strongly agree'], 1),
}


  from tqdm.autonotebook import tqdm, trange
  torch.utils._pytree._register_pytree_node(


Original survey data:


Unnamed: 0,respondent_id,q1_ease_of_use,q2_product_quality,q3_value_for_money,q4_customer_service,q5_would_recommend,q6_meets_expectations,q7_better_than_competitors,q8_overall_satisfaction
0,1,Agree,Agree,Agree,Agree,Neither Agree nor Disagree,Strongly Agree,Agree,Neither Agree nor Disagree
1,2,Strongly Agree,Agree,Neither Agree nor Disagree,Strongly Agree,Strongly Agree,Agree,Strongly Agree,Agree
2,3,Strongly Agree,Agree,Agree,Neither Agree nor Disagree,Strongly Agree,Strongly Agree,Agree,Agree
3,4,Agree,Neither Agree nor Disagree,Agree,Neither Agree nor Disagree,Agree,Agree,Agree,Agree
4,5,Neither Agree nor Disagree,Agree,Neither Agree nor Disagree,Neither Agree nor Disagree,Strongly Agree,Agree,Neither Agree nor Disagree,Neither Agree nor Disagree


In [2]:

# Clustering parameters, named so they are easy to find and adjust.
# NOTE(review): presumably kept small because the sample has only 20 rows — confirm.
UMAP_N_NEIGHBORS = 10
HDBSCAN_MIN_CLUSTER_SIZE = 2

# Single method chain: encode the text answers with custom_mapping, then
# cluster on the same question columns.
# NOTE(review): the recorded output prints the mapping summary twice, which
# suggests cluster_questions encodes again internally — confirm before
# dropping the explicit encode_likert step.
df_processed = (
    df
    .encode_likert(likert_columns=questions, custom_mapping=custom_mapping)
    .cluster_questions(
        columns=questions,
        likert_mapping=custom_mapping,
        umap_n_neighbors=UMAP_N_NEIGHBORS,
        hdbscan_min_cluster_size=HDBSCAN_MIN_CLUSTER_SIZE,
    )
)

# Encoded columns are named after the question with a "likert_encoded_" prefix
likert_columns_with_prefix = [f"likert_encoded_{q}" for q in questions]

# Column selections for the two previews below
encoded_view_columns = ['respondent_id', *likert_columns_with_prefix]
cluster_view_columns = ['respondent_id', 'question_cluster_id', 'question_cluster_probability']

# Numeric encoding of each answer, first rows only
print("\nEncoded Likert data:")
display(df_processed[encoded_view_columns].head())

# Cluster id and membership probability per respondent, first rows only
print("\nQuestion clustering results:")
display(df_processed[cluster_view_columns].head())


# Visualise how the encoded answers are distributed within each cluster
print("\nCluster heatmap showing the sentiment distribution across questions:")

# Arguments gathered in one place so the call itself stays short
heatmap_options = {
    "df": df_processed,
    "x": "question_cluster_id",       # cluster ids along the x-axis
    "y": likert_columns_with_prefix,  # encoded Likert columns to analyse
    "max_width": 30,                  # NOTE(review): presumably a label-width cap for readability — confirm
}
heatmap = cluster_heatmap_plot(**heatmap_options)

# Render the returned plot object in the notebook
display(heatmap)

# Simple numeric reading of the clusters: mean encoded answer per question.
# Given the -1/0/1 encoding, a mean near +1 means the cluster mostly agrees
# and a mean near -1 means it mostly disagrees.
by_cluster = df_processed.groupby('question_cluster_id')
cluster_summary = by_cluster[likert_columns_with_prefix].mean()
print("\nCluster averages for each question:")
display(cluster_summary)

# Cluster sizes (rows per cluster id), ordered by cluster id
cluster_label_column = df_processed['question_cluster_id']
cluster_counts = cluster_label_column.value_counts().sort_index()
print("\nNumber of respondents in each cluster:")
display(cluster_counts)

Using custom mapping: {'strongly disagree': -1, 'disagree': -1, 'neither agree nor disagree': 0, 'agree': 1, 'strongly agree': 1}
NaN: NaN values are preserved
  Agree -> 1: 39 times
  Strongly Agree -> 1: 19 times
  Neither Agree nor Disagree -> 0: 37 times
  Disagree -> -1: 36 times
  Strongly Disagree -> -1: 29 times
Using custom mapping: {'strongly disagree': -1, 'disagree': -1, 'neither agree nor disagree': 0, 'agree': 1, 'strongly agree': 1}
NaN: NaN values are preserved
  Agree -> 1: 39 times
  Strongly Agree -> 1: 19 times
  Neither Agree nor Disagree -> 0: 37 times
  Disagree -> -1: 36 times
  Strongly Disagree -> -1: 29 times

Encoded Likert data:


Unnamed: 0,respondent_id,likert_encoded_q1_ease_of_use,likert_encoded_q2_product_quality,likert_encoded_q3_value_for_money,likert_encoded_q4_customer_service,likert_encoded_q5_would_recommend,likert_encoded_q6_meets_expectations,likert_encoded_q7_better_than_competitors,likert_encoded_q8_overall_satisfaction
0,1,1,1,1,1,0,1,1,0
1,2,1,1,0,1,1,1,1,1
2,3,1,1,1,0,1,1,1,1
3,4,1,0,1,0,1,1,1,1
4,5,0,1,0,0,1,1,0,0



Question clustering results:


Unnamed: 0,respondent_id,question_cluster_id,question_cluster_probability
0,1,0.0,1.0
1,2,0.0,1.0
2,3,0.0,1.0
3,4,0.0,1.0
4,5,0.0,1.0



Cluster heatmap showing the sentiment distribution across questions:



Cluster averages for each question:


Unnamed: 0_level_0,likert_encoded_q1_ease_of_use,likert_encoded_q2_product_quality,likert_encoded_q3_value_for_money,likert_encoded_q4_customer_service,likert_encoded_q5_would_recommend,likert_encoded_q6_meets_expectations,likert_encoded_q7_better_than_competitors,likert_encoded_q8_overall_satisfaction
question_cluster_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,0.625,0.875,0.375,0.375,0.75,0.875,0.875,0.75
1.0,-0.25,-0.833333,-0.583333,-0.583333,-0.5,-0.25,-0.5,-0.75



Number of respondents in each cluster:


question_cluster_id
0.0     8
1.0    12
Name: count, dtype: int64