# Cluster Likert Questions

In [12]:
# 03_cluster_likert_questions.ipynb
import pandas as pd
import numpy as np
from pandas_survey_toolkit import nlp
from pandas_survey_toolkit.vis import cluster_heatmap_plot

# Create sample survey data with Likert scale responses
# Let's simulate a product satisfaction survey with 20 respondents and 8 Likert questions

# Define our questions
questions = [
    'q1_ease_of_use', 
    'q2_product_quality',
    'q3_value_for_money',
    'q4_customer_service',
    'q5_would_recommend',
    'q6_meets_expectations',
    'q7_better_than_competitors',
    'q8_overall_satisfaction'
]

# Define our Likert scale options
likert_options = [
    'Strongly Disagree',
    'Disagree',
    'Neither Agree nor Disagree',
    'Agree',
    'Strongly Agree'
]

POPULATION = 200
# Create DataFrame with 20 respondents
np.random.seed(42)
data = {'respondent_id': range(1, POPULATION)}

# Generate random Likert responses with some patterns
# Group 1 (respondents 1-7): Generally positive
# Group 2 (respondents 8-14): Generally negative
# Group 3 (respondents 15-20): Mixed responses

for q in questions:
    responses = []
    for i in range(1, POPULATION):
        if i <= (0.3 * POPULATION):  # Positive group
            responses.append(np.random.choice(likert_options[2:], p=[0.2, 0.5, 0.3]))
        elif i <= (0.6 * POPULATION):  # Negative group
            responses.append(np.random.choice(likert_options[:3], p=[0.3, 0.5, 0.2]))
        else:  # Don't care group
            responses.append(np.random.choice(likert_options[1:4], p=[0.1,0.8,0.1]))
    data[q] = responses

# Create DataFrame
df = pd.DataFrame(data)

# Display the original data
print("Original survey data:")
display(df.head())

# Define custom mapping for Likert scale values
custom_mapping = {
    'strongly disagree': -1,
    'disagree': -1,
    'neither agree nor disagree': 0,
    'agree': 1,
    'strongly agree': 1
}


Original survey data:


Unnamed: 0,respondent_id,q1_ease_of_use,q2_product_quality,q3_value_for_money,q4_customer_service,q5_would_recommend,q6_meets_expectations,q7_better_than_competitors,q8_overall_satisfaction
0,1,Agree,Strongly Agree,Strongly Agree,Neither Agree nor Disagree,Strongly Agree,Neither Agree nor Disagree,Neither Agree nor Disagree,Agree
1,2,Strongly Agree,Agree,Strongly Agree,Neither Agree nor Disagree,Agree,Strongly Agree,Strongly Agree,Agree
2,3,Strongly Agree,Neither Agree nor Disagree,Neither Agree nor Disagree,Neither Agree nor Disagree,Agree,Neither Agree nor Disagree,Strongly Agree,Strongly Agree
3,4,Agree,Neither Agree nor Disagree,Strongly Agree,Neither Agree nor Disagree,Strongly Agree,Strongly Agree,Strongly Agree,Agree
4,5,Neither Agree nor Disagree,Strongly Agree,Agree,Agree,Strongly Agree,Agree,Strongly Agree,Agree


In [18]:

# Use pandas method chaining to process the data
df_processed = (df
    # Cluster the questions
    .cluster_questions(
        columns=questions, 
        #likert_mapping=custom_mapping, default handles most cases
        umap_n_neighbors=15,
        hdbscan_min_cluster_size=15,
        cluster_selection_epsilon=0.35,
        
    )
)

# Get the list of encoded Likert columns
likert_columns_with_prefix = [f"likert_encoded_{q}" for q in questions]

# Display encoded data
print("\nEncoded Likert data:")
display(df_processed[['respondent_id'] + likert_columns_with_prefix].head())

# Display clustering results
print("\nQuestion clustering results:")
display(df_processed[['respondent_id', 'question_cluster_id', 'question_cluster_probability']].head())


# Use the cluster_heatmap_plot function to visualize cluster patterns
print("\nCluster heatmap showing the sentiment distribution across questions:")
heatmap = cluster_heatmap_plot(
    df=df_processed,
    x="question_cluster_id",  # Cluster IDs as the x-axis
    y=likert_columns_with_prefix,  # Encoded Likert columns to analyze
    max_width=30  # For better readability
)

# Display the heatmap
display(heatmap)

# Let's also add a simple interpretation of the clusters
cluster_summary = df_processed.groupby('question_cluster_id')[likert_columns_with_prefix].mean()
print("\nCluster averages for each question:")
display(cluster_summary)

# Calculate respondent counts per cluster
cluster_counts = df_processed['question_cluster_id'].value_counts().sort_index()
print("\nNumber of respondents in each cluster:")
display(cluster_counts)

Using default mapping:
-1: Phrases containing 'disagree', 'do not agree', etc.
 0: Phrases containing 'neutral', 'neither', 'unsure', etc.
+1: Phrases containing 'agree' (but not 'disagree' or 'not agree')
NaN: NaN values are preserved
  Agree -> 1: 275 times
  Strongly Agree -> 1: 150 times
  Neither Agree nor Disagree -> 0: 724 times
  Disagree -> -1: 293 times
  Strongly Disagree -> -1: 150 times

Encoded Likert data:


Unnamed: 0,respondent_id,likert_encoded_q1_ease_of_use,likert_encoded_q2_product_quality,likert_encoded_q3_value_for_money,likert_encoded_q4_customer_service,likert_encoded_q5_would_recommend,likert_encoded_q6_meets_expectations,likert_encoded_q7_better_than_competitors,likert_encoded_q8_overall_satisfaction
0,1,1,1,1,0,1,0,0,1
1,2,1,1,1,0,1,1,1,1
2,3,1,0,0,0,1,0,1,1
3,4,1,0,1,0,1,1,1,1
4,5,0,1,1,1,1,1,1,1



Question clustering results:


Unnamed: 0,respondent_id,question_cluster_id,question_cluster_probability
0,1,1.0,1.0
1,2,1.0,1.0
2,3,1.0,1.0
3,4,1.0,1.0
4,5,1.0,1.0



Cluster heatmap showing the sentiment distribution across questions:



Cluster averages for each question:


Unnamed: 0_level_0,likert_encoded_q1_ease_of_use,likert_encoded_q2_product_quality,likert_encoded_q3_value_for_money,likert_encoded_q4_customer_service,likert_encoded_q5_would_recommend,likert_encoded_q6_meets_expectations,likert_encoded_q7_better_than_competitors,likert_encoded_q8_overall_satisfaction
question_cluster_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,-0.716418,-0.761194,-0.61194,-0.716418,-0.895522,-0.701493,-0.761194,-0.761194
1.0,0.373913,0.391304,0.443478,0.391304,0.452174,0.391304,0.443478,0.408696
2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0



Number of respondents in each cluster:


question_cluster_id
0.0     67
1.0    115
2.0     17
Name: count, dtype: int64