# Part 4: Summarize Results and Generate Final Output

This notebook analyzes the clustered data, generates summaries, and creates final outputs.

## Input
- `clustered_data.pickle`: Clustered data from Part 3

## Output
- `final_analysis_results.csv`: Complete results with cluster assignments
- `cluster_summary_report.txt`: Detailed analysis report
- Word cloud visualizations (displayed inline in the notebook; not saved to disk)

In [None]:
# Install required packages for analysis and visualization
!pip install pandas numpy matplotlib seaborn plotly textblob wordcloud

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import pickle
from textblob import TextBlob
from wordcloud import WordCloud
from collections import Counter

print("✅ Libraries imported successfully!")

In [None]:
# Read the Part 3 artifact and unpack every object the later cells rely on.
print("📥 Loading clustered data from Part 3...")

# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so only open a clustered_data.pickle that this pipeline produced itself.
try:
    with open('clustered_data.pickle', 'rb') as f:
        data = pickle.load(f)
except FileNotFoundError:
    # Tell the user what to do, then re-raise so the notebook stops here.
    print("❌ Clustered data file not found. Please run Part 3 first!")
    raise
else:
    # Frame and clustering output
    df = data['dataframe']
    cluster_labels = data['cluster_labels']
    clusterer = data['clusterer']

    # Text corpora (pre-processed and original)
    processed_texts = data['processed_texts']
    original_texts = data['original_texts']

    # Vectorisation artefacts
    vectorizer = data['vectorizer']
    X_dense = data['tfidf_matrix']
    text_columns = data['text_columns']
    feature_names = data['feature_names']

    # Pre-computed cluster statistics
    n_clusters = data['n_clusters']
    n_noise = data['n_noise']
    cluster_counts = data['cluster_counts']

    print(f"✅ Data loaded successfully!")
    print(f"📊 Total responses: {len(df)}")
    print(f"🏷️  Clusters found: {n_clusters}")
    print(f"🔇 Noise points: {n_noise}")

print(f"\n📊 Cluster distribution:")
for cluster_id, count in cluster_counts.items():
    is_noise = cluster_id == -1  # label -1 is reported as noise
    print(f"   🔇 Noise: {count} responses" if is_noise
          else f"   🏷️  Cluster {cluster_id}: {count} responses")

In [None]:
def analyze_cluster(cluster_df, cluster_id, vectorizer, X_dense):
    """Summarize one cluster: key terms, sentiment and response length.

    Reads the notebook-level ``processed_texts``, ``original_texts`` and
    ``feature_names`` in addition to its arguments.

    Args:
        cluster_df: Rows of the main DataFrame that belong to this cluster.
            Its index values are used as positions into ``processed_texts``,
            ``original_texts`` and ``X_dense``.
            NOTE(review): this assumes the DataFrame index is positional
            (0..n-1) -- confirm against Part 3.
        cluster_id: Cluster label; ``-1`` is treated as noise.
        vectorizer: Not used by this function; kept so existing calls keep
            working.
        X_dense: Document-term weight matrix, one row per response. A NumPy
            array, ``np.matrix`` or SciPy sparse matrix is accepted.

    Returns:
        dict: For noise, only ``cluster_id``, ``name`` and ``size``.
        Otherwise ``cluster_id``, ``size``, ``top_terms`` (up to ten
        ``(term, mean_weight)`` pairs with positive weight, highest first),
        ``avg_sentiment``, ``avg_length`` (in words), ``sample_responses``
        and ``processed_sample`` (first three texts each) and
        ``all_sentiments``.
    """
    if cluster_id == -1:
        return {"cluster_id": -1, "name": "Noise", "size": len(cluster_df)}

    cluster_indices = cluster_df.index.tolist()
    cluster_texts = [processed_texts[i] for i in cluster_indices]
    cluster_original = [original_texts[i] for i in cluster_indices]

    # Mean weight of every term across the cluster's rows.
    cluster_vectors = X_dense[cluster_indices]
    # np.matrix and sparse inputs return a 1 x N matrix here; flatten to 1-D
    # so the ranking below indexes terms rather than rows.
    mean_tfidf = np.asarray(np.mean(cluster_vectors, axis=0)).ravel()

    # Ten highest-weighted terms, best first; zero-weight terms are dropped.
    top_indices = np.argsort(mean_tfidf)[-10:][::-1]
    top_terms = [(feature_names[i], mean_tfidf[i]) for i in top_indices if mean_tfidf[i] > 0]

    # Sentiment polarity of each original (un-processed) response.
    sentiments = [TextBlob(text).sentiment.polarity for text in cluster_original]
    avg_sentiment = np.mean(sentiments)

    # Response length measured in whitespace-separated words.
    lengths = [len(text.split()) for text in cluster_original]

    return {
        "cluster_id": cluster_id,
        "size": len(cluster_df),
        "top_terms": top_terms,
        "avg_sentiment": avg_sentiment,
        "avg_length": np.mean(lengths),
        "sample_responses": cluster_original[:3],  # First 3 responses as examples
        "processed_sample": cluster_texts[:3],
        "all_sentiments": sentiments
    }

print("🔧 Analysis functions defined!")

In [None]:
# Build one analysis record per cluster label and print a readable breakdown.
print("🔍 Analyzing clusters...")

cluster_analyses = []

for cluster_id in sorted(set(cluster_labels)):
    cluster_data = df[df['cluster'] == cluster_id]
    analysis = analyze_cluster(cluster_data, cluster_id, vectorizer, X_dense)
    cluster_analyses.append(analysis)

    print(f"\n{'='*60}")
    print(f"🏷️  CLUSTER {cluster_id} ANALYSIS")
    print(f"{'='*60}")

    # The noise record only carries a size, so report it and move on.
    if cluster_id == -1:
        print(f"📋 Type: Noise/Outliers")
        print(f"📊 Size: {analysis['size']} responses")
        continue

    print(f"📊 Size: {analysis['size']} responses ({analysis['size']/len(df)*100:.1f}%)")

    # Polarity within +/-0.1 of zero is reported as neutral.
    if analysis['avg_sentiment'] > 0.1:
        sentiment_label = '😊 Positive'
    elif analysis['avg_sentiment'] < -0.1:
        sentiment_label = '😞 Negative'
    else:
        sentiment_label = '😐 Neutral'
    print(f"💭 Average Sentiment: {analysis['avg_sentiment']:.3f} ({sentiment_label})")
    print(f"📝 Average Response Length: {analysis['avg_length']:.1f} words")

    print(f"\n🔤 Top Terms:")
    for rank, (term, score) in enumerate(analysis['top_terms'][:5], start=1):
        print(f"   {rank}. {term}: {score:.3f}")

    print(f"\n💬 Sample Responses:")
    for rank, response in enumerate(analysis['sample_responses'], start=1):
        # Long responses are cut at 120 characters and marked with an ellipsis.
        suffix = '...' if len(response) > 120 else ''
        print(f"   {rank}. {response[:120]}{suffix}")

# Collect one summary row per real (non-noise) cluster.
summary_data = []
for analysis in cluster_analyses:
    if analysis['cluster_id'] == -1:
        continue

    if analysis['avg_sentiment'] > 0.1:
        sentiment_label = 'Positive'
    elif analysis['avg_sentiment'] < -0.1:
        sentiment_label = 'Negative'
    else:
        sentiment_label = 'Neutral'

    leading_terms = [term for term, _ in analysis['top_terms'][:3]]
    summary_data.append({
        'Cluster_ID': analysis['cluster_id'],
        'Size': analysis['size'],
        'Percentage': f"{analysis['size']/len(df)*100:.1f}%",
        'Avg_Sentiment': round(analysis['avg_sentiment'], 3),
        'Sentiment_Label': sentiment_label,
        'Avg_Length': round(analysis['avg_length'], 1),
        'Top_3_Terms': ', '.join(leading_terms)
    })

summary_df = pd.DataFrame(summary_data)

print(f"\n{'='*60}")
print("📊 CLUSTER SUMMARY TABLE")
print(f"{'='*60}")
if summary_df.empty:
    print("⚠️ No clusters found for summary table")
else:
    display(summary_df)

In [None]:
# Generate word clouds for each cluster (one subplot per cluster, noise excluded)
if n_clusters > 0:
    print("☁️ Generating word clouds for clusters...")

    n_clusters_to_show = min(n_clusters, 6)  # Show up to 6 clusters
    n_cols = min(3, n_clusters_to_show)
    n_rows = (n_clusters_to_show + n_cols - 1) // n_cols  # ceiling division

    # squeeze=False always yields a 2-D array of Axes, so a single ravel()
    # covers the 1x1, single-row and multi-row layouts alike.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5*n_rows), squeeze=False)
    axes = axes.ravel()

    clusters_to_plot = [a for a in cluster_analyses if a['cluster_id'] != -1][:n_clusters_to_show]

    cluster_counter = 0
    for analysis in clusters_to_plot:
        ax = axes[cluster_counter]
        cluster_id = analysis['cluster_id']

        # Join the processed texts of every response in this cluster.
        cluster_indices = df[df['cluster'] == cluster_id].index.tolist()
        cluster_text = ' '.join(processed_texts[i] for i in cluster_indices)

        wordcloud = None
        if cluster_text.strip():
            try:
                wordcloud = WordCloud(
                    width=400,
                    height=300,
                    background_color='white',
                    max_words=50,
                    colormap='viridis'
                ).generate(cluster_text)
            except ValueError:
                # Raised when the text yields no usable words for a cloud.
                wordcloud = None

        if wordcloud is not None:
            ax.imshow(wordcloud, interpolation='bilinear')
            ax.set_title(f'Cluster {cluster_id} Word Cloud\n({analysis["size"]} responses)')
        else:
            # Empty or unusable text: show a placeholder rather than leaving
            # a bare subplot with default ticks.
            ax.text(0.5, 0.5, f'Cluster {cluster_id}\nInsufficient text',
                    ha='center', va='center', transform=ax.transAxes)
            ax.set_title(f'Cluster {cluster_id}')
        ax.axis('off')

        cluster_counter += 1

    # Hide unused subplots
    for ax in axes[cluster_counter:]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()
    print("✅ Word clouds generated!")

else:
    print("⚠️ No clusters available for word cloud generation")

In [None]:
# Write the per-response results (cluster name, sentiment, length, texts) to CSV.
print("💾 Exporting final results...")

# Human-readable cluster names: "Noise" for label -1, otherwise the cluster id
# followed by its two strongest terms.
cluster_names = {
    a['cluster_id']: (
        "Noise" if a['cluster_id'] == -1
        else f"Cluster_{a['cluster_id']}_{'_'.join(term for term, _ in a['top_terms'][:2])}"
    )
    for a in cluster_analyses
}

# Work on a copy so the loaded frame stays untouched; keyword order below
# fixes the order of the added columns.
export_df = df.copy().assign(
    cluster_name=df['cluster'].map(cluster_names),
    sentiment_score=[TextBlob(text).sentiment.polarity for text in original_texts],
    response_length=[len(text.split()) for text in original_texts],
    original_combined_text=original_texts,
    processed_text=processed_texts,
)

# Export detailed results to CSV (row index omitted)
output_filename = 'final_analysis_results.csv'
export_df.to_csv(output_filename, index=False)
print(f"✅ Results exported to: {output_filename}")

# Create comprehensive summary report (assembled as one string in `report`)

# Work on an ndarray so the element-wise `!= -1` mask below also works when
# the labels were stored as a plain Python list.
label_array = np.asarray(cluster_labels)
# Guard the percentage against an empty label set.
noise_share = n_noise / len(label_array) * 100 if len(label_array) else 0.0
# Real clusters only; the noise record has no sentiment or term fields.
real_clusters = [a for a in cluster_analyses if a['cluster_id'] != -1]

report = f"""
GOOGLE FORM CLUSTERING ANALYSIS - FINAL REPORT
=============================================

📊 DATASET OVERVIEW
===================
Total responses analyzed: {len(export_df)}
Number of clusters found: {n_clusters}
Noise points: {n_noise} ({noise_share:.1f}%)

📈 CLUSTERING QUALITY
====================
"""

# Add silhouette score if available; otherwise say so instead of leaving the
# section empty.
silhouette_line = "Silhouette Score: not available (requires at least two clusters with non-noise points)\n"
if n_clusters > 1:
    from sklearn.metrics import silhouette_score
    non_noise_mask = label_array != -1
    if np.sum(non_noise_mask) > 1:
        silhouette_avg = silhouette_score(X_dense[non_noise_mask],
                                          label_array[non_noise_mask])
        silhouette_line = f"Silhouette Score: {silhouette_avg:.3f}\n"
report += silhouette_line

report += f"""

🏷️ DETAILED CLUSTER ANALYSIS
============================
"""

for analysis in real_clusters:
    sentiment_label = ('Positive' if analysis['avg_sentiment'] > 0.1 else
                       'Negative' if analysis['avg_sentiment'] < -0.1 else
                       'Neutral')

    # Quote the first sample; add an ellipsis only when it was actually cut.
    samples = analysis['sample_responses']
    if samples:
        excerpt = samples[0][:200] + ('...' if len(samples[0]) > 200 else '')
        sample_line = f'"{excerpt}"'
    else:
        sample_line = '(none)'

    report += f"""
Cluster {analysis['cluster_id']}:
- Size: {analysis['size']} responses ({analysis['size']/len(export_df)*100:.1f}%)
- Average sentiment: {analysis['avg_sentiment']:.3f} ({sentiment_label})
- Average response length: {analysis['avg_length']:.1f} words
- Key themes: {', '.join([term for term, _ in analysis['top_terms'][:5]])}
- Sample response: {sample_line}

"""

if n_noise > 0:
    report += f"""
Noise/Outliers:
- Size: {n_noise} responses ({n_noise/len(export_df)*100:.1f}%)
- These are responses that don't fit well into any cluster
"""

report += f"""

📋 SUMMARY INSIGHTS
==================
"""

# Needs at least one real cluster: max()/min() raise on an empty list.
if real_clusters:
    largest_cluster = max(real_clusters, key=lambda x: x['size'])
    most_positive = max(real_clusters, key=lambda x: x['avg_sentiment'])
    most_negative = min(real_clusters, key=lambda x: x['avg_sentiment'])

    report += f"""
- Largest cluster: Cluster {largest_cluster['cluster_id']} with {largest_cluster['size']} responses
- Most positive sentiment: Cluster {most_positive['cluster_id']} (score: {most_positive['avg_sentiment']:.3f})
- Most negative sentiment: Cluster {most_negative['cluster_id']} (score: {most_negative['avg_sentiment']:.3f})
- Overall data quality: {((len(export_df) - n_noise) / len(export_df) * 100):.1f}% of responses were successfully clustered
"""

report += f"""

📁 OUTPUT FILES
===============
1. {output_filename} - Complete results with cluster assignments
2. cluster_summary_report.txt - This detailed analysis report

🚀 NEXT STEPS
=============
- Review cluster themes and validate with domain knowledge
- Use cluster assignments to target specific response groups
- Consider sub-clustering large clusters for deeper insights
- Apply findings to improve future surveys or processes

Generated on: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}
"""

# Save report
report_filename = 'cluster_summary_report.txt'
# The report contains emoji, so write it as UTF-8 explicitly. The default
# encoding is locale-dependent and can raise UnicodeEncodeError (for example
# with cp1252).
with open(report_filename, 'w', encoding='utf-8') as f:
    f.write(report)

# Closing console summary of what was analysed and which files were written.
print(f"✅ Analysis report saved to: {report_filename}")
print(f"\n🎉 ANALYSIS COMPLETED SUCCESSFULLY!")
print(f"📊 Summary:")
print(f"   📋 Total responses: {len(export_df)}")
print(f"   🏷️  Clusters found: {n_clusters}")
print(f"   💾 Files generated:")
print(f"      - {output_filename}")
print(f"      - {report_filename}")

# Re-display the summary table only when at least one cluster was summarised.
if len(summary_df) > 0:
    print(f"\n📊 Final Summary Table:")
    display(summary_df)