# Quick Survey Analysis

This notebook demonstrates techniques for analyzing survey data, including response mapping, statistical analysis, and visualization.

## Setup and Import

In [ ]:
# Import standard data science libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Statistical analysis
from scipy import stats

# Visualization settings
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')
%matplotlib inline

# Import custom survey utility functions
from survey_utils import *

# Import column mapping dictionaries
from column_mappings import *

# Import Snowflake configuration and utilities
from snowflake_config import snowflake_cfg, connect_to_snowflake, execute_sql_file

# Path to SQL file.
# `__file__` is not defined when this code runs inside a Jupyter/IPython
# kernel, so resolve the SQL file against the current working directory
# there; when executed as a plain script the original behaviour is kept.
if '__file__' in globals():
    _BASE_DIR = os.path.dirname(__file__)
else:
    _BASE_DIR = os.getcwd()
SQL_FILE_PATH = os.path.join(_BASE_DIR, 'snowflake_queries.sql')

## Data Loading

Load your survey data. The cells below pull it from Snowflake; adapt them if your data lives in CSV, Excel, or another format.

In [ ]:
# Snowflake connection configuration.
# Start from a copy of the shared defaults and overlay the values for this
# analysis. Replace the placeholders with your actual Snowflake credentials.
my_snowflake_cfg = snowflake_cfg.copy()
my_snowflake_cfg.update(
    account="your_account_identifier",
    user="your_username",
    role="ANALYST",
    warehouse="ANALYTICS_WH",
    database="SURVEY_DATA",
    schema="PUBLIC",
    private_key_path="",  # Set this to the path of your .p8 key file when ready
)

# Parameters passed along with the SQL file that creates the temp tables.
# Change each value as needed.
sql_params = dict(
    survey_id="SURVEY_2023_Q4",
    start_date="2023-10-01",
    end_date="2023-12-31",
)

# Open the Snowflake connection and create the temp tables
snowflake_conn = setup_snowflake_data(
    my_snowflake_cfg,
    SQL_FILE_PATH,
    sql_params,
)

In [ ]:
# Example: Query the temp tables
# Replace this with your own query to load the survey data
if snowflake_conn:
    # Insert your custom query here to load your data
    query = """
    -- Replace this with your query
    SELECT 
        *
    FROM 
        SURVEY_RESPONSES
    LIMIT 10
    """

    # Execute the query and load results into a DataFrame
    df = execute_query(snowflake_conn, query)

    if df is not None:
        print(f"Successfully loaded {len(df)} records from Snowflake")
        # A bare `df.head()` nested inside an `if` is not the cell's last
        # top-level expression, so the notebook would silently discard it.
        # display() renders the preview explicitly.
        display(df.head())
    else:
        print("Query returned no results")
else:
    print("No Snowflake connection available")

## Data Exploration

Explore the dataset structure and contents. This section automatically:
1. Generates a comprehensive summary of all columns
2. Exports the summary to CSV and Excel files
3. Displays key dataset characteristics

# Basic data exploration
if 'df' in locals() and df is not None:
    print(f"Number of survey responses: {len(df)}")

    # Build the per-column summary table and show it
    print("\nGenerating data summary...")
    data_summary = generate_data_summary(df)
    print("\nData Summary:")
    display(data_summary)

    # The summary files are written to the directory of the SQL file:
    # CSV first, then Excel for better readability
    summary_dir = os.path.dirname(SQL_FILE_PATH)
    export_path = os.path.join(summary_dir, 'data_summary.csv')
    excel_path = os.path.join(summary_dir, 'data_summary.xlsx')
    export_data_summary(data_summary, export_path, 'csv')
    export_data_summary(data_summary, excel_path, 'excel')

    # Column dtypes, non-null counts and memory usage
    print("\nDataFrame Info:")
    df.info()

In [ ]:
# Function to style the data summary table for better visualization
def style_data_summary(summary_df):
    """Apply styling to the data summary table."""
    # Define styling function
    def highlight_fill_rate(val):
        """Highlight the fill rate column based on values."""
        if pd.isna(val):
            return ''
        
        # Color-coding based on fill rate percentage
        if val < 50:
            return 'background-color: #ffcccc'  # Light red
        elif val < 80:
            return 'background-color: #ffffcc'  # Light yellow
        else:
            return 'background-color: #ccffcc'  # Light green
    
    # Apply styling
    styled = summary_df.style.applymap(
        highlight_fill_rate, 
        subset=['Fill Rate (%)']
    ).set_properties(**{
        'text-align': 'left',
        'white-space': 'pre-wrap'
    }).set_table_styles([
        {'selector': 'th', 'props': [('background-color', '#eaeaea'), 
                                    ('font-weight', 'bold'),
                                    ('text-align', 'left')]},
        {'selector': '.row_heading', 'props': [('display', 'none')]},  # Hide index
    ])
    
    return styled

# Render the colour-coded summary, but only when a non-empty summary exists
summary_ready = 'data_summary' in locals() and not data_summary.empty
if summary_ready:
    styled_summary = style_data_summary(data_summary)
    display(styled_summary)

In [ ]:
# Visualize data completeness
if 'data_summary' in locals() and not data_summary.empty:
    # Order the columns by descending fill rate
    completeness_df = data_summary.sort_values('Fill Rate (%)', ascending=False)
    fill_rates = completeness_df['Fill Rate (%)']

    # Figure height grows with the number of columns, with a floor of 6
    chart_height = max(6, len(data_summary) * 0.3)
    plt.figure(figsize=(12, chart_height))

    # Horizontal bars, coloured along the viridis map by fill rate
    bar_colors = plt.cm.viridis(fill_rates / 100)
    bars = plt.barh(completeness_df['Column'], fill_rates, color=bar_colors)

    # Write the percentage just past the end of each bar
    for bar in bars:
        pct = bar.get_width()
        mid_y = bar.get_y() + bar.get_height()/2
        plt.text(pct + 1, mid_y, f'{pct:.1f}%',
                 va='center', fontweight='bold')

    plt.xlabel('Fill Rate (%)')
    plt.title('Data Completeness by Column')
    plt.xlim(0, 105)  # Leave room for the percentage labels
    plt.grid(axis='x', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

    # Second chart: share of each data type among the summarised columns
    type_counts = data_summary['Data Type'].value_counts()

    plt.figure(figsize=(8, 5))
    plt.pie(
        type_counts,
        labels=type_counts.index,
        autopct='%1.1f%%',
        shadow=True,
        startangle=90,
        colors=plt.cm.tab10.colors,
    )
    plt.title('Distribution of Data Types')
    plt.axis('equal')  # Equal aspect ratio keeps the pie circular
    plt.tight_layout()
    plt.show()

In [ ]:
# View response distributions for key columns
if 'df' in locals() and df is not None:
    # Object-dtype columns with between 2 and 14 distinct values are treated
    # as categorical
    categorical_cols = [
        col for col in df.columns
        if df[col].dtype == 'object' and 1 < df[col].nunique() < 15
    ]

    if not categorical_cols:
        print("No suitable categorical columns found for distribution analysis.")
    else:
        print("Response distributions for key categorical columns:")
        # Only the first 5 columns are shown to keep the output manageable
        for col in categorical_cols[:5]:
            readable_name = col.replace('_', ' ').title()
            print(f"\n{readable_name} Distribution:")
            value_counts = df[col].value_counts().sort_index()
            print(value_counts)

            # Count plot using the same category order as the printed table
            plt.figure(figsize=(10, 6))
            sns.countplot(y=col, data=df, order=value_counts.index)
            plt.title(f'Distribution of {readable_name}')
            plt.tight_layout()
            plt.show()

In [ ]:
# Apply mappings to your data
# Example: if 'response_text' exists, map it based on the question type
def map_responses_by_category(frame, category_mappings,
                              text_col='response_text',
                              category_col='question_category'):
    """Translate response text into scores using one mapping per category.

    Args:
        frame: DataFrame containing ``text_col`` and ``category_col``.
        category_mappings: dict of category value -> {response text: score}.
        text_col: Name of the column holding the raw response text.
        category_col: Name of the column holding the question category.

    Returns:
        A Series with one entry per row of ``frame``. Rows whose category has
        no mapping, or whose text is absent from the mapping, are NaN. The
        result is numeric when every mapped value is numeric; otherwise it is
        returned with object dtype.
    """
    scores = pd.Series(np.nan, index=frame.index, dtype=object)

    for category, mapping in category_mappings.items():
        selected = (frame[category_col] == category).to_numpy()
        # Skip categories with no rows so nothing is assigned to an empty
        # selection
        if selected.any():
            # Series.map is a vectorised dict lookup (unmapped text -> NaN);
            # to_numpy() assigns by position, not by index label
            scores.loc[selected] = (
                frame.loc[selected, text_col].map(mapping).to_numpy()
            )

    # A numeric dtype lets the downstream statistics, plots and correlations
    # treat the scores as numbers, not as generic objects
    try:
        return pd.to_numeric(scores)
    except (ValueError, TypeError):
        return scores


if ('df' in locals() and df is not None
        and 'response_text' in df.columns
        and 'question_category' in df.columns):
    # New column for the numerical scores, one mapping per question category
    df['response_score'] = map_responses_by_category(df, {
        'SATISFACTION': satisfaction_mapping,
        'AGREEMENT': likert_scale_mapping,
        'FREQUENCY': frequency_mapping,
        'YES_NO': yes_no_mapping,
    })
# Examples of mappings for specific columns
# Uncomment and modify as needed for your specific data
"""
# Satisfaction columns
for col in satisfaction_columns:
    if col in df.columns:
        df[f"{col}_score"] = apply_mapping_if_exists(df, col, satisfaction_mapping)

# Likert scale columns (agreement-based)
likert_cols = ['product_quality', 'ease_of_use', 'recommend', 'value_for_money']
for col in likert_cols:
    if col in df.columns:
        df[f"{col}_score"] = apply_mapping_if_exists(df, col, likert_scale_mapping)

# Frequency columns
freq_cols = ['usage_frequency', 'visit_frequency', 'purchase_frequency']
for col in freq_cols:
    if col in df.columns:
        df[f"{col}_score"] = apply_mapping_if_exists(df, col, frequency_mapping)
"""

# Display sample of the mapped data.
# The guard matches the other cells so a missing DataFrame does not raise.
# display() is needed because an expression nested inside an `if` is never
# auto-rendered by the notebook.
if 'df' in locals() and df is not None and 'response_score' in df.columns:
    display(df[['response_text', 'question_category', 'response_score']].head(10))

# Calculate statistics for numerical columns
if 'df' in locals() and df is not None:
    # Mapped score columns come first, followed by any other int64/float64
    # columns that are not already listed
    numerical_cols = [col for col in df.columns if col.endswith('_score')]
    for col in df.columns:
        if df[col].dtype in ['int64', 'float64'] and col not in numerical_cols:
            numerical_cols.append(col)

    if numerical_cols:
        print("Response statistics for numerical columns:")
        for col in numerical_cols:
            # Only analyze columns that have at least one non-null value
            if not df[col].notna().any():
                continue

            # Named `col_stats`, not `stats`: the latter would overwrite the
            # `scipy.stats` module imported at the top of the notebook.
            col_stats = calculate_response_stats(df, col)
            print(f"\n{col.replace('_score', '').replace('_', ' ').title()} Statistics:")
            for stat, value in col_stats.items():
                print(f"{stat}: {value:.2f}")

            # Optional: Create boxplot for the numerical column
            plt.figure(figsize=(8, 6))
            sns.boxplot(y=df[col].dropna())
            plt.title(f'Distribution of {col.replace("_", " ").title()}')
            plt.tight_layout()
            plt.show()
    else:
        print("No numerical columns found for statistical analysis.")

In [None]:
# Calculate statistics for score columns
# Guarded like the other cells so a missing DataFrame does not raise.
if 'df' in locals() and df is not None:
    score_columns = [col for col in df.columns if col.endswith('_score')]

    for col in score_columns:
        # Named `col_stats`, not `stats`: the latter would overwrite the
        # `scipy.stats` module imported at the top of the notebook.
        col_stats = calculate_response_stats(df, col)
        print(f"\n{col.replace('_score', '').replace('_', ' ').title()} Statistics:")
        for stat, value in col_stats.items():
            print(f"{stat}: {value:.2f}")

# Plot distribution of numerical columns
if 'df' in locals() and df is not None:
    # Score columns are identified by their '_score' suffix
    score_cols = [name for name in df.columns if name.endswith('_score')]

    if not score_cols:
        print("No score columns found for distribution plotting.")
    else:
        # Only the first score column is plotted, as a demonstration
        score_col = score_cols[0]
        print(f"Distribution plot for {score_col}:")
        chart_title = f'Distribution of {score_col.replace("_score", "").replace("_", " ").title()} Scores'
        plot_response_distribution(df, score_col, chart_title)

In [ ]:
# Correlation analysis for numerical columns
if 'df' in locals() and df is not None:
    # Mapped score columns first, then every other int64/float64 column
    score_cols = [name for name in df.columns if name.endswith('_score')]
    numerical_cols = score_cols + [
        name for name in df.columns
        if df[name].dtype in ['int64', 'float64'] and name not in score_cols
    ]

    # A correlation needs at least two columns
    if len(numerical_cols) < 2:
        print("Not enough numerical columns for correlation analysis.")
    else:
        print("Correlation analysis for numerical columns:")
        corr_matrix = correlation_matrix(df, numerical_cols)
        print(corr_matrix.round(2))

        # Heatmap of the same matrix
        plot_correlation_heatmap(corr_matrix, 'Correlation Between Survey Metrics')

# Segment analysis example
# This needs to be adapted to your specific data structure
# Replace with your own demographic variables and response variables
#
# Both variants need 'respondent_id' and 'response_score'. The question-level
# variant needs extra columns on top of those, so it is tested first inside
# the shared guard; if it were tested after the respondent-level check it
# could never run, because that check already matches the same data.
if ('df' in locals() and df is not None
        and {'respondent_id', 'response_score'}.issubset(df.columns)):
    # Demographic columns from our predefined list that exist in the dataframe
    available_demographic_cols = [col for col in demographic_columns if col in df.columns]
    is_question_level = {'question_id', 'question_category'}.issubset(df.columns)

    if not available_demographic_cols:
        print("No demographic columns from the predefined list are available in the dataset.")

    # Example: data is at the question-response level and needs aggregation
    elif is_question_level:
        print("Analyzing responses by demographic segments and question category:")

        for segment in available_demographic_cols:
            # Only analyze segments that have more than one distinct value
            if df[segment].nunique() > 1:
                # Mean score per (segment, question category) pair
                grouped = df.groupby([segment, 'question_category'])['response_score'].mean().reset_index()

                # Segment values as rows, question categories as columns
                pivot_table = grouped.pivot(index=segment, columns='question_category', values='response_score')

                print(f"\n{segment.replace('_', ' ').title()} by Question Category:")
                print(pivot_table)

    # Example: data is at the respondent level with multiple demographic fields
    else:
        print("Analyzing responses by demographic segments:")
        for segment in available_demographic_cols:
            # Only analyze segments that have more than one distinct value
            if df[segment].nunique() > 1:
                segment_analysis_result = segment_analysis(df, segment, 'response_score')
                print(f"\n{segment.replace('_', ' ').title()} Segment Analysis:")
                print(segment_analysis_result)

                # Plot only when there are between 1 and 10 segment values
                if 0 < len(segment_analysis_result) <= 10:
                    plt.figure(figsize=(10, 6))
                    sns.barplot(x=segment_analysis_result.index, y=segment_analysis_result.values)
                    plt.title(f'Average Response by {segment.replace("_", " ").title()}')
                    plt.ylabel('Average Response Score')
                    plt.xticks(rotation=45)
                    plt.tight_layout()
                    plt.show()

In [None]:
# Visualize segment analysis
# `satisfaction_by_age` is not assigned by any cell shown in this notebook.
# It is expected to expose `.index` and `.values` (for example a Series of
# mean satisfaction per age group). If it has not been computed, the plot is
# skipped with a hint so the cell does not fail with a NameError.
if 'satisfaction_by_age' in locals() and satisfaction_by_age is not None:
    plt.figure(figsize=(10, 6))
    sns.barplot(x=satisfaction_by_age.index, y=satisfaction_by_age.values)
    plt.title('Average Satisfaction by Age Group')
    plt.ylabel('Average Satisfaction Score')
    plt.ylim(1, 5)
    plt.tight_layout()
    plt.show()
else:
    print("satisfaction_by_age is not defined; compute it before running this cell.")

# Cross-tabulation analysis example
if 'df' in locals() and df is not None:
    # Object-dtype columns with between 2 and 9 distinct values
    categorical_cols = [
        name for name in df.columns
        if df[name].dtype == 'object' and 1 < df[name].nunique() < 10
    ]

    if len(categorical_cols) < 2:
        print("Not enough categorical columns for cross-tabulation analysis.")
    else:
        # The first two categorical columns serve as the example pair
        col1, col2 = categorical_cols[:2]

        print(f"Cross-tabulation of {col1} by {col2}:")
        cross_tab = response_cross_tabulation(df, col1, col2, normalize='index')
        print(cross_tab.round(2))

        # Annotated heatmap of the same table
        plt.figure(figsize=(12, 8))
        sns.heatmap(cross_tab, annot=True, cmap='viridis', fmt='.2f')
        plt.title(f'Cross-tabulation of {col1} by {col2} (Normalized by Row)')
        plt.tight_layout()
        plt.show()

# Free text analysis example
if 'df' in locals() and df is not None:
    # An object-dtype column counts as free text when its values, cast to
    # str, average more than 20 characters
    text_cols = [
        name for name in df.columns
        if df[name].dtype == 'object'
        and df[name].astype(str).str.len().mean() > 20
    ]

    if not text_cols:
        print("No free text fields identified for text analysis.")
    else:
        # Example: word count analysis on the first identified text column
        text_col = text_cols[0]
        print(f"Word count analysis for {text_col}:")

        word_counts = free_text_word_count(df, text_col, min_count=2)
        if word_counts.empty:
            print(f"No meaningful word counts found in {text_col}")
        else:
            print(word_counts.head(10))

            # Bar chart of the first 15 entries
            top_words = word_counts.index[:15]
            top_counts = word_counts.values[:15]
            plt.figure(figsize=(12, 6))
            sns.barplot(x=top_words, y=top_counts)
            plt.title(f'Top Words in {text_col}')
            plt.xlabel('Word')
            plt.ylabel('Count')
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.show()

In [None]:
# Get word counts from feedback
# This is a simplified example - in practice you would want to:
# 1. Remove stop words (common words like 'the', 'and', etc.)
# 2. Tokenize properly, considering n-grams
# 3. Apply stemming or lemmatization
# 4. Consider using NLP libraries like NLTK or spaCy

# Guarded like the other cells: the 'feedback' column is specific to the
# example dataset and may not exist in yours.
if 'df' in locals() and df is not None and 'feedback' in df.columns:
    word_counts = free_text_word_count(df, 'feedback', min_count=5)
    # display() is needed because the expression is now nested inside an `if`
    display(word_counts.head(10))
else:
    print("No 'feedback' column available for word count analysis.")

## Key Insights and Recommendations

Summarize key findings from the survey analysis and provide recommendations.

Based on the analysis above, key insights include:

1. [Add insights based on actual analysis]
2. ...

Recommendations:

1. [Add recommendations based on insights]
2. ...