# MLP Visualiser

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats

In [None]:
# Load the survey data; the plotting cells below pass this frame around as `df`.
df = pd.read_csv('ous_data/ous_align2.csv')

In [None]:
# Show the (rows, columns) dimensions of the loaded frame.
df.shape

In [None]:
# List the column labels of the loaded frame.
df.columns

In [None]:
# df1 = df[['IB3', 'IH1', 'IB2', 'IH2', 'IB1', 'IH3', 'IB4', 'IH4', 'IB5',
#         'native_language', 'Q_Lang',
#        'religion_1', 'religion_2', 'religion_3', 'religion_4', 'religion_5',
#        'religion_6', 'religion_7', 'religion_8', 'religion_9', 'religion_10',
#        'religion_11', 'religion_12', 'religion_13', 'religion_14',
#        'religion_15', 'religion_16', 'education_leve', 'sex', 'countr_origin_1', 'country3',
#        'Age']]

## Data processing

In [None]:
def analyze_questionnaire(df, n_questions=9, top_n_countries=9):
    """
    Create a 2x2 grid of visualisations of questionnaire answers by demographics.

    Panels (left to right, top to bottom):
        1. heatmap of mean answers per age group
        2. bar chart of mean answers per education level
        3. box plot of answers for the most frequent countries
        4. line chart of mean answers per sex

    Parameters:
    df: pandas DataFrame with columns:
        - first `n_questions` columns: questionnaire answers
        - age: respondent's age
        - education_level: education level
        - country: country of residence
        - sex: respondent's sex
    n_questions: number of leading columns treated as questionnaire answers
    top_n_countries: how many of the most frequent countries panel 3 shows

    Returns:
    The matplotlib Figure holding all four panels. `df` itself is not
    modified. Note that the global matplotlib style is switched to 'ggplot'.
    """
    # Set up the plotting style (global side effect on matplotlib)
    plt.style.use('ggplot')

    # One explicit Axes per panel, so every plot lands on the returned figure
    fig, axes = plt.subplots(2, 2, figsize=(20, 15))

    # The same answer columns feed all four panels
    question_cols = df.columns[:n_questions]
    answers = df.iloc[:, :n_questions]

    # 1. Average responses by age groups
    ax = axes[0, 0]
    # Keep the age bands in a local Series instead of adding a column to the
    # caller's frame. Bins are right-closed: (0, 25], (25, 35], ...
    age_group = pd.cut(df['age'], bins=[0, 25, 35, 45, 55, 100],
                       labels=['18-25', '26-35', '36-45', '46-55', '55+'])
    age_means = answers.groupby(age_group, observed=False).mean()
    sns.heatmap(age_means, cmap='YlOrRd', annot=True, fmt='.2f', ax=ax)
    ax.set_title('Average Responses by Age Group')
    ax.set_xlabel('Question Number')
    ax.set_ylabel('Age Group')

    # 2. Response distribution by education level
    ax = axes[0, 1]
    edu_means = answers.groupby(df['education_level']).mean().T
    edu_means.plot(kind='bar', ax=ax)
    ax.set_title('Response Patterns by Education Level')
    ax.set_xlabel('Question Number')
    ax.set_ylabel('Average Response')
    ax.legend(title='Education Level', bbox_to_anchor=(1.05, 1))
    ax.tick_params(axis='x', labelrotation=45)

    # 3. Country-wise response patterns (most frequent countries only)
    ax = axes[1, 0]
    top_countries = df['country'].value_counts().nlargest(top_n_countries).index
    country_data = df[df['country'].isin(top_countries)]
    # Long format: one row per (respondent, question) pair for the box plot
    long_form = country_data.melt(id_vars=['country'], value_vars=question_cols)
    sns.boxplot(data=long_form, x='country', y='value', ax=ax)
    ax.set_title(f'Response Distribution by Country (Top {top_n_countries})')
    ax.tick_params(axis='x', labelrotation=45)
    ax.set_xlabel('Country')
    ax.set_ylabel('Response Value')

    # 4. Gender comparison
    ax = axes[1, 1]
    gender_means = answers.groupby(df['sex']).mean().T
    # ax must be passed explicitly: DataFrame.plot opens a new figure otherwise
    gender_means.plot(kind='line', marker='o', ax=ax)
    ax.set_title('Gender Comparison Across Questions')
    ax.set_xlabel('Question Number')
    ax.set_ylabel('Average Response')
    ax.legend(title='Gender')

    fig.tight_layout()

    return fig

In [None]:
# Build the demographic overview figure for the loaded data and display it.
fig = analyze_questionnaire(df)
plt.show()

In [None]:
def plot_continent_analysis(df):
    """
    Draw an annotated heatmap of the mean answer to each question per continent.

    Parameters:
    df: pandas DataFrame whose first 9 columns hold the questionnaire
        responses and which has a 'continent' column used as grouping key

    Returns:
    The matplotlib Figure containing the heatmap.
    """
    figure, axis = plt.subplots(figsize=(12, 8))

    # One row per continent, one column per question
    answers = df.iloc[:, :9]
    per_continent = answers.groupby(df['continent']).mean()

    heatmap_options = {
        'cmap': 'viridis',
        'annot': True,
        'fmt': '.2f',
        'cbar_kws': {'label': 'Average Response'},
    }
    sns.heatmap(per_continent, ax=axis, **heatmap_options)

    # Labels are set after drawing so they replace seaborn's automatic ones
    axis.set_title('Average Questionnaire Responses by Continent')
    axis.set_xlabel('Question Number')
    axis.set_ylabel('Continent')

    # Keep labels and colour bar inside the canvas
    figure.tight_layout()

    return figure


In [None]:
# Heatmap of mean answers per continent for the loaded data.
fig = plot_continent_analysis(df)
plt.show()

In [None]:
# Load the second dataset; the combined plots below receive it as `df_new`
# and read its 'persona', 'question' and 'score' columns.
df_nat = pd.read_csv('data/20241124101056nationalities.csv')

In [None]:
# Lookup table from nationality label to continent name. The combined plots
# apply it to the 'persona' column of the second dataset to form the
# per-continent groups.
nationality_to_continent = {
    'Indian': 'Asia',
    'Chinese': 'Asia',
    'American': 'North America',
    'Canadian': 'North America',
    'British': 'Europe',
    'French': 'Europe',
    'Emirati': 'Asia',
    'Malaysian': 'Asia',
    'Lebanese': 'Asia',
    'Thai': 'Asia',
    'Macedonian': 'Europe',
    'Pakistani': 'Asia',
    'Iranian': 'Asia',
    'Japanese': 'Asia',
    'Hungarian': 'Europe',
    'Colombian': 'South America',
    'Argentinian': 'South America',
    'Slovak': 'Europe',  # Slovakia is in Europe (was wrongly 'South America')
    'Turkish': 'Asia',
    'Ecuadorian': 'South America',
    'Chilean': 'South America',
    'Czech': 'Europe',
    'Peruvian': 'South America',
    'Filipino': 'Asia',
    'Mexican': 'North America',
    'Serbian': 'Europe',
    # NOTE(review): Russia spans Europe and Asia and the UN geoscheme lists it
    # under Europe -- confirm that 'Asia' is the intended grouping.
    'Russian': 'Asia',
    'German': 'Europe',
    'Austrian': 'Europe',
    'Polish': 'Europe',
    'Danish': 'Europe',
    'Italian': 'Europe',
    'Australian': 'Oceania',
    'Portuguese': 'Europe',
    'Kazakh': 'Asia',
    'Greek': 'Europe',
    'Spanish': 'Europe',
    'Bulgarian': 'Europe',
    'New Zealander': 'Oceania',  # Also commonly "Kiwi"
    'Brazilian': 'South America',
    'Dutch': 'Europe',
    'Croatian': 'Europe',
    'Romanian': 'Europe',
    'Swiss': 'Europe',
    'Singaporean': 'Asia',
}

In [None]:
def plot_combined_continent_analysis(df_old, df_new, nationality_to_continent):
    """
    Create error graph combining old and new data formats, grouped by continents.

    Each continent gets one dashed line (original data) and/or one solid line
    (new data) showing the mean per question with standard-error bars.
    Neither input frame is modified.

    Parameters:
    df_old: Original DataFrame whose first 9 columns are the questionnaire
        answers and which has a 'continent' column, or None to skip it
    df_new: New DataFrame with columns persona, question, score, or None
        to skip it
    nationality_to_continent: Dictionary mapping nationalities to continents

    Returns:
    The matplotlib Figure containing the plot.
    """
    fig, ax = plt.subplots(figsize=(15, 8))
    n_ticks = 0  # widest question axis among the datasets actually plotted

    # Process old format data
    if df_old is not None:
        # Translate nationality labels to continents, but keep any value that
        # is not a key of the mapping (e.g. one that already is a continent
        # name) instead of turning it into NaN. Work on a local Series so the
        # caller's column is never overwritten.
        raw_continent = df_old['continent']
        continent_old = raw_continent.map(nationality_to_continent).fillna(raw_continent)

        # Means and standard errors of the answer columns per continent
        answers_old = df_old.iloc[:, :9]
        means_old = answers_old.groupby(continent_old).mean()
        errors_old = answers_old.groupby(continent_old).sem()

        # x positions must match the number of plotted columns exactly
        x_old = range(means_old.shape[1])
        n_ticks = max(n_ticks, len(x_old))
        for continent in means_old.index:
            ax.errorbar(x_old,
                        means_old.loc[continent],
                        yerr=errors_old.loc[continent],
                        fmt='o--',  # dashed line with circles
                        capsize=3,
                        label=f'{continent} (Original)',
                        markersize=6,
                        alpha=0.7)  # slight transparency

    # Process new format data
    if df_new is not None:
        # Personas without an entry in the mapping become NaN and are dropped
        # by groupby
        continent_new = df_new['persona'].map(nationality_to_continent).rename('continent')

        scores = df_new.groupby([continent_new, 'question'])['score']
        means_new = scores.mean().unstack()
        errors_new = scores.sem().unstack()

        x_new = range(means_new.shape[1])
        n_ticks = max(n_ticks, len(x_new))
        for continent in means_new.index:
            ax.errorbar(x_new,
                        means_new.loc[continent],
                        yerr=errors_new.loc[continent],
                        fmt='s-',  # solid line with squares
                        capsize=3,
                        label=f'{continent} (New)',
                        markersize=6)

    # Customize the plot
    ax.set_xlabel('Question Number')
    ax.set_ylabel('Score')
    ax.set_title('Response Analysis by Continent\nCombining Both Data Formats')
    # 1-based question labels
    ax.set_xticks(range(n_ticks))
    ax.set_xticklabels([f'Q{i + 1}' for i in range(n_ticks)])
    ax.grid(True, alpha=0.3)

    # Legend sits outside the axes on the right
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    # NOTE(review): fixed upper limit of 5; scores above 5 would be cut off --
    # confirm against the response scale of the data.
    ax.set_ylim(0, 5)

    fig.tight_layout()
    return fig


In [None]:
# Per-continent means of both datasets on one set of axes.
fig = plot_combined_continent_analysis(df, df_nat, nationality_to_continent)

In [None]:
def plot_simple_error_graph(df):
    """
    Plot one line per continent: mean answer per question with standard-error bars.

    Parameters:
    df: pandas DataFrame whose first 9 columns are the questionnaire
        responses and which has a 'continent' column

    Returns:
    The matplotlib Figure containing the plot.
    """
    figure, axis = plt.subplots(figsize=(12, 6))

    # Per-continent statistics of the nine answer columns
    by_continent = df.iloc[:, :9].groupby(df['continent'])
    means = by_continent.mean()
    errors = by_continent.sem()

    # One x position per question
    x = range(9)

    for name, row in means.iterrows():
        axis.errorbar(
            x,
            row,
            yerr=errors.loc[name],
            fmt='o-',      # line with circles
            capsize=3,     # error bar cap width
            label=name,
            markersize=6,
        )

    # Axis decoration
    axis.set_xlabel('Question Number')
    axis.set_ylabel('Average Response')
    axis.set_title('Average Responses by Continent')
    axis.set_xticks(x)
    axis.set_xticklabels([f'Q{i+1}' for i in x])
    axis.grid(True, alpha=0.3)
    axis.legend()

    figure.tight_layout()
    return figure


In [None]:
def combined_plot_2(df_old, df_new, nationality_to_continent=nationality_to_continent):
    """
    Create error graph showing old and new datasets as separate lines.

    Each dataset is pooled over all respondents: one line with the mean per
    question and standard-error bars. Neither input frame is modified.

    Parameters:
    df_old: Original DataFrame whose first 9 columns are the questionnaire
        answers, or None to skip it
    df_new: New DataFrame with columns question and score, or None to skip it
    nationality_to_continent: Dictionary mapping nationalities to continents.
        Not used here because nothing is grouped by continent; kept so the
        signature matches the other combined_plot_* functions.

    Returns:
    The matplotlib Figure containing the plot.
    """
    fig, ax = plt.subplots(figsize=(15, 8))
    max_questions = 0  # widest question axis among the datasets plotted

    # Process old format data
    if df_old is not None:
        # Mean and standard error across all responses. The 'continent'
        # column is deliberately left untouched: this plot does not need it.
        answers_old = df_old.iloc[:, :9]
        means_old = answers_old.mean()
        errors_old = answers_old.sem()

        x_old = range(len(means_old))
        max_questions = max(max_questions, len(x_old))
        ax.errorbar(x_old,
                    means_old,
                    yerr=errors_old,
                    fmt='o-',  # solid line with circles
                    capsize=3,
                    label='Original Dataset',
                    markersize=6,
                    color='blue')

    # Process new format data
    if df_new is not None:
        # Mean and standard error for each question
        scores = df_new.groupby('question')['score']
        means_new = scores.mean()
        errors_new = scores.sem()

        # NOTE(review): both datasets are aligned by position only (column
        # order vs. sorted 'question' values) -- confirm the orders match.
        x_new = range(len(means_new))
        max_questions = max(max_questions, len(x_new))
        ax.errorbar(x_new,
                    means_new,
                    yerr=errors_new,
                    fmt='s-',  # solid line with squares
                    capsize=3,
                    label='New Dataset',
                    markersize=6,
                    color='red')

    # Customize the plot
    ax.set_xlabel('Question Number')
    ax.set_ylabel('Score')
    ax.set_title('Comparison of Dataset Responses\nwith Standard Error Bars')

    # x-ticks follow the larger dataset; labels are 1-based
    ax.set_xticks(range(max_questions))
    ax.set_xticklabels([f'Q{i + 1}' for i in range(max_questions)])

    ax.grid(True, alpha=0.3)
    ax.legend()

    # Set y-axis limits
    ax.set_ylim(0, 7)

    fig.tight_layout()
    return fig


In [None]:
# Pooled comparison of the two datasets (default nationality mapping).
fig = combined_plot_2(df, df_nat)

In [None]:
def combined_plot_3(df_old, df_new, nationality_to_continent):
    """
    Create error graph showing continent-specific lines for each dataset,
    using different shades of colors for continents within each dataset.

    Original data is drawn in shades of blue, new data in shades of red;
    continents without a predefined shade are drawn in black. Neither input
    frame is modified.

    Parameters:
    df_old: Original DataFrame whose first 9 columns are the questionnaire
        answers and which has a 'continent' column, or None to skip it
    df_new: New DataFrame with columns persona, question, score, or None
        to skip it
    nationality_to_continent: Dictionary mapping nationalities to continents

    Returns:
    The matplotlib Figure containing the plot.
    """
    fig, ax = plt.subplots(figsize=(15, 8))
    max_questions = 0  # widest question axis among the datasets plotted

    # Color maps for each dataset
    old_colors = {'Asia': '#1f77b4',           # dark blue
                  'Europe': '#4e9fd3',         # medium blue
                  'North America': '#7ec7eb',  # light blue
                  'South America': '#a8e0ff',  # very light blue
                  'Oceania': '#cceeff'}        # pale blue

    new_colors = {'Asia': '#d62728',           # dark red
                  'Europe': '#e74c4c',         # medium red
                  'North America': '#f47171',  # light red
                  'South America': '#ff9696',  # very light red
                  'Oceania': '#ffbaba'}        # pale red

    # Process old format data
    if df_old is not None:
        # Translate nationality labels to continents but keep values that are
        # not keys of the mapping (e.g. already continent names). A local
        # Series is used so the caller's column is never overwritten.
        raw_continent = df_old['continent']
        continent_old = raw_continent.map(nationality_to_continent).fillna(raw_continent)

        # Means and standard errors per continent
        answers_old = df_old.iloc[:, :9]
        means_old = answers_old.groupby(continent_old).mean()
        errors_old = answers_old.groupby(continent_old).sem()

        x_old = range(means_old.shape[1])
        max_questions = max(max_questions, len(x_old))
        # Every index entry comes from a non-empty group, so no emptiness
        # check is needed
        for continent in means_old.index:
            ax.errorbar(x_old,
                        means_old.loc[continent],
                        yerr=errors_old.loc[continent],
                        fmt='o-',  # solid line with circles
                        capsize=3,
                        label=f'{continent} (Original)',
                        markersize=6,
                        color=old_colors.get(continent, '#000000'))

    # Process new format data
    if df_new is not None:
        # Personas missing from the mapping become NaN and are dropped by groupby
        continent_new = df_new['persona'].map(nationality_to_continent).rename('continent')

        scores = df_new.groupby([continent_new, 'question'])['score']
        means_new = scores.mean().unstack()
        errors_new = scores.sem().unstack()

        x_new = range(means_new.shape[1])
        max_questions = max(max_questions, len(x_new))
        for continent in means_new.index:
            ax.errorbar(x_new,
                        means_new.loc[continent],
                        yerr=errors_new.loc[continent],
                        fmt='s-',  # solid line with squares
                        capsize=3,
                        label=f'{continent} (New)',
                        markersize=6,
                        color=new_colors.get(continent, '#000000'))

    # Customize the plot
    ax.set_xlabel('Question Number')
    ax.set_ylabel('Score')
    ax.set_title('Scores by Continent for Both Datasets\nwith Standard Error Bars')

    # x-ticks follow the larger dataset; labels are 1-based
    ax.set_xticks(range(max_questions))
    ax.set_xticklabels([f'Q{i + 1}' for i in range(max_questions)])

    ax.grid(True, alpha=0.3)

    # Legend outside the axes to prevent overlap with the lines
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    # Set y-axis limits
    ax.set_ylim(0, 8)

    fig.tight_layout()
    return fig

In [None]:
# Per-continent lines for both datasets, shaded blue (original) / red (new).
fig = combined_plot_3(df, df_nat, nationality_to_continent)


In [None]:
def combined_plot_4(df_old, df_new, nationality_to_continent=nationality_to_continent):
    """
    Create error graph showing continent-specific lines for each dataset,
    with clear color distinction between old and new datasets.

    Only the five continents in the fixed plotting order are drawn; original
    data uses blues, new data uses reds. Neither input frame is modified.

    Parameters:
    df_old: Original DataFrame whose first 9 columns are the questionnaire
        answers and which has a 'continent' column, or None to skip it
    df_new: New DataFrame with columns persona, question, score, or None
        to skip it
    nationality_to_continent: Dictionary mapping nationalities to continents

    Returns:
    The matplotlib Figure containing the plot.
    """
    fig, ax = plt.subplots(figsize=(15, 8))
    max_questions = 0  # widest question axis among the datasets plotted

    # Text boxes naming the colour family of each dataset. x=1.02 is in figure
    # coordinates, i.e. just right of the canvas edge, so the boxes only show
    # when the figure is rendered with a tight bounding box.
    fig.text(1.02, 0.7, 'Original Dataset\n(Blues)',
             bbox=dict(facecolor='white', alpha=0.8, edgecolor='blue'),
             color='blue')
    fig.text(1.02, 0.6, 'New Dataset\n(Reds)',
             bbox=dict(facecolor='white', alpha=0.8, edgecolor='red'),
             color='red')

    # Fixed plotting order keeps legend order and colours stable
    continent_order = ['Asia', 'Europe', 'North America', 'South America', 'Oceania']

    # Color gradients using deeper blues and reds
    old_colors = {
        'Asia': '#000080',           # navy blue
        'Europe': '#0000FF',         # blue
        'North America': '#4169E1',  # royal blue
        'South America': '#6495ED',  # cornflower blue
        'Oceania': '#87CEEB',        # sky blue
    }

    new_colors = {
        'Asia': '#8B0000',           # dark red
        'Europe': '#DC143C',         # crimson
        'North America': '#FF0000',  # red
        'South America': '#FA8072',  # salmon
        'Oceania': '#FFB6C1',        # light pink
    }

    # Process old format data
    if df_old is not None:
        # Translate nationality labels to continents but keep values that are
        # not keys of the mapping (e.g. already continent names). A local
        # Series is used so the caller's column is never overwritten.
        raw_continent = df_old['continent']
        continent_old = raw_continent.map(nationality_to_continent).fillna(raw_continent)

        answers_old = df_old.iloc[:, :9]
        means_old = answers_old.groupby(continent_old).mean()
        errors_old = answers_old.groupby(continent_old).sem()

        x_old = range(means_old.shape[1])
        max_questions = max(max_questions, len(x_old))
        for continent in continent_order:
            if continent not in means_old.index:
                continue
            ax.errorbar(x_old,
                        means_old.loc[continent],
                        yerr=errors_old.loc[continent],
                        fmt='o-',  # solid line with circles
                        capsize=3,
                        label=f'{continent} (Original)',
                        markersize=6,
                        color=old_colors[continent],
                        alpha=0.8)

    # Process new format data
    if df_new is not None:
        # Personas missing from the mapping become NaN and are dropped by groupby
        continent_new = df_new['persona'].map(nationality_to_continent).rename('continent')

        scores = df_new.groupby([continent_new, 'question'])['score']
        means_new = scores.mean().unstack()
        errors_new = scores.sem().unstack()

        x_new = range(means_new.shape[1])
        max_questions = max(max_questions, len(x_new))
        for continent in continent_order:
            if continent not in means_new.index:
                continue
            ax.errorbar(x_new,
                        means_new.loc[continent],
                        yerr=errors_new.loc[continent],
                        fmt='s-',  # solid line with squares
                        capsize=3,
                        label=f'{continent} (New)',
                        markersize=6,
                        color=new_colors[continent],
                        alpha=0.8)

    # Customize the plot
    ax.set_xlabel('Question Number')
    ax.set_ylabel('Score')
    ax.set_title('Scores by Continent for Both Datasets\nwith Standard Error Bars')

    # x-ticks follow the larger dataset; labels are 1-based
    ax.set_xticks(range(max_questions))
    ax.set_xticklabels([f'Q{i + 1}' for i in range(max_questions)])

    ax.grid(True, alpha=0.3)

    # Legend outside the axes on the right
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    ax.set_ylim(0, 7)

    # Increase right margin to accommodate the legend and dataset labels
    fig.subplots_adjust(right=0.85)

    return fig


In [None]:
# Per-continent comparison with fixed continent order (default mapping).
fig = combined_plot_4(df, df_nat)
plt.show()

In [None]:
def combined_plot_5(df_old, df_new, nationality_to_continent=nationality_to_continent):
    """
    Create error graph showing continent-specific lines for each dataset,
    with clear color distinction between old and new datasets.

    Unlike combined_plot_4, the question columns of the original data are
    taken from its numeric columns rather than purely by position. Only the
    five continents in the fixed plotting order are drawn. Neither input
    frame is modified.

    Parameters:
    df_old: Original DataFrame whose leading numeric columns are the
        questionnaire answers and which has a 'continent' column, or None
        to skip it
    df_new: New DataFrame with columns persona, question, score, or None
        to skip it
    nationality_to_continent: Dictionary mapping nationalities to continents

    Returns:
    The matplotlib Figure containing the plot.
    """
    fig, ax = plt.subplots(figsize=(15, 8))
    max_questions = 0  # widest question axis among the datasets plotted

    # Text boxes naming the colour family of each dataset. x=1.02 is in figure
    # coordinates, i.e. just right of the canvas edge, so the boxes only show
    # when the figure is rendered with a tight bounding box.
    fig.text(1.02, 0.7, 'Original Dataset\n(Blues)',
             bbox=dict(facecolor='white', alpha=0.8, edgecolor='blue'),
             color='blue')
    fig.text(1.02, 0.6, 'New Dataset\n(Reds)',
             bbox=dict(facecolor='white', alpha=0.8, edgecolor='red'),
             color='red')

    # Fixed plotting order keeps legend order and colours stable
    continent_order = ['Asia', 'Europe', 'North America', 'South America', 'Oceania']

    old_colors = {
        'Asia': '#000080',           # navy blue
        'Europe': '#0000FF',         # blue
        'North America': '#4169E1',  # royal blue
        'South America': '#6495ED',  # cornflower blue
        'Oceania': '#87CEEB',        # sky blue
    }

    new_colors = {
        'Asia': '#8B0000',           # dark red
        'Europe': '#DC143C',         # crimson
        'North America': '#FF0000',  # red
        'South America': '#FA8072',  # salmon
        'Oceania': '#FFB6C1',        # light pink
    }

    # Process old format data
    if df_old is not None:
        # Translate nationality labels to continents but keep values that are
        # not keys of the mapping (e.g. already continent names). A local
        # Series is used so the caller's column is never overwritten.
        raw_continent = df_old['continent']
        continent_old = raw_continent.map(nationality_to_continent).fillna(raw_continent)

        # Question columns: the first 9 numeric columns, matching the 9 answer
        # columns every other plot in this file uses (taking 10 would pull in
        # whichever numeric column follows the answers).
        question_cols = df_old.select_dtypes(include='number').columns[:9]

        answers_old = df_old[question_cols]
        means_old = answers_old.groupby(continent_old).mean()
        errors_old = answers_old.groupby(continent_old).sem()

        x_old = range(len(question_cols))
        max_questions = max(max_questions, len(x_old))
        for continent in continent_order:
            if continent not in means_old.index:
                continue
            ax.errorbar(x_old,
                        means_old.loc[continent].values,
                        yerr=errors_old.loc[continent].values,
                        fmt='o-',  # solid line with circles
                        capsize=3,
                        label=f'{continent} (Original)',
                        markersize=6,
                        color=old_colors[continent],
                        alpha=0.8)

    # Process new format data
    if df_new is not None:
        # Personas missing from the mapping become NaN and are dropped by groupby
        continent_new = df_new['persona'].map(nationality_to_continent).rename('continent')

        scores = df_new.groupby([continent_new, 'question'])['score']
        means_new = scores.mean().unstack()
        errors_new = scores.sem().unstack()

        x_new = range(means_new.shape[1])
        max_questions = max(max_questions, len(x_new))
        for continent in continent_order:
            if continent not in means_new.index:
                continue
            ax.errorbar(x_new,
                        means_new.loc[continent],
                        yerr=errors_new.loc[continent],
                        fmt='s-',  # solid line with squares
                        capsize=3,
                        label=f'{continent} (New)',
                        markersize=6,
                        color=new_colors[continent],
                        alpha=0.8)

    ax.set_xlabel('Question Number')
    ax.set_ylabel('Score')
    ax.set_title('Scores by Continent for Both Datasets\nwith Standard Error Bars')

    # x-ticks follow the larger dataset; labels are 1-based
    ax.set_xticks(range(max_questions))
    ax.set_xticklabels([f'Q{i + 1}' for i in range(max_questions)])

    ax.grid(True, alpha=0.3)
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.set_ylim(0, 7)

    # Increase right margin to accommodate the legend and dataset labels
    fig.subplots_adjust(right=0.85)

    return fig

# Example usage:
# fig = combined_plot_5(df_old, df_new, nationality_to_continent)
# plt.show()
#
# To save:
# fig.savefig('continent_comparison.png', dpi=300, bbox_inches='tight')

In [None]:
# Per-continent comparison with question columns picked by dtype (default mapping).
fig = combined_plot_5(df, df_nat)
plt.show()

In [None]:
# Mean answers per continent with standard-error bars for the loaded data.
fig = plot_simple_error_graph(df)
plt.show()

In [None]:
def plot_error_analysis(df, error_type='ci', confidence=0.95):
    """
    Create error graph for questionnaire responses, one line per continent.

    Parameters:
    df: pandas DataFrame whose first 9 columns are the questionnaire
        responses and which has a 'continent' column
    error_type: str, 'ci' for a t-based confidence interval; any other value
        (conventionally 'se') selects the standard error
    confidence: confidence level of the interval when error_type is 'ci'
        (default 0.95)

    Returns:
    The matplotlib Figure containing the plot.
    """
    fig, ax = plt.subplots(figsize=(12, 6))

    # Per-continent statistics of each question
    grouped = df.iloc[:, :9].groupby(df['continent'])
    means = grouped.mean()
    sems = grouped.sem()

    if error_type == 'ci':
        # The two-sided t interval is symmetric around the mean, so a single
        # half-width per cell is enough: t_crit(n - 1) * standard error.
        # count() ignores missing answers, consistent with mean() and sem().
        dof = grouped.count().to_numpy() - 1
        t_crit = stats.t.ppf((1 + confidence) / 2, dof)
        errors = sems * t_crit
        error_label = f'{confidence:.0%} Confidence Intervals'
    else:  # Standard error
        errors = sems
        error_label = 'Standard Errors'

    # Create line plot with error bars
    x = np.arange(len(means.columns))

    for continent in means.index:
        ax.errorbar(x, means.loc[continent],
                    yerr=errors.loc[continent],
                    fmt='o-',
                    label=continent,
                    capsize=5,
                    markersize=8,
                    linewidth=2)

    ax.set_xlabel('Question Number')
    ax.set_ylabel('Average Response')
    ax.set_title(f'Questionnaire Responses by Continent\nwith {error_label}')
    ax.set_xticks(x)
    ax.set_xticklabels([f'Q{i+1}' for i in x])
    ax.grid(True, linestyle='--', alpha=0.7)
    ax.legend(title='Continent', bbox_to_anchor=(1.05, 1), loc='upper left')

    # Adjust layout to prevent label cutoff
    fig.tight_layout()

    return fig


In [None]:
# Confidence-interval variant (the function's default error_type), disabled:
# plot_error_analysis(df)

# Standard-error variant:
fig = plot_error_analysis(df, error_type='se')
plt.show()

In [None]:
def plot_with_error_checking(df, valid_range=(1, 5)):
    """
    Create error graph with data quality checks.

    Prints a short report (missing answers, out-of-range answers, sample size
    per continent) and then plots the mean answer per question for each
    continent with standard-error bars.

    Parameters:
    df: pandas DataFrame whose first 9 columns are the questionnaire
        responses and which has a 'continent' column
    valid_range: (lowest, highest) answer value considered valid; answers
        outside it are counted as invalid in the report. The default (1, 5)
        keeps the previous hard-coded range.
        NOTE(review): confirm it matches the response scale of the data.

    Returns:
    The matplotlib Figure containing the plot.
    """
    lowest, highest = valid_range
    answers = df.iloc[:, :9]

    # Data quality checks
    print("Data Quality Report:")
    print("-" * 50)

    # Check for missing values
    missing = answers.isnull().sum()
    if missing.any():
        print("\nMissing values per question:")
        print(missing[missing > 0])

    # Check for out-of-range values (vectorised count per question)
    invalid = ((answers < lowest) | (answers > highest)).sum()
    if invalid.any():
        print("\nInvalid values per question:")
        print(invalid[invalid > 0])

    # Check sample size per continent
    sample_sizes = df['continent'].value_counts()
    print("\nSample size per continent:")
    print(sample_sizes)

    # Create the visualization
    fig, ax = plt.subplots(figsize=(12, 6))

    # Calculate means and standard errors
    grouped = answers.groupby(df['continent'])
    means = grouped.mean()
    errors = grouped.sem()

    # One x position per plotted question column
    x = range(means.shape[1])
    for continent in means.index:
        ax.errorbar(x,
                    means.loc[continent],
                    yerr=errors.loc[continent],
                    fmt='o-',
                    capsize=3,
                    label=f"{continent} (n={sample_sizes[continent]})",
                    markersize=6)

    ax.set_xlabel('Question Number')
    ax.set_ylabel('Average Response')
    ax.set_title('Average Responses by Continent\nwith Standard Error Bars')
    ax.set_xticks(x)
    ax.set_xticklabels([f'Q{i+1}' for i in x])
    ax.grid(True, alpha=0.3)
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    fig.tight_layout()
    return fig

In [None]:
# Print the data quality report and show the per-continent error graph.
fig = plot_with_error_checking(df)
plt.show()