# MLP Visualiser

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats

In [None]:
# Load the questionnaire dataset; every plotting function below receives this frame.
# The path is relative, so it resolves against the current working directory.
df = pd.read_csv('ous_data/ous_align2.csv')

In [None]:
# Quick sanity check: (number of rows, number of columns) of the loaded data.
df.shape

In [None]:
# List the column names; the plotting functions below select the question
# columns by position (the leading columns), so column order matters.
df.columns

In [None]:
# df1 = df[['IB3', 'IH1', 'IB2', 'IH2', 'IB1', 'IH3', 'IB4', 'IH4', 'IB5',
#         'native_language', 'Q_Lang',
#        'religion_1', 'religion_2', 'religion_3', 'religion_4', 'religion_5',
#        'religion_6', 'religion_7', 'religion_8', 'religion_9', 'religion_10',
#        'religion_11', 'religion_12', 'religion_13', 'religion_14',
#        'religion_15', 'religion_16', 'education_leve', 'sex', 'countr_origin_1', 'country3',
#        'Age']]

## Data processing

In [None]:
def analyze_questionnaire(df, *, n_questions=9, top_n_countries=9):
    """
    Draw a 2x2 overview of questionnaire answers split by demographic factors.

    Panels (left to right, top to bottom):
        1. Heatmap of mean answers per age group.
        2. Bar chart of mean answers per education level.
        3. Box plot of all answers for the most frequent countries.
        4. Line chart of mean answers per sex.

    Parameters:
    df: pandas DataFrame with columns:
        - First ``n_questions`` columns: questionnaire answers (numeric)
        - age: respondent's age
        - education_level: education level
        - country: country of residence
        - sex: respondent's sex/gender
    n_questions: number of leading columns that hold questionnaire answers
        (default 9).
    top_n_countries: how many of the most frequent countries to show in the
        country panel (default 9).

    Returns:
    The matplotlib Figure containing the four panels.

    Notes:
    - ``df`` is not modified; the age grouping is kept in a local Series.
    - ``plt.style.use('ggplot')`` is applied globally, so it also affects
      figures created after this call.
    """

    # Set up the plotting style (global matplotlib state).
    plt.style.use('ggplot')

    # All four panels must look at the same set of question columns.
    questions = df.iloc[:, :n_questions]
    question_names = list(questions.columns)

    # Explicit axes so that every plot lands in its own panel of this figure.
    fig, axes = plt.subplots(2, 2, figsize=(20, 15))
    ax_age, ax_edu, ax_country, ax_gender = axes.ravel()

    # 1. Average responses by age group.
    # Bins are right-inclusive: (0, 25], (25, 35], ... and the last bin is
    # open-ended so respondents older than 100 are not silently dropped.
    age_group = pd.cut(df['age'],
                       bins=[0, 25, 35, 45, 55, float('inf')],
                       labels=['18-25', '26-35', '36-45', '46-55', '56+'])
    # observed=False keeps empty age groups as (NaN) rows in the heatmap.
    age_means = questions.groupby(age_group, observed=False).mean()
    sns.heatmap(age_means, cmap='YlOrRd', annot=True, fmt='.2f', ax=ax_age)
    ax_age.set_title('Average Responses by Age Group')
    ax_age.set_xlabel('Question Number')
    ax_age.set_ylabel('Age Group')

    # 2. Response distribution by education level.
    # Transposed so questions run along the x-axis, one bar per level.
    edu_means = questions.groupby(df['education_level']).mean().T
    edu_means.plot(kind='bar', ax=ax_edu)
    ax_edu.set_title('Response Patterns by Education Level')
    ax_edu.set_xlabel('Question Number')
    ax_edu.set_ylabel('Average Response')
    ax_edu.legend(title='Education Level', bbox_to_anchor=(1.05, 1))
    ax_edu.tick_params(axis='x', labelrotation=45)

    # 3. Country-wise response distribution (most frequent countries only).
    top_countries = df['country'].value_counts().nlargest(top_n_countries).index
    country_long = df[df['country'].isin(top_countries)].melt(
        id_vars=['country'], value_vars=question_names)
    sns.boxplot(data=country_long, x='country', y='value', ax=ax_country)
    # Title reports the number of countries actually shown.
    ax_country.set_title(
        f'Response Distribution by Country (Top {len(top_countries)})')
    ax_country.tick_params(axis='x', labelrotation=45)
    ax_country.set_xlabel('Country')
    ax_country.set_ylabel('Response Value')

    # 4. Gender comparison (fourth panel; passing ax keeps pandas from
    # opening a separate figure).
    gender_means = questions.groupby(df['sex']).mean().T
    gender_means.plot(kind='line', marker='o', ax=ax_gender)
    ax_gender.set_title('Gender Comparison Across Questions')
    ax_gender.set_xlabel('Question Number')
    ax_gender.set_ylabel('Average Response')
    ax_gender.legend(title='Gender')

    fig.tight_layout()

    return fig

In [None]:
# Build the demographic overview figure for the loaded data and render it.
fig = analyze_questionnaire(df)
plt.show()

In [None]:
def plot_continent_analysis(df):
    """
    Draw an annotated heatmap of the mean answer to each question per continent.

    Parameters:
    df: pandas DataFrame whose first 9 columns are questionnaire responses
        and which has a 'continent' column used for grouping

    Returns:
    The current matplotlib Figure (the one created here).
    """
    plt.figure(figsize=(12, 8))

    # One row per continent, one column per question.
    responses = df.iloc[:, :9]
    per_continent = responses.groupby(df['continent']).mean()

    heatmap_style = {
        'cmap': 'viridis',
        'annot': True,
        'fmt': '.2f',
        'cbar_kws': {'label': 'Average Response'},
    }
    heat_ax = sns.heatmap(per_continent, **heatmap_style)

    heat_ax.set_title('Average Questionnaire Responses by Continent')
    heat_ax.set_xlabel('Question Number')
    heat_ax.set_ylabel('Continent')

    # Make room for tick labels and the colour bar.
    plt.tight_layout()

    return plt.gcf()


In [None]:
# Render the per-continent heatmap; requires a 'continent' column in df.
fig = plot_continent_analysis(df)
plt.show()

In [None]:
def plot_simple_error_graph(df):
    """
    Plot, for every continent, the mean answer to each of the first 9
    questions as a line with standard-error bars.

    Parameters:
    df: pandas DataFrame whose first 9 columns are questionnaire responses
        and which has a 'continent' column used for grouping

    Returns:
    The current matplotlib Figure (the one created here).
    """
    plt.figure(figsize=(12, 6))

    # Group once, then derive both statistics from the same grouping.
    by_continent = df.iloc[:, :9].groupby(df['continent'])
    averages = by_continent.mean()
    std_errors = by_continent.sem()

    # x positions 0..8, one per question.
    positions = range(9)

    # One line (with error bars) per continent.
    for name, row in averages.iterrows():
        plt.errorbar(
            positions,
            row,
            yerr=std_errors.loc[name],
            fmt='o-',      # circles joined by a line
            capsize=3,     # width of the error-bar caps
            label=name,
            markersize=6,
        )

    # Axis labelling and cosmetics.
    tick_labels = ['Q%d' % (pos + 1) for pos in positions]
    plt.xticks(positions, tick_labels)
    plt.title('Average Responses by Continent')
    plt.xlabel('Question Number')
    plt.ylabel('Average Response')
    plt.grid(True, alpha=0.3)
    plt.legend()

    plt.tight_layout()
    return plt.gcf()


In [None]:
# Render mean responses per continent with standard-error bars.
fig = plot_simple_error_graph(df)
plt.show()

In [None]:
def plot_error_analysis(df, error_type='ci'):
    """
    Plot mean questionnaire responses per continent with error bars.

    Parameters:
    df: pandas DataFrame whose first 9 columns are questionnaire responses
        and which has a 'continent' column used for grouping
    error_type: str, 'ci' for 95% confidence interval or 'se' for standard
        error. Any value other than 'ci' is treated as 'se'.

    Returns:
    The current matplotlib Figure (the one created here).

    Notes:
    The 95% confidence interval is the two-sided Student-t interval around
    the mean, i.e. ``mean +/- t(0.975, n - 1) * sem``. It is symmetric, so a
    single half-width per (continent, question) is enough. Missing answers
    are skipped consistently for the mean, the standard error and ``n``.
    Groups with fewer than two answers have an undefined (NaN) error bar.
    """

    plt.figure(figsize=(12, 6))

    # Mean and standard error of each question, per continent.
    grouped = df.iloc[:, :9].groupby(df['continent'])
    means = grouped.mean()
    errors = grouped.sem()

    if error_type == 'ci':
        # Scale the standard error by the t critical value for a two-sided
        # 95% interval; degrees of freedom are the non-missing count minus 1.
        dof = np.asarray(grouped.count() - 1, dtype=float)
        errors = errors * stats.t.ppf(0.975, dof)

    # Create line plot with error bars, one line per continent.
    x = np.arange(len(means.columns))

    for continent in means.index:
        plt.errorbar(x, means.loc[continent],
                     yerr=errors.loc[continent],
                     fmt='o-',
                     label=continent,
                     capsize=5,
                     markersize=8,
                     linewidth=2)

    plt.xlabel('Question Number')
    plt.ylabel('Average Response')
    plt.title(f'Questionnaire Responses by Continent\nwith {"95% Confidence Intervals" if error_type=="ci" else "Standard Errors"}')
    plt.xticks(x, [f'Q{i+1}' for i in x])
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend(title='Continent', bbox_to_anchor=(1.05, 1), loc='upper left')

    # Adjust layout to prevent label cutoff
    plt.tight_layout()

    return plt.gcf()


In [None]:
# For 95% confidence intervals, call with the default error_type instead:
# plot_error_analysis(df)

# Standard-error bars:
fig = plot_error_analysis(df, error_type='se')
plt.show()

In [None]:
def plot_with_error_checking(df):
    """
    Print a short data-quality report, then plot mean responses per continent
    with standard-error bars.

    The report covers the first 9 columns (the questionnaire answers):
    missing values per question, values outside the assumed 1-5 range per
    question, and the number of rows per continent.

    Parameters:
    df: pandas DataFrame whose first 9 columns are questionnaire responses
        and which has a 'continent' column used for grouping

    Returns:
    The current matplotlib Figure (the one created here).
    """
    answers = df.iloc[:, :9]

    # --- Data quality report -------------------------------------------
    print("Data Quality Report:")
    print("-" * 50)

    # Missing answers, reported only for questions that have any.
    n_missing = answers.isnull().sum()
    if n_missing.any():
        print("\nMissing values per question:")
        print(n_missing[n_missing > 0])

    # Answers outside the assumed valid 1-5 range.
    n_invalid = answers.apply(lambda col: ((col < 1) | (col > 5)).sum())
    if n_invalid.any():
        print("\nInvalid values per question:")
        print(n_invalid[n_invalid > 0])

    # Rows per continent; also reused in the legend labels below.
    sample_sizes = df['continent'].value_counts()
    print("\nSample size per continent:")
    print(sample_sizes)

    # --- Plot ------------------------------------------------------------
    plt.figure(figsize=(12, 6))

    by_continent = answers.groupby(df['continent'])
    averages = by_continent.mean()
    std_errors = by_continent.sem()

    positions = range(9)
    for name in averages.index:
        legend_text = f"{name} (n={sample_sizes[name]})"
        plt.errorbar(
            positions,
            averages.loc[name],
            yerr=std_errors.loc[name],
            fmt='o-',
            capsize=3,
            label=legend_text,
            markersize=6,
        )

    plt.xticks(positions, ['Q%d' % (pos + 1) for pos in positions])
    plt.title('Average Responses by Continent\nwith Standard Error Bars')
    plt.xlabel('Question Number')
    plt.ylabel('Average Response')
    plt.grid(True, alpha=0.3)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.tight_layout()
    return plt.gcf()

In [None]:
# Print the data-quality report and render the per-continent plot.
fig = plot_with_error_checking(df)
plt.show()