Imports

In [1]:
# Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import f_oneway, kruskal
import os

Environment Configuration

In [2]:
CONFIG = {
    'countries': ['benin', 'sierra_leone', 'togo'],
    'data_files': {
        'benin': './data/benin_clean.csv',
        'sierra_leone': './data/sierraleone_clean.csv',
        'togo': './data/togo_clean.csv'
    },
    'metrics': ['GHI', 'DNI', 'DHI'],
    'output_dir': './data/compare_countries'
}

Data Loading

In [4]:
def load_cleaned_data() -> dict:
    """Load cleaned datasets for each country."""
    data = {}
    for country, file_path in CONFIG['data_files'].items():
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Cleaned data not found at {file_path}")
        data[country] = pd.read_csv(file_path, parse_dates=['Timestamp'])
    return data

Summary table

In [5]:
def compute_summary_table(data: dict) -> pd.DataFrame:
    """Compute summary table with mean, median, and SD for GHI, DNI, DHI across countries.

    Args:
        data (dict): Dictionary of DataFrames for each country.

    Returns:
        pd.DataFrame: Summary table with statistics.
    """
    summary = []
    for country, df in data.items():
        for metric in CONFIG['metrics']:
            if metric in df.columns:
                summary.append({
                    'Country': country,
                    'Metric': metric,
                    'Mean': df[metric].mean().round(2),
                    'Median': df[metric].median().round(2),
                    'Std': df[metric].std().round(2)
                })
    return pd.DataFrame(summary)

Statistical test

In [6]:
def perform_statistical_test(data: dict, metric: str) -> tuple:
    """Perform statistical test (ANOVA or Kruskal-Wallis) on a metric across countries.

    Args:
        data (dict): Dictionary of DataFrames for each country.
        metric (str): Metric to test (e.g., 'GHI').

    Returns:
        tuple: Test name and p-value.
    """
    metric_data = [df[metric].dropna() for df in data.values() if metric in df.columns]
    if len(metric_data) != len(data):
        return None, None
    
    # Check for normality (optional, for choosing test)
    normality = all(stats.shapiro(df)[1] > 0.05 for df in metric_data)
    
    if normality:
        # Use ANOVA if data is approximately normal
        stat, p_value = f_oneway(*metric_data)
        test_name = 'ANOVA'
    else:
        # Use Kruskal-Wallis for non-normal data
        stat, p_value = kruskal(*metric_data)
        test_name = 'Kruskal-Wallis'
    
    return test_name, p_value

Box plots

In [8]:
def plot_boxplots(data: dict):
    """Create side-by-side boxplots for GHI, DNI, DHI across countries.

    Args:
        data (dict): Dictionary of DataFrames for each country.
    """
    os.makedirs(CONFIG['output_dir'], exist_ok=True)
    
    for metric in CONFIG['metrics']:
        plt.figure(figsize=(10, 6))
        plot_data = []
        for country, df in data.items():
            if metric in df.columns:
                temp = df[[metric]].copy()
                temp['Country'] = country
                plot_data.append(temp)
        if plot_data:
            plot_df = pd.concat(plot_data)
            sns.boxplot(x='Country', y=metric, data=plot_df, palette='Set2')
            plt.title(f'{metric} Distribution Across Countries')
            plt.ylabel(f'{metric} (W/m²)')
            plt.savefig(f"{CONFIG['output_dir']}/{metric}_boxplot.png")
            plt.close()

GHI ranking

In [9]:
def plot_ghi_ranking(data: dict):
    """Create a bar chart ranking countries by average GHI.

    Args:
        data (dict): Dictionary of DataFrames for each country.
    """
    means = []
    for country, df in data.items():
        if 'GHI' in df.columns:
            means.append({'Country': country, 'Mean GHI': df['GHI'].mean()})
    df_means = pd.DataFrame(means).sort_values('Mean GHI', ascending=False)
    
    plt.figure(figsize=(8, 5))
    sns.barplot(x='Mean GHI', y='Country', data=df_means, palette='Blues_d')
    plt.title('Ranking of Countries by Average GHI')
    plt.xlabel('Mean GHI (W/m²)')
    plt.savefig(f"{CONFIG['output_dir']}/ghi_ranking.png")
    plt.close()

Execution

In [10]:
def main():
    """Execute the cross-country comparison pipeline."""
    try:
        # Load cleaned data
        data = load_cleaned_data()
        
        # Compute and display summary table
        summary_table = compute_summary_table(data)
        print("Summary Table:")
        print(summary_table)
        summary_table.to_csv(f"{CONFIG['output_dir']}/summary_table.csv", index=False)
        
        # Perform statistical tests
        for metric in CONFIG['metrics']:
            test_name, p_value = perform_statistical_test(data, metric)
            if test_name:
                print(f"{metric} {test_name} p-value: {p_value:.4f}")
        
        # Generate boxplots
        plot_boxplots(data)
        
        # Generate GHI ranking bar chart (bonus)
        plot_ghi_ranking(data)
        
        # Key observations (markdown cell equivalent)
        observations = [
            f"{summary_table.loc[summary_table['Metric'] == 'GHI', 'Mean'].idxmax() + 1}: "
            f"{summary_table.loc[summary_table['Metric'] == 'GHI', 'Country'].iloc[0]} has the highest mean GHI "
            f"({summary_table.loc[summary_table['Metric'] == 'GHI', 'Mean'].max():.2f} W/m²), indicating strong solar potential.",
            f"Variability: {summary_table.loc[summary_table['Metric'] == 'GHI', 'Std'].idxmax() + 1}: "
            f"{summary_table.loc[summary_table['Metric'] == 'GHI', 'Country'].iloc[0]} shows the highest GHI variability "
            f"(SD = {summary_table.loc[summary_table['Metric'] == 'GHI', 'Std'].max():.2f}), suggesting less predictable solar output.",
            f"Statistical significance: GHI differences across countries have a p-value of "
            f"{perform_statistical_test(data, 'GHI')[1]:.4f}, indicating "
            f"{'significant' if perform_statistical_test(data, 'GHI')[1] < 0.05 else 'no significant'} differences."
        ]
        print("\nKey Observations:")
        for obs in observations:
            print(f"- {obs}")
        
        print(f"Outputs saved in {CONFIG['output_dir']}")
    
    except Exception as e:
        print(f"Error: {str(e)}")

# Run the pipeline
main()

Summary Table:
        Country Metric    Mean  Median     Std
0         benin    GHI  241.96     1.8  330.10
1         benin    DNI  167.44     0.0  261.55
2         benin    DHI  116.99     1.6  157.46
3  sierra_leone    GHI  204.41     0.3  296.79
4  sierra_leone    DNI  116.52     0.0  218.57
5  sierra_leone    DHI  116.27     0.0  157.02
6          togo    GHI  231.72     2.1  321.69
7          togo    DNI  151.26     0.0  250.96
8          togo    DHI  116.44     2.5  156.52
GHI Kruskal-Wallis p-value: 0.0000


  res = hypotest_fun_out(*samples, **kwds)


DNI Kruskal-Wallis p-value: 0.0000
DHI Kruskal-Wallis p-value: 0.0000



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Country', y=metric, data=plot_df, palette='Set2')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Country', y=metric, data=plot_df, palette='Set2')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Country', y=metric, data=plot_df, palette='Set2')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Mean GHI', y='Country', data=df_means, palette='Blues_d')
  res = hypotest_fun_out(*samples, **kwds)



Key Observations:
- 1: benin has the highest mean GHI (241.96 W/m²), indicating strong solar potential.
- Variability: 1: benin shows the highest GHI variability (SD = 330.10), suggesting less predictable solar output.
- Statistical significance: GHI differences across countries have a p-value of 0.0000, indicating significant differences.
Outputs saved in ./data/compare_countries
