In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy.stats import zscore, kruskal

# Notebook-wide display configuration
pd.options.display.max_columns = None  # no cap on how many DataFrame columns are shown
sns.set(style="whitegrid")  # seaborn "whitegrid" style for the plots below


# Comparing Solar Data Across Countries

This notebook loads solar measurement data for Benin, Togo, and Sierra Leone and compares the three countries' GHI, DNI, and DHI readings.

In [None]:
# Read each country's CSV and tag every row with the country it came from,
# so the rows stay identifiable once the frames are stacked together.
benin = pd.read_csv("../data/benin_clean.csv").assign(Country="Benin")
togo = pd.read_csv("../data/togo-dapaong_qc.csv").assign(Country="Togo")
sierra = pd.read_csv("../data/sierraleone-bumbuna.csv").assign(Country="Sierra Leone")

# Stack the three frames into one, with a fresh 0..n-1 index
df_all = pd.concat((benin, togo, sierra), ignore_index=True)


## Solar Radiation Comparison

Comparing the distributions of GHI (global horizontal irradiance), DNI (direct normal irradiance), and DHI (diffuse horizontal irradiance) across countries using boxplots.

In [None]:
# Draw one figure per solar radiation column, with one box per country
for col in ("GHI", "DNI", "DHI"):
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.boxplot(data=df_all, x="Country", y=col, ax=ax)
    ax.set_title(f"{col} Comparison Across Countries")
    plt.show()


## Statistical Summary

Calculate summary statistics (mean, median, standard deviation) for each solar radiation metric by country.

In [None]:
# Mean, median, and standard deviation of each radiation column, per country.
# The result has one row per country and a (column, statistic) column MultiIndex.
radiation_cols = ["GHI", "DNI", "DHI"]
stat_funcs = ["mean", "median", "std"]
summary = df_all.groupby("Country")[radiation_cols].agg(stat_funcs)
summary


## Statistical Tests

Perform a Kruskal-Wallis test to check whether there are statistically significant differences in solar radiation metrics between countries.

In [None]:
# Kruskal-Wallis H-test (non-parametric) for each solar radiation metric,
# comparing the three countries' samples against each other.
#
# Missing readings are dropped from every sample first: scipy's kruskal uses
# nan_policy="propagate" by default, so a single NaN in any input would make
# both the H-statistic and the p-value NaN.
country_frames = (benin, togo, sierra)
kruskal_results = {}
for metric in ["GHI", "DNI", "DHI"]:
    samples = [frame[metric].dropna() for frame in country_frames]
    kruskal_results[metric] = kruskal(*samples)
    print(
        f"{metric} Kruskal-Wallis H-statistic: {kruskal_results[metric].statistic}, "
        f"p-value: {kruskal_results[metric].pvalue}"
    )

# Keep the GHI result under its original name for any cell that reads it
ghi_test = kruskal_results["GHI"]


In [None]:
# Bar chart of the mean GHI per country, bars ordered from lowest to highest mean
ghi_by_country = df_all.groupby("Country")["GHI"]
avg_ghi = ghi_by_country.mean().sort_values()

ax = avg_ghi.plot.bar(title="Average GHI by Country", figsize=(8, 5))
ax.set_ylabel("GHI")
plt.show()
