In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats


In [None]:
# plotting styles
sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (10,6)

In [None]:
# load datasets
paths = {
    "Benin": "../data/cleaned_data_benin.csv",
    "SierraLeone": "../data/cleaned_data_sierraleone.csv",
    "Togo": "../data/cleaned_data_togo.csv"
}

dfs = {}
for country, path in paths.items():
    df = pd.read_csv(path)
    # ensure Timestamp is datetime if present
    if "Timestamp" in df.columns:
        df["Timestamp"] = pd.to_datetime(df["Timestamp"], errors="coerce")
    df["country"] = country  # add column to identify country after concatenation
    dfs[country] = df

# quick sanity
for k,v in dfs.items():
    print(k, v.shape)


In [None]:
# Cell 3: concat for plotting & summary
df_all = pd.concat(list(dfs.values()), ignore_index=True)
df_all.sample(10)

In [None]:
# Cell 4: boxplots for GHI, DNI, DHI
metrics = ["GHI", "DNI", "DHI"]
for metric in metrics:
    if metric in df_all.columns:
        plt.figure(figsize=(8,5))
        sns.boxplot(x="country", y=metric, data=df_all, hue = "country",palette="coolwarm")
        plt.title(f"Distribution of {metric} by Country")
        plt.ylabel(metric)
        plt.xlabel("Country")
        plt.tight_layout()
        plt.show()
    else:
        print(f"{metric} not found in data columns.")


In [25]:
# Cell 5: summary table
metrics = ["GHI", "DNI", "DHI"]
summary = df_all.groupby("country")[metrics].agg(['mean','median','std']).round(3)
# flatten column multiindex
summary.columns = ["_".join(col).strip() for col in summary.columns.values]
summary = summary.reset_index()
summary


Unnamed: 0,country,GHI_mean,GHI_median,GHI_std,DNI_mean,DNI_median,DNI_std,DHI_mean,DHI_median,DHI_std
0,Benin,229.305,-0.4,324.307,162.115,-0.2,259.359,108.953,-0.5,151.921
1,SierraLeone,176.929,-0.5,272.842,97.928,-0.1,195.284,105.307,-0.7,153.04
2,Togo,218.728,-0.2,313.973,144.591,0.0,245.334,110.779,0.8,150.658


In [26]:
# Cell 6: tests on GHI
metric = "GHI"
available_countries = [c for c in paths.keys() if metric in dfs[c].columns]

# prepare samples
samples = [dfs[c][metric].dropna().values for c in available_countries]

# 1) normality test (Shapiro) - note: Shapiro may be sensitive for large n; interpret carefully
norm_results = {}
for c in available_countries:
    vals = dfs[c][metric].dropna().sample(n=min(5000, dfs[c][metric].dropna().shape[0]), random_state=0)  # sample to reasonable size
    stat, p = stats.shapiro(vals) if len(vals) <= 5000 else (np.nan, np.nan)  # Shapiro requires n <= 5000
    norm_results[c] = {"W": stat, "pvalue": p}
pd.DataFrame(norm_results).T

# 2) homogeneity of variances (Levene)
levene_stat, levene_p = stats.levene(*samples, center='median')
print(f"Levene test: stat={levene_stat:.4f}, p={levene_p:.4g}")

# 3) choose test
if (all((v["pvalue"] is not np.nan and v["pvalue"] > 0.05) for v in norm_results.values()) 
    and levene_p > 0.05):
    print("Data approximately normal and variances equal → run one-way ANOVA")
    f_stat, p_anova = stats.f_oneway(*samples)
    print(f"ANOVA result: F={f_stat:.4f}, p={p_anova:.4g}")
else:
    print("Normality/variance assumptions not met → run Kruskal-Wallis")
    h_stat, p_kw = stats.kruskal(*samples)
    print(f"Kruskal-Wallis result: H={h_stat:.4f}, p={p_kw:.4g}")


Levene test: stat=3777.8335, p=0
Normality/variance assumptions not met → run Kruskal-Wallis
Kruskal-Wallis result: H=6925.2798, p=0


In [27]:
# Cell 7: post-hoc (Tukey) if ANOVA used
try:
    from statsmodels.stats.multicomp import pairwise_tukeyhsd
    if metric in df_all.columns:
        tukey = pairwise_tukeyhsd(endog=df_all[metric].dropna(), groups=df_all.loc[df_all[metric].notna(), "country"], alpha=0.05)
        print(tukey.summary())
except Exception as e:
    print("Tukey test unavailable (statsmodels not installed) or error:", e)


      Multiple Comparison of Means - Tukey HSD, FWER=0.05      
   group1      group2   meandiff p-adj  lower    upper   reject
---------------------------------------------------------------
      Benin SierraLeone -52.3755   0.0 -53.8025 -50.9486   True
      Benin        Togo -10.5763   0.0 -11.9956  -9.1571   True
SierraLeone        Togo  41.7992   0.0  40.3706  43.2278   True
---------------------------------------------------------------


In [None]:
# Cell 8: bar chart of mean GHI
if "GHI" in df_all.columns:
    mean_ghi = df_all.groupby("country")["GHI"].mean().sort_values(ascending=False)
    ax = mean_ghi.plot(kind="bar", color=["#4c72b0","#55a868","#c44e52"])
    plt.ylabel("Average GHI")
    plt.title("Average GHI by Country (ranked)")
    plt.xticks(rotation=0)
    plt.tight_layout()
    plt.show()
