In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Global plot appearance for the figures drawn in the cells below:
# matplotlib's "seaborn-v0_8" style sheet plus seaborn's "viridis" palette.
plt.style.use('seaborn-v0_8')
sns.set_palette("viridis")


In [None]:

# Load the raw CSV (path is relative to the notebook's working directory) and
# have pandas parse the "Timestamp" column as datetimes while reading.
df = pd.read_csv("../data/ethiopia.csv", parse_dates=["Timestamp"])
print("Shape:", df.shape)
# info() prints its report itself; head() is the cell's last expression, so
# the notebook renders the first rows as the cell output.
df.info()
df.head()


In [None]:

# Descriptive statistics, transposed so each described column becomes a row.
print("Summary Statistics:")
display(df.describe().transpose())

# Share of missing entries per column as a percentage, largest first.
# Only columns that actually contain missing values are shown.
print("\nMissing Values (%):")
missing = df.isna().mean().sort_values(ascending=False) * 100
display(missing.loc[missing.gt(0)])


In [None]:

# Cleaning pass. Each step rebinds `df` to a new frame rather than mutating in
# place, so later assignments never target a slice of another frame.

# 1. Drop exact duplicate rows.
df = df.drop_duplicates()

# 2. Remove negative GHI readings. Rows whose GHI is missing are kept on
#    purpose: `NaN >= 0` evaluates to False, so a bare `df["GHI"] >= 0` filter
#    would silently discard them before the median imputation in step 4.
#    The explicit .copy() makes the result an independent frame.
df = df.loc[df["GHI"].isna() | df["GHI"].ge(0)].copy()

# 3. Make sure Timestamp is datetime; unparseable values become NaT.
df["Timestamp"] = pd.to_datetime(df["Timestamp"], errors="coerce")

# 4. Impute remaining gaps in numeric columns with that column's median.
#    Non-numeric columns (and NaT timestamps) are left untouched.
df = df.fillna(df.median(numeric_only=True))

print("Cleaned data shape:", df.shape)


In [None]:
# Columns screened for outliers; any that are absent from df are skipped.
cols = ["GHI","DNI","DHI","ModA","ModB","WS","WSgust"]
present = [c for c in cols if c in df.columns]

# Absolute z-score per column (NaNs ignored when computing mean/std).
# A row is flagged when ANY screened column lies more than 3 standard
# deviations from that column's mean.
z = np.abs(stats.zscore(df[present], nan_policy='omit'))
outliers = (z > 3).any(axis=1)
print("Outliers detected:", outliers.sum())

# Keep the non-flagged rows. The explicit .copy() makes df an independent
# frame, so the column assignment in the plotting cell further down does not
# operate on a filtered slice (SettingWithCopyWarning).
df = df[~outliers].copy()


In [None]:

# GHI across the whole record.
plt.figure(figsize=(10,4))
sns.lineplot(data=df, x="Timestamp", y="GHI")
plt.title("Global Horizontal Irradiance (GHI) Over Time")
plt.xlabel("Date")
plt.ylabel("GHI (W/m²)")
plt.show()

# Calendar month of each reading, used as the grouping key for the boxplot.
# assign() rebinds df to a new frame carrying the extra "month" column instead
# of setting a column on a frame that came from boolean filtering, which
# avoids SettingWithCopyWarning. The column stays on df for later cells.
df = df.assign(month=df["Timestamp"].dt.month)
sns.boxplot(data=df, x="month", y="GHI")
plt.title("GHI Distribution by Month")
plt.show()


In [None]:

# Effect of the "Cleaning" flag on the module readings. Skipped entirely when
# the dataset has no such column.
# NOTE(review): presumably "Cleaning" marks readings taken after a panel
# cleaning event — confirm against the data dictionary.
# (The original cell ran this block twice verbatim, printing the table and
# drawing the bar plot two times; it now runs once.)
if "Cleaning" in df.columns:
    # Mean ModA / ModB per cleaning state.
    avg_mods = df.groupby("Cleaning")[["ModA","ModB"]].mean()
    print(avg_mods)
    sns.barplot(x="Cleaning", y="ModA", data=df)
    plt.title("Module A Average Output: Pre vs Post Cleaning")
    plt.show()


In [None]:

# Candidate columns for the correlation matrix; only those present are used.
corr_cols = ["GHI","DNI","DHI","TModA","TModB","Tamb","RH"]
present_corr = [name for name in corr_cols if name in df.columns]

# Pairwise correlation of the available columns, drawn as an annotated heatmap.
fig, ax = plt.subplots(figsize=(8,6))
corr_matrix = df[present_corr].corr()
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

# Wind speed against GHI; low alpha keeps dense regions readable.
fig, ax = plt.subplots()
sns.scatterplot(data=df, x="WS", y="GHI", alpha=0.3, ax=ax)
ax.set_title("Wind Speed vs GHI")
plt.show()


In [None]:

# Side-by-side histograms (40 bins, KDE overlay): one panel per column.
fig, axes = plt.subplots(1, 2, figsize=(10,4))
panels = [("GHI", "GHI Distribution"), ("WS", "Wind Speed Distribution")]
for ax, (column, heading) in zip(axes, panels):
    sns.histplot(df[column], bins=40, kde=True, ax=ax)
    ax.set_title(heading)
plt.tight_layout()
plt.show()


In [None]:

# Bubble chart of GHI against ambient temperature, marker area scaled by RH/2.
# Drawn only when all three columns exist in df.
bubble_cols = {"Tamb", "GHI", "RH"}
if bubble_cols <= set(df.columns):
    fig, ax = plt.subplots(figsize=(6,5))
    marker_sizes = df["RH"] / 2
    ax.scatter(df["Tamb"], df["GHI"], s=marker_sizes, alpha=0.5)
    ax.set_title("GHI vs Tamb (bubble = RH)")
    ax.set_xlabel("Ambient Temp (°C)")
    ax.set_ylabel("GHI (W/m²)")
    plt.show()


In [None]:

# Persist the current state of df next to the raw file, without the index.
# Note: df is written as-is, so it includes the "month" column that the
# plotting cell added earlier in the notebook.
out_path = "../data/ethiopia_clean.csv"
df.to_csv(out_path, index=False)
print("✅ Cleaned data saved to:", out_path)
