### Installing Important Libraries for EDA

In [None]:
# Optional environment setup. Every install command below is commented out, so
# running this cell does nothing; uncomment a line only when that package is
# missing from the kernel's environment.
#!pip install pandas
#!pip install numpy
#!pip install scipy
#!pip install seaborn
#!pip install plotly
#!pip install --upgrade nbformat

### Importing Fundamental Libraries

In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import zscore
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
import os

In [4]:
# Show what sits in the current working directory (os.listdir with no argument).
cwd_entries = os.listdir()
print(cwd_entries)

['Benin_eda.ipynb', 'Sierraleone_eda.ipynb', 'Togo_eda.ipynb']


### Dataset Loading for Togo

In [9]:
# Load the Togo dataset. Every later cell reads `df`, so this cell has to
# execute for the notebook to run top to bottom on a fresh kernel.
# "Timestamp" is parsed to datetime at load time; the time-series section uses
# it as the plot index.
# NOTE(review): the path is relative to the working directory, and the
# directory listing printed above shows no `data/` folder -- confirm the
# notebook is launched from a location where this path resolves.
df = pd.read_csv('data/togo-dapaong_qc.csv', parse_dates=["Timestamp"])
df.info()

### Summary Statistics & Missing-Value Report

In [None]:
# Numeric columns only, transposed so that each column becomes one row.
print("The Description of the numeric columns:")
numeric_summary = df.describe().transpose()
print(numeric_summary)

# Every column, whatever its dtype.
print("---------------------------------")
print("The Description of all columns:")
full_summary = df.describe(include='all')
print(full_summary)

# Share of missing entries per column, expressed as a percentage of all rows.
print("----------------------------------")
print("Check for missing values:")
missing_pct = df.isna().sum() / len(df) * 100
print(missing_pct)

In [None]:
# Number of missing entries in each column.
null_report = df.isna().sum()

# Only the last bare expression of a cell is rendered, so a report placed
# mid-cell is silently discarded; both reports are therefore printed explicitly.
print("Columns with at least one missing value:")
print(null_report[null_report > 0])

# Flag columns in which more than 5% of the rows are missing.
threshold = 0.05 * len(df)
print("Columns with more than 5% missing values:")
print(null_report[null_report > threshold])

### Outlier Detection & Basic Cleaning

In [None]:
# Columns screened for outliers.
cols = ["GHI", "DNI", "DHI", "ModA", "ModB", "WS", "WSgust"]

# Per-column Z-scores. scipy's zscore defaults to nan_policy="propagate", under
# which a single missing reading turns the whole column's scores into NaN; every
# `> 3` comparison is then False and that column can never flag an outlier.
# "omit" leaves NaNs out of the mean/std so the remaining rows are still scored.
z_scores = df[cols].apply(lambda column: zscore(column, nan_policy="omit"))

# A row is an outlier when any screened column is more than 3 standard
# deviations from that column's mean. Missing values score NaN, which never
# exceeds the cutoff, so they are not flagged here.
outliers = (np.abs(z_scores) > 3).any(axis=1)

# Store the flag on the raw frame; the cleaning cells read it from there.
df["outlier_flag"] = outliers
print(f"Number of outlier samples flagged: {outliers.sum()}")

In [None]:
# Clean: drop the flagged rows, then impute what is still missing.
# The filtered frame is copied explicitly so that columns added to df_clean in
# later cells are written to an independent frame rather than to a boolean
# slice of another frame (the pattern behind pandas' SettingWithCopyWarning).
df_clean = df.loc[~df["outlier_flag"]].copy()

# Fill remaining gaps in the screened columns with each column's median. The
# medians are taken over the full dataset (flagged rows included).
df_clean[cols] = df_clean[cols].fillna(df[cols].median())

### Export Cleaned Data

In [None]:
# Write the cleaned frame to disk without the index column, then summarise
# what was written (dtypes and non-null counts).
clean_csv_path = "data/Togo_clean.csv"
df_clean.to_csv(clean_csv_path, index=False)
df_clean.info()

### Time Series Analysis

In [None]:
# Line plots of GHI, DNI, DHI and Tamb against Timestamp, one subplot each.
# DataFrame.plot opens its own figure when no `ax` is given, so a preceding
# plt.figure(figsize=...) only leaves an empty extra figure and its size never
# reaches the subplots. The size is passed to plot() directly instead.
ts_cols = ["GHI", "DNI", "DHI", "Tamb"]
df_clean.set_index("Timestamp")[ts_cols].plot(subplots=True, figsize=(12, 5))
plt.suptitle("Time Series of Solar and Temperature Metrics")
plt.show()

### Cleaning Impact

Group by the cleaning flag and plot the average ModA and ModB before and after cleaning (flagged outliers vs. clean rows).

In [None]:
# Label every row of the raw frame by its outlier status.
is_flagged = df["outlier_flag"]
df["cleaning_flag"] = np.where(is_flagged, "Outlier", "Clean")

# Mean ModA and ModB per label, drawn as a bar chart.
mod_means = df.groupby("cleaning_flag")[["ModA", "ModB"]].mean()
mod_means.plot(kind="bar")
plt.title("ModA & ModB Before/After Cleaning")
plt.show()

### Correlation & Relationship Analysis


#### Correlation Heatmap

In [None]:
# Columns included in the correlation matrix.
cor_cols = ["GHI", "DNI", "DHI", "TModA", "TModB"]

# Pairwise correlations (DataFrame.corr with its default method).
# NOTE(review): this reads the raw `df`, while the other analysis cells plot
# `df_clean` -- confirm whether flagged rows are meant to be included here.
corr = df[cor_cols].corr()

# Annotated heatmap. The title previously named Benin, but this notebook loads
# and exports the Togo dataset.
sns.heatmap(corr, annot=True, cmap="coolwarm", square=True)
plt.title("Correlation Matrix of Togo Solar Energy")
plt.show()

In [None]:
# Each entry is (x column, y column, figure title); one scatter plot is drawn
# and shown per entry, in the listed order.
scatter_specs = [
    ("WS", "GHI", "Wind Speed vs. GHI"),
    ("RH", "Tamb", "Relative Humidity vs. Ambient Temperature"),
]
for x_col, y_col, plot_title in scatter_specs:
    sns.scatterplot(x=x_col, y=y_col, data=df_clean)
    plt.title(plot_title)
    plt.show()

### Wind & Distribution Analysis

In [None]:
# Split the range of WD values into 12 equal-width bins (pd.cut, integer bins).
df_clean["WD_bin"] = pd.cut(df_clean["WD"], bins=12)

# Mean WS per WD bin. Despite its name, `wind_counts` holds averages, not
# counts. Grouping on a categorical key without `observed` relies on a
# deprecated implicit default; observed=False pins the current behaviour of
# keeping every bin on the axis, including bins that contain no rows.
wind_counts = df_clean.groupby("WD_bin", observed=False)["WS"].mean()

wind_counts.plot(kind="bar", title="Average Wind Speed per Direction Bin")
plt.ylabel("Average Wind Speed")
plt.show()

# Histogram of GHI in the cleaned data, 30 bins.
df_clean["GHI"].plot(kind="hist", bins=30, alpha=0.7)
plt.title("Distribution of GHI")
plt.xlabel("GHI")
plt.show()

### Temperature Analysis

In [None]:
# RH against Tamb on the cleaned data; the title is set on the Axes that
# seaborn returns.
rh_tamb_ax = sns.scatterplot(data=df_clean, x="RH", y="Tamb")
rh_tamb_ax.set_title("RH vs. Temperature")
plt.show()

# RH against GHI, same approach.
rh_ghi_ax = sns.scatterplot(data=df_clean, x="RH", y="GHI")
rh_ghi_ax.set_title("RH vs. GHI")
plt.show()

### Bubble Chart

GHI vs. Tamb, with bubble size proportional to RH (or BP)

In [None]:
# Bubble chart: GHI on x, Tamb on y, with RH driving both marker size and colour.
fig = px.scatter(df_clean, x="GHI", y="Tamb", size="RH", color="RH",
                 title="GHI vs. Tamb (Bubble Size = RH)", size_max=30)
# The figure was built but never rendered because the show() call was commented
# out; it is restored so the cell produces its chart.
# NOTE(review): if rendering fails, check that nbformat is installed (the setup
# cell lists it). Also confirm the row count is small enough for an interactive
# plot to stay responsive.
fig.show()