# Benin Solar Data - Exploratory Data Analysis (EDA)

This notebook covers the full EDA process for Benin's solar dataset as part of the region-ranking project.

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore

# Read the Benin measurements; the 'Timestamp' column is parsed as datetimes.
raw_csv_path = 'data/benin.csv'
df = pd.read_csv(raw_csv_path, parse_dates=['Timestamp'])

# Preview the first rows as the cell output.
df.head(5)


## Summary Statistics & Missing Value Report

In [None]:

# Summary statistics: the bare expression is the cell's displayed output,
# i.e. the table returned by describe() for the loaded frame.
df.describe()


In [None]:

# Count missing entries per column, then show only the columns that have any,
# largest count first.
missing = df.isnull().sum()
has_missing = missing.gt(0)
missing.loc[has_missing].sort_values(ascending=False)


In [None]:

# Share of missing entries per column, expressed as a percentage of all rows.
row_count = len(df)
missing_percent = df.isna().sum().div(row_count).mul(100)

# Keep only the columns where more than 5% of the rows are missing.
missing_percent.loc[missing_percent.gt(5)]


## Outlier Detection & Basic Cleaning

In [None]:

# Columns screened for outliers.
cols = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']

# scipy's zscore defaults to nan_policy='propagate': a single missing value
# makes the column mean/std NaN and therefore every score in that column NaN.
# With 'omit' the statistics are computed from the non-missing values and only
# the missing positions themselves come back as NaN.
z_scores = df[cols].apply(zscore, nan_policy='omit')

# Number of readings per column with |z| > 3 (NaN scores compare as False).
(z_scores.abs() > 3).sum()


In [None]:

# Impute missing values with each column's median.
# The filled block is assigned back to `df` instead of calling
# `df[col].fillna(..., inplace=True)`: that form mutates a temporary Series,
# which pandas 2.x warns about and which has no effect under copy-on-write.
df[cols] = df[cols].fillna(df[cols].median())

# Drop extreme outliers: keep the rows where no screened column has |z| > 3.
# Expressed as "not any > 3" rather than "all <= 3" so that a NaN z-score
# (a reading that was missing and has just been imputed) does not discard the
# row; this matches the rule used for the 'cleaned' flag further down.
is_outlier_row = (z_scores.abs() > 3).any(axis=1)

# .copy() makes df_clean an independent frame rather than a slice of df.
df_clean = df.loc[~is_outlier_row].copy()


## Time Series Analysis

In [None]:

# Create the axes up front and hand them to pandas. DataFrame.plot() opens its
# own figure when no `ax` is passed, which left the (15, 5) figure blank and
# drew the lines on a default-sized one.
fig, ax = plt.subplots(figsize=(15, 5))
df_clean.set_index('Timestamp')[['GHI', 'DNI', 'DHI', 'Tamb']].plot(ax=ax)
ax.set_title('Solar Metrics Over Time')
ax.set_ylabel('Irradiance (W/m²) / Temperature (°C)')
ax.grid()


## Cleaning Impact on ModA & ModB

In [None]:

# Flag each row: True when none of the screened columns has |z| > 3.
exceeds_threshold = z_scores.abs().gt(3)
df['cleaned'] = ~exceeds_threshold.any(axis=1)

# Mean ModA / ModB for the flagged-out rows (False) versus the kept rows (True).
mod_means = df.groupby('cleaned')[['ModA', 'ModB']].mean()
mod_ax = mod_means.plot.bar()
mod_ax.set_title("ModA & ModB Averages Before vs After Cleaning")


## Correlation Matrix

In [None]:

# Pairwise correlations between irradiance and the TModA/TModB columns,
# computed on the outlier-filtered frame.
corr_cols = ['GHI', 'DNI', 'DHI', 'TModA', 'TModB']
corr_matrix = df_clean[corr_cols].corr()

# Annotated heatmap of the correlation matrix.
heatmap_ax = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
heatmap_ax.set_title("Correlation Heatmap")


## Scatter Plots

In [None]:

# GHI against wind speed on the cleaned frame.
ws_ghi_ax = sns.scatterplot(x='WS', y='GHI', data=df_clean)
ws_ghi_ax.set_title('Wind Speed vs GHI')


In [None]:

# Ambient temperature against relative humidity on the cleaned frame.
rh_tamb_ax = sns.scatterplot(x='RH', y='Tamb', data=df_clean)
rh_tamb_ax.set_title('Humidity vs Ambient Temp')


## Wind Rose Plot

In [None]:

import numpy as np

# Wind speed is the radius and wind direction the angle of each point.
ws = df_clean['WS']
wd = df_clean['WD']
theta = np.radians(wd)  # polar axes expect the angle in radians
r = ws

plt.figure(figsize=(8, 8))
ax = plt.subplot(111, polar=True)

# Matplotlib polar axes default to 0° at east with angles increasing
# counter-clockwise. Re-orient them as a compass (0° at north, clockwise) so
# the directions read correctly.
# NOTE(review): assumes WD is a compass bearing in degrees clockwise from
# north — confirm against the dataset documentation.
ax.set_theta_zero_location('N')
ax.set_theta_direction(-1)

ax.scatter(theta, r, alpha=0.4)
ax.set_title('Wind Rose (WS vs WD)')


## Histograms

In [None]:

# 50-bin histogram of GHI on the cleaned frame.
ghi_hist_ax = df_clean['GHI'].hist(bins=50)
ghi_hist_ax.set_title('Distribution of GHI')


In [None]:

# 50-bin histogram of wind speed on the cleaned frame.
ws_hist_ax = df_clean['WS'].hist(bins=50)
ws_hist_ax.set_title('Distribution of Wind Speed')


## Bubble Plot: GHI vs Tamb (Size = RH)

In [None]:

# GHI against ambient temperature; RH drives both the marker size and colour.
fig, ax = plt.subplots(figsize=(10, 6))
bubbles = ax.scatter(df_clean['Tamb'], df_clean['GHI'],
                     s=df_clean['RH'], alpha=0.5, c=df_clean['RH'], cmap='viridis')
ax.set_xlabel('Temperature (Tamb)')
ax.set_ylabel('GHI')
ax.set_title('Bubble Plot: GHI vs Tamb (size = RH)')

# Colour encodes RH as well, so show its scale; without a colorbar the
# viridis shading cannot be read back to a value.
fig.colorbar(bubbles, ax=ax, label='RH')


## Export Cleaned Data

In [None]:

# Write the outlier-filtered frame to disk without the index column.
clean_csv_path = 'data/benin_clean.csv'
df_clean.to_csv(clean_csv_path, index=False)


## Key Insights & Conclusion
- Outliers were removed with a |Z-score| > 3 rule applied to GHI, DNI, DHI, ModA, ModB, WS and WSgust, notably in ModB and WSgust.
- Missing values in these core features were imputed with the column median.
- Relative humidity appears to be related to both ambient temperature and GHI (see the humidity scatter plot and the bubble plot).
- Wind direction mostly concentrated in 3 directions.
- The cleaned data is exported to `data/benin_clean.csv`, ready for modelling and region ranking.