In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import zscore
%matplotlib inline  # Ensures plots display in the notebook

In [None]:
# Read the raw Benin measurements; the later cells all start from this frame.
raw_csv_path = 'data/benin.csv'
df = pd.read_csv(raw_csv_path)

In [None]:
# Descriptive statistics as produced by DataFrame.describe()
summary_stats = df.describe()
print(summary_stats)

# Number of null entries in each column
missing_values = df.isna().sum()
print("\nMissing Values:\n", missing_values)

# Null share per column in percent, keeping only the columns above the 5% mark
missing_percentage = missing_values.div(len(df)).mul(100)
high_missing = missing_percentage.loc[missing_percentage.gt(5)]
print("\nColumns with >5% missing values:\n", high_missing)

In [None]:
# Columns screened for outliers
key_columns = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']

# Column-wise Z-scores; nan_policy='omit' keeps NaNs out of each column's statistics
z_scores = df[key_columns].apply(lambda column: zscore(column, nan_policy='omit'))

# A row is flagged when any screened column has |Z| above 3
outliers = z_scores.abs().gt(3).any(axis=1)
print(f"Number of outlier rows (|Z|>3): {outliers.sum()}")
print("\nOutlier rows:\n", df.loc[outliers, key_columns])

In [None]:
# Fill the gaps in every screened column with that column's own median
for col, col_median in df[key_columns].median().items():
    df[col] = df[col].fillna(col_median)

# Drop the rows flagged by the Z-score screen
# (optional; keep them if the outliers are physically plausible)
df_clean = df.loc[~outliers].copy()

# Report what is still missing after imputation and filtering
print("\nMissing values after cleaning:\n", df_clean.isna().sum())

In [None]:
# Persist the cleaned frame; index=False keeps the row index out of the file.
clean_csv_path = 'data/benin_clean.csv'
df_clean.to_csv(clean_csv_path, index=False)

In [None]:
# Convert Timestamp to datetime so the time series gets a real time axis
df_clean['Timestamp'] = pd.to_datetime(df_clean['Timestamp'])

# savefig does not create missing folders, so make sure the output folder
# exists before the first figure is written.
Path('plots').mkdir(parents=True, exist_ok=True)

# Plot GHI, DNI, DHI on one shared axis
plt.figure(figsize=(12, 6))
for irradiance_col in ('GHI', 'DNI', 'DHI'):
    plt.plot(df_clean['Timestamp'], df_clean[irradiance_col], label=irradiance_col, alpha=0.7)
plt.xlabel('Timestamp')
plt.ylabel('Irradiance (W/m²)')
plt.title('Solar Irradiance Over Time (Benin)')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('plots/benin_irradiance_time_series.png')
plt.show()

# Ambient temperature (Tamb) over time
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df_clean['Timestamp'], df_clean['Tamb'], label='Ambient Temperature', color='orange')
ax.set_xlabel('Timestamp')
ax.set_ylabel('Temperature (°C)')
ax.set_title('Ambient Temperature Over Time (Benin)')
ax.legend()
plt.xticks(rotation=45)
fig.tight_layout()
fig.savefig('plots/benin_temp_time_series.png')
plt.show()

In [None]:
# Derive the calendar month of each reading, then average GHI within each month
df_clean['Month'] = df_clean['Timestamp'].dt.month
monthly_ghi = df_clean.groupby('Month')['GHI'].mean()

# One bar per month
fig, ax = plt.subplots(figsize=(8, 5))
monthly_ghi.plot.bar(ax=ax, color='skyblue')
ax.set_xlabel('Month')
ax.set_ylabel('Average GHI (W/m²)')
ax.set_title('Average GHI by Month (Benin)')
fig.tight_layout()
fig.savefig('plots/benin_monthly_ghi.png')
plt.show()

In [None]:
# Mean ModA / ModB reading for each value of the Cleaning flag
module_columns = ['ModA', 'ModB']
cleaning_impact = df_clean.groupby('Cleaning')[module_columns].mean()
print("Average ModA and ModB by Cleaning Status:\n", cleaning_impact)

# Grouped bar chart; DataFrame.plot opens its own figure and returns the axes
ax = cleaning_impact.plot.bar(figsize=(8, 5), color=['blue', 'green'])
ax.set_xlabel('Cleaning (0 = No, 1 = Yes)')
ax.set_ylabel('Average Sensor Reading (W/m²)')
ax.set_title('Impact of Cleaning on ModA and ModB (Benin)')
plt.tight_layout()
plt.savefig('plots/benin_cleaning_impact.png')
plt.show()

In [None]:
# Pairwise correlations between the irradiance columns and TModA / TModB
corr_columns = ['GHI', 'DNI', 'DHI', 'TModA', 'TModB']
corr_matrix = df_clean.loc[:, corr_columns].corr()

# Annotated heatmap with the colour scale pinned to the full [-1, 1] range
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation Heatmap (Benin)')
fig.tight_layout()
fig.savefig('plots/benin_correlation_heatmap.png')
plt.show()

In [None]:
# Each entry describes one scatter plot:
# (x column, y column, marker colour, x label, y label, title, output file)
scatter_specs = [
    # WS vs GHI
    ('WS', 'GHI', 'purple',
     'Wind Speed (m/s)', 'GHI (W/m²)',
     'Wind Speed vs. GHI (Benin)', 'plots/benin_ws_ghi_scatter.png'),
    # RH vs Tamb
    ('RH', 'Tamb', 'teal',
     'Relative Humidity (%)', 'Ambient Temperature (°C)',
     'RH vs. Tamb (Benin)', 'plots/benin_rh_tamb_scatter.png'),
]

# Draw, save and show the figures one after another, in the order listed above
for x_col, y_col, marker_color, x_label, y_label, plot_title, out_file in scatter_specs:
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.scatter(df_clean[x_col], df_clean[y_col], alpha=0.5, color=marker_color)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.set_title(plot_title)
    fig.tight_layout()
    fig.savefig(out_file)
    plt.show()

In [None]:
from windrose import WindroseAxes

# Wind rose plot built from the WD and WS columns
wind_direction = df_clean['WD']
wind_speed = df_clean['WS']

ax = WindroseAxes.from_ax()
ax.bar(wind_direction, wind_speed, normed=True, opening=0.8, edgecolor='white')
ax.set_legend()
ax.set_title('Wind Rose Plot (Benin)')
plt.savefig('plots/benin_wind_rose.png')
plt.show()

In [None]:
# Side-by-side distributions of GHI (left) and wind speed (right)
fig, (ghi_ax, ws_ax) = plt.subplots(1, 2, figsize=(10, 5))

ghi_ax.hist(df_clean['GHI'], bins=30, edgecolor='black', color='lightblue')
ghi_ax.set_xlabel('GHI (W/m²)')
ghi_ax.set_ylabel('Frequency')
ghi_ax.set_title('GHI Distribution (Benin)')

ws_ax.hist(df_clean['WS'], bins=30, edgecolor='black', color='lightgreen')
ws_ax.set_xlabel('Wind Speed (m/s)')
ws_ax.set_ylabel('Frequency')
ws_ax.set_title('Wind Speed Distribution (Benin)')

fig.tight_layout()
fig.savefig('plots/benin_histograms.png')
plt.show()

In [None]:
# Scatter of relative humidity (x) against GHI (y)
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(df_clean['RH'], df_clean['GHI'], alpha=0.5, color='coral')
ax.set(
    xlabel='Relative Humidity (%)',
    ylabel='GHI (W/m²)',
    title='RH vs. GHI (Benin)',
)
fig.tight_layout()
fig.savefig('plots/benin_rh_ghi_scatter.png')
plt.show()

### Temperature Analysis
- RH vs. GHI: Higher relative humidity often correlates with lower GHI, likely due to cloud cover reducing solar irradiance.
- RH vs. Tamb: Lower temperatures are associated with higher humidity, indicating possible rainy or overcast conditions.