In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore

# Read the raw CSV into a DataFrame; the path is relative to the notebook's
# working directory, so data/benin.csv must exist there.
df = pd.read_csv(filepath_or_buffer="data/benin.csv")
df.head(n=5)


In [None]:
# Summary statistics (DataFrame.describe with its default settings)
print("Summary Statistics:")
summary = df.describe()
display(summary)

# Missing value report: NaN count per column, then expressed as a
# percentage of the total number of rows
missing = df.isna().sum()
missing_percent = missing.div(len(df)).mul(100)
print("Columns with >5% missing:")
display(missing_percent.loc[missing_percent.gt(5)])


In [None]:
# Columns to check for outliers
cols_to_check = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']

# Absolute Z-scores per column, computed with pandas so that missing values
# are skipped value by value (mean/std ignore NaN by default). Dropping whole
# rows before scoring would leave any row with a gap in one column completely
# unscreened, even if another column in that row holds an extreme reading.
# ddof=0 (population std) matches the default of scipy.stats.zscore.
checked = df[cols_to_check]
z_scores = ((checked - checked.mean()) / checked.std(ddof=0)).abs()

# Flag rows with any column having Z > 3. A NaN Z-score compares as False,
# so a missing value never raises the flag by itself.
outlier_flags = (z_scores > 3).any(axis=1)

# Mark in original dataframe; outlier_flags shares df's index, so the
# assignment lines up row for row and yields a plain boolean column.
df['outlier_flag'] = outlier_flags


In [None]:
# Build the cleaned frame without touching df: fillna returns a new
# DataFrame in which missing values are replaced by the column medians.
# Columns that have no numeric median are left as they are.
df_clean = df.fillna(df.median(numeric_only=True))

# Remove outlier rows. The explicit .copy() makes df_clean an independent
# frame rather than a slice of the filled one, so assigning columns to it
# later does not raise SettingWithCopyWarning.
keep_rows = ~df_clean['outlier_flag'].astype(bool)
df_clean = df_clean.loc[keep_rows].copy()

# Preview cleaned data
df_clean.head()


In [None]:
# Write the cleaned frame to data/benin_clean.csv; index=False keeps the
# row index out of the file.
df_clean.to_csv(path_or_buf='data/benin_clean.csv', index=False)

print("✅ Cleaned data saved as: data/benin_clean.csv")


In [None]:
# Convert the Timestamp column to datetimes so it can serve as the plot index
df_clean['Timestamp'] = pd.to_datetime(df_clean['Timestamp'])

# Line plot of the selected columns against Timestamp, configured through
# the Axes object that DataFrame.plot returns
ts_columns = ['GHI', 'DNI', 'DHI', 'Tamb']
ts_ax = df_clean.set_index('Timestamp')[ts_columns].plot(figsize=(12, 5))
ts_ax.set_title("Daily Solar Measurements Over Time")
ts_ax.set_ylabel("Measurement")
ts_ax.set_xlabel("Time")
ts_ax.grid(True)
plt.show()


In [None]:
# Pairwise correlation (DataFrame.corr defaults) of the selected columns,
# drawn as an annotated heatmap
corr_columns = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']
corr_matrix = df_clean[corr_columns].corr()

heat_fig, heat_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", ax=heat_ax)
heat_ax.set_title("Correlation Heatmap of Solar & Weather Variables")
plt.show()


In [None]:
# Box plot of ModA split by the outlier flag on the unfiltered frame `df`.
# The two boxes are therefore "flagged rows" (True) versus "unflagged rows"
# (False); this is not a before/after comparison of the cleaning step, so
# the title describes the grouping that is actually plotted.
sns.boxplot(data=df, x='outlier_flag', y='ModA')
plt.title("ModA: Flagged Outliers (True) vs Retained Rows (False)")
plt.xlabel("Outlier Flag")
plt.ylabel("ModA")
plt.show()


In [None]:
# Scatter of WS against GHI on the cleaned frame; labels are set on the
# Axes object that seaborn returns
ws_ax = sns.scatterplot(data=df_clean, x='WS', y='GHI')
ws_ax.set_title("Wind Speed vs Global Horizontal Irradiance (GHI)")
ws_ax.set_xlabel("Wind Speed")
ws_ax.set_ylabel("GHI")
plt.show()


In [None]:
# Histogram of the cleaned GHI column (30 bins) with a KDE curve overlaid
ghi_values = df_clean['GHI']

hist_fig, hist_ax = plt.subplots(figsize=(8, 5))
sns.histplot(ghi_values, kde=True, bins=30, ax=hist_ax)
hist_ax.set_title("Distribution of GHI Values")
hist_ax.set_xlabel("GHI")
hist_ax.set_ylabel("Frequency")
plt.show()


In [None]:
# Scatter of RH against Tamb on the cleaned frame, with a background grid
rh_ax = sns.scatterplot(data=df_clean, x='RH', y='Tamb')
rh_ax.set_title("Humidity vs Ambient Temperature")
rh_ax.set_xlabel("Relative Humidity (%)")
rh_ax.set_ylabel("Temperature (°C)")
rh_ax.grid(True)
plt.show()


In [None]:
# Bubble chart: GHI on x, Tamb on y, marker size taken from the RH column;
# alpha=0.5 keeps overlapping markers visible
bubble_fig, bubble_ax = plt.subplots(figsize=(8, 6))
bubble_ax.scatter(df_clean['GHI'], df_clean['Tamb'], s=df_clean['RH'], alpha=0.5)
bubble_ax.set_title("GHI vs Tamb with RH as Bubble Size")
bubble_ax.set_xlabel("GHI")
bubble_ax.set_ylabel("Tamb")
bubble_ax.grid(True)
plt.show()
