In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
%matplotlib inline  # Ensures plots display in the notebook

In [None]:
# Read the raw CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='data/benin.csv')

In [None]:
# --- Compute -----------------------------------------------------------
# Descriptive statistics as produced by DataFrame.describe().
summary_stats = df.describe()

# Per-column count of missing cells.
missing_values = df.isna().sum()

# Same counts expressed as a percentage of the total number of rows,
# then narrowed to the columns where more than 5% of the rows are missing.
missing_percentage = 100 * (missing_values / len(df))
high_missing = missing_percentage.loc[missing_percentage > 5]

# --- Report ------------------------------------------------------------
print(summary_stats)
print("\nMissing Values:\n", missing_values)
print("\nColumns with >5% missing values:\n", high_missing)

In [None]:
# Columns that are screened for extreme values.
key_columns = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']

# Standardize every screened column independently; nan_policy='omit' asks
# zscore to leave NaNs out of the statistics it computes.
z_scores = df[key_columns].apply(lambda series: zscore(series, nan_policy='omit'))

# Boolean row mask: True as soon as any screened column has |z| above 3.
# (Comparisons against NaN are False, so missing cells never flag a row.)
outliers = z_scores.abs().gt(3).any(axis=1)

print(f"Number of outlier rows (|Z|>3): {outliers.sum()}")
print("\nOutlier rows:\n", df.loc[outliers, key_columns])

In [None]:
# Fill gaps in the screened columns with each column's own median.
# NOTE: this updates `df` itself, so the outlier mask must already have been
# computed from the un-imputed data before this cell runs.
df[key_columns] = df[key_columns].fillna(df[key_columns].median())

# Keep only the rows that were not flagged as outliers. This step is
# optional: skip it if the flagged readings are physically plausible.
df_clean = df.loc[~outliers].copy()

# Sanity check: remaining missing cells per column in the cleaned frame.
print("\nMissing values after cleaning:\n", df_clean.isna().sum())

In [None]:
# Persist the cleaned frame; index=False keeps the row index out of the file.
df_clean.to_csv(path_or_buf='data/benin_clean.csv', index=False)