In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Read the raw dataset from disk.
df = pd.read_csv('../data/benin.csv')

# Drop sparse columns: count the missing values per column and remove every
# column whose count exceeds 5% of the number of rows. The surviving columns
# form df_clean; df itself is left untouched.
missing = df.isnull().sum()
threshold = len(df) * 0.05
cols_to_drop = missing.loc[missing > threshold].index
df_clean = df.drop(cols_to_drop, axis=1)

# Impute median for key columns
# Each key column has its missing values replaced by that column's median.
# The filled column is assigned back explicitly: calling
# fillna(..., inplace=True) on a column selection is chained assignment,
# which pandas 2.x warns about and which does not modify df_clean when
# Copy-on-Write is enabled.
key_cols = ['GHI', 'DNI', 'DHI', 'WS']
for col in key_cols:
    df_clean[col] = df_clean[col].fillna(df_clean[col].median())

# Remove outliers using Z-scores
# A row is kept only if the absolute z-score of every key column is below 3.
z_scores = np.abs(stats.zscore(df_clean[key_cols]))
# .copy() turns the filtered selection into an independent DataFrame so that
# assigning columns to it afterwards does not raise SettingWithCopyWarning.
df_clean = df_clean[(z_scores < 3).all(axis=1)].copy()

# Daily GHI trend plot
# Parse the Timestamp column to datetimes and make it the index so the
# series can be resampled by calendar day.
timestamps = pd.to_datetime(df_clean['Timestamp'])
df_clean['Timestamp'] = timestamps
df_clean.set_index('Timestamp', inplace=True)

# Average GHI per day, then draw and display the line chart.
daily_ghi = df_clean['GHI'].resample('D').mean()
daily_ghi.plot(title='Daily GHI Trends')
plt.show()

# Export cleaned data
# Timestamp is the DataFrame index at this point (set via set_index above),
# so the index must be written out; index=False would silently drop the
# timestamps from the exported file.
df_clean.to_csv('../data/benin_clean.csv', index=True)