In [None]:
# 1️⃣ Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

%matplotlib inline

# 2️⃣ Load Dataset
# Read the raw Sierra Leone CSV (path is relative to the notebook's directory).
df_sierra = pd.read_csv("../data/sierra_raw.csv")
print("Shape of Sierra Leone data:", df_sierra.shape)
# A bare `df_sierra.head()` is only rendered when it is the last expression of
# a cell; here more statements follow, so print the preview explicitly.
print(df_sierra.head())

# 3️⃣ Basic Profiling
# Summary statistics as produced by DataFrame.describe().
print(df_sierra.describe())

# Boolean mask of missing cells, computed once and reused below.
null_mask = df_sierra.isna()

# Number of missing cells per column.
print(null_mask.sum())

# Percentage of missing cells per column; report only columns above 5%.
missing_percent = null_mask.mean() * 100
print("Columns with >5% missing:", missing_percent[missing_percent > 5])

# 4️⃣ Clean Data
# Fill numeric missing values with each column's median.
# (A column with no non-null values has a NaN median and is left unchanged.)
numeric_medians = df_sierra.median(numeric_only=True)
df_sierra = df_sierra.fillna(numeric_medians)

# Fill categorical (object-dtype) missing values with each column's mode.
for col in df_sierra.select_dtypes(include="object").columns:
    col_modes = df_sierra[col].mode()
    if col_modes.empty:
        # Column has no non-null values, so there is no mode to fill with.
        continue
    # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
    # the chained in-place form operates on an intermediate object, is
    # deprecated in pandas 2.x and does not update the frame under
    # Copy-on-Write.
    df_sierra[col] = df_sierra[col].fillna(col_modes.iloc[0])

# 5️⃣ Outlier Detection
# Columns screened for outliers.
numeric_cols = [
    'GHI', 'DNI', 'DHI',
    'ModA', 'ModB',
    'WS', 'WSgust',
]

# Absolute z-score of every value in the screened columns.
raw_z = stats.zscore(df_sierra[numeric_cols])
z_scores = np.abs(raw_z)

# Keep a row only when all screened columns have |z| < 3.
# NOTE: a NaN z-score compares False against 3, so such rows are dropped too.
rows_within_limits = (z_scores < 3).all(axis=1)
df_sierra_clean = df_sierra[rows_within_limits]
print("Shape after removing outliers:", df_sierra_clean.shape)

# 6️⃣ Optional Visualizations
# Overlay the kernel-density estimates of the three irradiance columns
# on a single figure, one labelled curve per column.
plt.figure(figsize=(10, 6))
for metric in ('GHI', 'DNI', 'DHI'):
    sns.kdeplot(df_sierra_clean[metric], label=metric)
plt.title("Sierra Leone Solar Metrics Distribution")
plt.legend()
plt.show()

# 7️⃣ Save Cleaned CSV
# Write the outlier-filtered frame next to the raw data, without the index column.
clean_csv_path = "../data/sierra_clean.csv"
df_sierra_clean.to_csv(clean_csv_path, index=False)
print("Cleaned Sierra Leone CSV saved to data/")
