# In [None]:
# 1️⃣ Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

%matplotlib inline

# 2️⃣ Load Dataset
# The path is relative to the working directory the notebook/script runs from.
df_togo = pd.read_csv("../data/togo_raw.csv")
print("Shape of Togo data:", df_togo.shape)
# A bare `df_togo.head()` is only rendered when it is the last expression of a
# notebook cell; here more statements follow, so print the preview explicitly.
print(df_togo.head())

# 3️⃣ Basic Profiling
# Summary statistics as produced by DataFrame.describe().
summary_stats = df_togo.describe()
print(summary_stats)

# Boolean mask of missing cells: its column sums are the missing counts and
# its column means (times 100) are the missing percentages.
null_mask = df_togo.isna()
print(null_mask.sum())

missing_percent = null_mask.mean() * 100
above_threshold = missing_percent > 5
print("Columns with >5% missing:", missing_percent[above_threshold])

# 4️⃣ Clean Data
# Fill numeric missing values with each column's median.
df_togo = df_togo.fillna(df_togo.median(numeric_only=True))

# Fill categorical (object-dtype) missing values with each column's mode.
for col in df_togo.select_dtypes(include="object").columns:
    col_modes = df_togo[col].mode()
    # mode() returns an empty Series when the column has no non-null values;
    # there is nothing to impute with in that case, so leave the column as is.
    if col_modes.empty:
        continue
    # Assign the result back rather than calling fillna(..., inplace=True) on
    # `df_togo[col]`: the in-place call acts on an intermediate Series, which
    # pandas warns about and which does not update the frame under
    # Copy-on-Write.
    df_togo[col] = df_togo[col].fillna(col_modes.iloc[0])

# 5️⃣ Outlier Detection
# Keep only rows whose absolute z-score is below 3 in every listed column.
numeric_cols = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']
# nan_policy="omit" computes each column's mean/std from its non-missing
# values, so a stray NaN does not turn the whole column's z-scores into NaN.
z_scores = np.abs(stats.zscore(df_togo[numeric_cols], nan_policy="omit"))
# A z-score is NaN when the value is missing or the column has zero variance.
# `NaN < 3` is False, which would discard every row; such cells are not
# evidence of an outlier, so let them pass the filter.
within_limits = (z_scores < 3) | np.isnan(z_scores)
# .copy() makes the result independent of df_togo, so later edits to the
# cleaned frame cannot trigger chained-assignment warnings.
df_togo_clean = df_togo[within_limits.all(axis=1)].copy()
print("Shape after removing outliers:", df_togo_clean.shape)

# 6️⃣ Optional Visualizations
# Overlay one KDE curve per column on a single figure; each curve is
# labelled with its column name for the legend.
plt.figure(figsize=(10, 6))
for metric in ('GHI', 'DNI', 'DHI'):
    sns.kdeplot(df_togo_clean[metric], label=metric)
plt.title("Togo Solar Metrics Distribution")
plt.legend()
plt.show()

# 7️⃣ Save Cleaned CSV
# Write the outlier-filtered frame to disk; index=False keeps the row index
# out of the file.
clean_csv_path = "../data/togo_clean.csv"
df_togo_clean.to_csv(clean_csv_path, index=False)
print("Cleaned Togo CSV saved to data/")
