In [None]:
# 1️⃣ Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

%matplotlib inline

# 2️⃣ Load Dataset
df = pd.read_csv("../data/benin_raw.csv")
print("Shape:", df.shape)
# Only the last expression of a notebook cell is rendered automatically, so
# the preview is printed explicitly instead of being left as a bare
# `df.head()` whose result would be discarded.
print(df.head())

# 3️⃣ Basic Profiling
# Bare expressions in the middle of a cell are evaluated and then thrown
# away, so every summary is printed explicitly.
print(df.describe())
# Count of missing values per column.
print(df.isna().sum())
# Percentage of missing values per column; report only columns above 5 %.
missing_percent = df.isna().mean() * 100
print(missing_percent[missing_percent > 5])

# 4️⃣ Clean Data
# Fill numeric missing values with each column's median.
df = df.fillna(df.median(numeric_only=True))
# Fill categorical (object-dtype) missing values with each column's mode.
# The result is assigned back to the column: `df[col].fillna(...,
# inplace=True)` operates on an intermediate object, raises a FutureWarning,
# and stops modifying `df` in pandas 3.0.
for col in df.select_dtypes(include="object").columns:
    modes = df[col].mode()
    if modes.empty:
        # Column holds no non-missing values, so there is no mode to fill with.
        continue
    df[col] = df[col].fillna(modes.iloc[0])

# 5️⃣ Outlier Detection
# Columns screened for outliers.
numeric_cols = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']
# Absolute z-score of every value, computed column by column.
z_scores = np.abs(stats.zscore(df[numeric_cols]))
# Keep only rows whose z-score is below 3 in every screened column.
# .copy() makes df_clean an independent frame, so later column assignments
# on it do not write into a slice of df (SettingWithCopyWarning).
df_clean = df[(z_scores < 3).all(axis=1)].copy()
print("Shape after removing outliers:", df_clean.shape)

# 6️⃣ Visualizations
# Overlay the kernel-density curves of the GHI, DNI and DHI columns on one
# 10x6 figure, one labelled curve per column.
dist_fig, dist_ax = plt.subplots(figsize=(10, 6))
solar_metrics = ('GHI', 'DNI', 'DHI')
for metric in solar_metrics:
    sns.kdeplot(df_clean[metric], label=metric, ax=dist_ax)
dist_ax.set_title("Distribution of Solar Metrics")
dist_ax.legend()
plt.show()

# Time series plot
# Parse the timestamps so the x-axis is a real datetime axis. assign()
# returns a new frame rather than writing a column into df_clean in place,
# which avoids SettingWithCopyWarning when df_clean is a filtered slice of
# another DataFrame.
df_clean = df_clean.assign(Timestamp=pd.to_datetime(df_clean['Timestamp']))
plt.figure(figsize=(12, 5))
sns.lineplot(x='Timestamp', y='GHI', data=df_clean)
plt.title("GHI Over Time")
plt.show()

# Correlation heatmap
# Pairwise correlation of the screened numeric columns, drawn as an
# annotated heatmap on an 8x6 figure.
heat_fig, heat_ax = plt.subplots(figsize=(8, 6))
corr_matrix = df_clean[numeric_cols].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=heat_ax)
heat_ax.set_title("Correlation Between Solar Metrics")
plt.show()

# Scatter plot example
# Plot the WS column against the GHI column, one point per cleaned row.
scatter_ax = sns.scatterplot(data=df_clean, x='WS', y='GHI')
scatter_ax.set_title("Wind Speed vs GHI")
plt.show()

# 7️⃣ Save Cleaned Data
# Write the outlier-filtered frame to CSV without the index column.
clean_csv_path = "../data/benin_clean.csv"
df_clean.to_csv(clean_csv_path, index=False)
print("Cleaned file saved to data/ folder.")


Shape: (525600, 19)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


Shape after removing outliers: (517860, 19)
