In [17]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
try:
    script_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    script_dir = os.getcwd()

In [22]:
# Set folder paths
# data_dir = os.path.join(script_dir, "data")
data_dir = os.path.abspath(os.path.join(script_dir, "..", "data"))

output_dir = os.path.abspath(os.path.join(script_dir,"..", "Output"))
plots_dir = os.path.join(output_dir, "plots")
summary_dir = os.path.join(output_dir, "summary")
processed_data_dir = os.path.join(output_dir, "processed")

In [23]:
# Create output directories if they don't exist
for folder in [output_dir, plots_dir, summary_dir, processed_data_dir]:
    os.makedirs(folder, exist_ok=True)

In [24]:
# Load datasets
files = ["benin-malanville.csv", "sierraleone-bumbuna.csv", "togo-dapaong_qc.csv"]
datasets = {}
for file in files:
    file_path = os.path.join(data_dir, file)
    print(f"Looking for file: {file_path}")
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue
    datasets[file.split(".")[0]] = pd.read_csv(file_path)

Looking for file: d:\KAIM\Moonlight-EDA-Dashboard\data\benin-malanville.csv
Looking for file: d:\KAIM\Moonlight-EDA-Dashboard\data\sierraleone-bumbuna.csv
Looking for file: d:\KAIM\Moonlight-EDA-Dashboard\data\togo-dapaong_qc.csv


In [25]:
# Function to save plots
def save_plot(fig, filename):
    fig.savefig(os.path.join(plots_dir, filename), bbox_inches="tight")
    plt.close(fig)


In [26]:
# Function for summary statistics
def generate_summary_statistics(df, filename):
    summary = df.describe().T
    summary["missing"] = df.isnull().sum()
    summary.to_csv(os.path.join(summary_dir, filename))
    return summary

In [27]:
# Perform EDA on each dataset
for name, df in datasets.items():
    print(f"Processing dataset: {name}")

Processing dataset: benin-malanville
Processing dataset: sierraleone-bumbuna
Processing dataset: togo-dapaong_qc


In [28]:
 # Summary statistics
summary = generate_summary_statistics(df, f"summary_{name}.csv")
print(summary)

                  count        mean         std    min    25%    50%      75%  \
GHI            525600.0  230.555040  322.532347  -12.7   -2.2    2.1  442.400   
DNI            525600.0  151.258469  250.956962    0.0    0.0    0.0  246.400   
DHI            525600.0  116.444352  156.520714    0.0    0.0    2.5  215.700   
ModA           525600.0  226.144375  317.346938    0.0    0.0    4.4  422.525   
ModB           525600.0  219.568588  307.932510    0.0    0.0    4.3  411.000   
Tamb           525600.0   27.751788    4.758023   14.9   24.2   27.2   31.100   
RH             525600.0   55.013160   28.778732    3.3   26.5   59.3   80.800   
WS             525600.0    2.368093    1.462668    0.0    1.4    2.2    3.200   
WSgust         525600.0    3.229490    1.882565    0.0    1.9    2.9    4.400   
WSstdev        525600.0    0.557740    0.268923    0.0    0.4    0.5    0.700   
WD             525600.0  161.741845   91.877217    0.0   74.8  199.1  233.500   
WDstdev        525600.0   10

In [29]:
# Data quality check
print("Checking for missing values...")
missing_values = df.isnull().sum()
print(missing_values)

Checking for missing values...
Timestamp             0
GHI                   0
DNI                   0
DHI                   0
ModA                  0
ModB                  0
Tamb                  0
RH                    0
WS                    0
WSgust                0
WSstdev               0
WD                    0
WDstdev               0
BP                    0
Cleaning              0
Precipitation         0
TModA                 0
TModB                 0
Comments         525600
dtype: int64


In [30]:
print("Checking for outliers...")
numeric_cols = df.select_dtypes(include=[np.number]).columns
if not numeric_cols.empty:
    z_scores = np.abs(zscore(df[numeric_cols].dropna()))
    outliers = (z_scores > 3).sum(axis=0)
    print(outliers)

Checking for outliers...
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [31]:
# Time Series Analysis
time_cols = ["GHI", "DNI", "DHI", "Tamb"]
time_cols = [col for col in time_cols if col in df.columns]
if "Date" in df.columns:
    df["Date"] = pd.to_datetime(df["Date"])
    df.set_index("Date", inplace=True)
    fig, ax = plt.subplots(figsize=(10, 6))
    df[time_cols].plot(ax=ax, title=f"Time Series Analysis: {name}")
    save_plot(fig, f"time_series_{name}.png")

In [32]:
# Correlation Analysis
if not df[numeric_cols].empty:
    corr = df[numeric_cols].corr()
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(corr, annot=True, cmap="coolwarm", ax=ax)
    ax.set_title(f"Correlation Matrix: {name}")
    save_plot(fig, f"correlation_matrix_{name}.png")

In [33]:
# Wind Analysis
if "WS" in df.columns and "WD" in df.columns:
    fig, ax = plt.subplots(figsize=(8, 8))
    sns.histplot(df["WS"], kde=True, ax=ax)
    ax.set_title(f"Wind Speed Distribution: {name}")
    save_plot(fig, f"wind_speed_distribution_{name}.png")

In [34]:
# Histograms
fig, axes = plt.subplots(len(numeric_cols) // 3 + 1, 3, figsize=(15, 10))
axes = axes.flatten()
for i, col in enumerate(numeric_cols):
    sns.histplot(df[col], kde=True, ax=axes[i])
    axes[i].set_title(f"Histogram: {col}")
save_plot(fig, f"histograms_{name}.png")

In [35]:
# Bubble Chart
bubble_vars = ["GHI", "Tamb", "WS", "RH"]
bubble_vars = [col for col in bubble_vars if col in df.columns]
if len(bubble_vars) >= 3:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(
        x=bubble_vars[0],
        y=bubble_vars[1],
        size=bubble_vars[2],
        hue=bubble_vars[2],
        sizes=(40, 400),
        data=df,
        ax=ax,
    )
    ax.set_title(f"Bubble Chart: {name}")
    save_plot(fig, f"bubble_chart_{name}.png")

In [36]:
# Data Cleaning
if "Comments" in df.columns:
    df.drop(columns=["Comments"], inplace=True)
df.fillna(method="ffill", inplace=True)
df.fillna(method="bfill", inplace=True)
processed_file_path = os.path.join(processed_data_dir, f"cleaned_{name}.csv")
df.to_csv(processed_file_path, index=False)

print(f"Finished processing dataset: {name}\n")

  df.fillna(method="ffill", inplace=True)
  df.fillna(method="bfill", inplace=True)


Finished processing dataset: togo-dapaong_qc

