# Exploratory Data Analysis for Solar Data Discovery Challenge (Togo Dataset)

This notebook performs data profiling, cleaning, and exploratory data analysis (EDA) on the Togo solar radiation dataset (`data/togo-dapaong_qc.csv`). It is designed for modularity and reusability, enabling analysis of Benin and Sierra Leone datasets by updating the configuration. The script addresses feedback on:

- **Code modularity**: Organized into functions for loading, profiling, cleaning, and EDA.
- **Reusability**: Configuration dictionary for dataset-specific parameters.
- **Documentation**: Comprehensive docstrings and comments.
- **Version control**: Demonstrated through Git commits and PRs.
- **Advanced functionality**: Plans for cross-country comparison and dashboard development.

**Outputs**:
- Cleaned dataset: `data/togo_clean.csv`
- Visualizations: `data/plots/` (time series, correlation heatmap, wind rose, bubble chart)

Imports

In [13]:
# Import required libraries for data processing and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from windrose import WindroseAxes 
import os

Setting the country name, input/output file paths, and column groups

In [22]:
# Dataset-specific settings. Pointing the pipeline at another country's data
# should only require editing the values below.
# NOTE(review): the raw file is read from '../data/' while the cleaned file is
# written under 'data/' (relative to the working directory) — confirm intended.
CONFIG = dict(
    country='togo',
    raw_data_file='../data/togo-dapaong_qc.csv',
    cleaned_data_file='data/togo_clean.csv',
    # Columns treated as numeric measurements (imputation, outlier checks).
    numeric_cols=[
        'GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'Tamb',
        'RH', 'WS', 'WSgust', 'WD', 'TModA', 'TModB',
    ],
    # Columns checked for negative readings and clipped at zero.
    irradiance_cols=['GHI', 'DNI', 'DHI', 'ModA', 'ModB'],
)

Loading the data

In [23]:
def load_data(file_path: str) -> pd.DataFrame:
    """Load dataset from CSV, validating file existence and timestamp format.

    The ``Timestamp`` column is always returned with a datetime dtype. Values
    that cannot be parsed as dates become ``NaT`` instead of silently leaving
    the whole column as plain strings.

    Args:
        file_path (str): Path to the raw CSV file.

    Returns:
        pd.DataFrame: Loaded dataset with parsed timestamps.

    Raises:
        FileNotFoundError: If the CSV file does not exist.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Dataset not found at {file_path}")
    df = pd.read_csv(file_path, parse_dates=['Timestamp'])
    # read_csv leaves the column unparsed when any single value fails to
    # parse; coerce in that case so bad entries surface as NaT.
    if not pd.api.types.is_datetime64_any_dtype(df['Timestamp']):
        df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')
    return df

Profile Data Function

In [24]:
def profile_data(df: pd.DataFrame, *, irradiance_cols=None, numeric_cols=None) -> dict:
    """Profile dataset, computing statistics, missing values, and quality issues.

    Columns named in either list that are absent from ``df`` are skipped
    rather than raising ``KeyError``.

    Args:
        df (pd.DataFrame): Input dataset.
        irradiance_cols (list of str, optional): Columns checked for negative
            values. Defaults to ``CONFIG['irradiance_cols']``.
        numeric_cols (list of str, optional): Columns checked for outliers.
            Defaults to ``CONFIG['numeric_cols']``.

    Returns:
        dict: Profiling results with keys ``'statistics'``,
        ``'missing_counts'``, ``'missing_percent'``, ``'negative_counts'``
        and ``'outlier_counts'``.
    """
    # Fall back to the notebook-level configuration only when the caller did
    # not pass explicit column lists.
    if irradiance_cols is None:
        irradiance_cols = CONFIG['irradiance_cols']
    if numeric_cols is None:
        numeric_cols = CONFIG['numeric_cols']

    # Summary statistics for all columns
    stats_summary = df.describe(include='all')

    # Missing values count and percentage
    missing_counts = df.isna().sum()
    missing_percent = (missing_counts / len(df) * 100).round(2)

    # Count negative values in the irradiance columns that are present
    negative_counts = {
        col: int((df[col] < 0).sum())
        for col in irradiance_cols
        if col in df.columns
    }

    # Count outliers (|Z-score| > 3) in the numeric columns that are present.
    # Missing values are dropped first so they do not turn every score into NaN.
    outlier_counts = {}
    for col in numeric_cols:
        if col in df.columns:
            z_scores = np.abs(stats.zscore(df[col].dropna()))
            outlier_counts[col] = int((z_scores > 3).sum())

    return {
        'statistics': stats_summary,
        'missing_counts': missing_counts,
        'missing_percent': missing_percent,
        'negative_counts': negative_counts,
        'outlier_counts': outlier_counts
    }

Cleaning data function

In [25]:
def clean_data(df: pd.DataFrame, *, irradiance_cols=None, numeric_cols=None,
               z_threshold=3) -> pd.DataFrame:
    """Clean dataset by handling negative values, missing data, and outliers.

    The input frame is not modified. Steps, in order: negative irradiance
    readings are clipped to 0, missing numeric values are replaced by the
    column median, rows without a ``Timestamp`` are dropped, and rows where
    any numeric column has ``|Z-score| > z_threshold`` are flagged.

    Columns named in either list that are absent from ``df`` are skipped.

    Args:
        df (pd.DataFrame): Input dataset. Must contain a ``Timestamp`` column.
        irradiance_cols (list of str, optional): Columns clipped at zero.
            Defaults to ``CONFIG['irradiance_cols']``.
        numeric_cols (list of str, optional): Columns imputed and checked for
            outliers. Defaults to ``CONFIG['numeric_cols']``.
        z_threshold (float, optional): Absolute Z-score above which a value
            counts as an outlier. Defaults to 3.

    Returns:
        pd.DataFrame: Cleaned dataset with a boolean ``outlier_flag`` column.
    """
    # Fall back to the notebook-level configuration only when the caller did
    # not pass explicit column lists.
    if irradiance_cols is None:
        irradiance_cols = CONFIG['irradiance_cols']
    if numeric_cols is None:
        numeric_cols = CONFIG['numeric_cols']

    df_clean = df.copy()

    # Clip negative irradiance values to 0 (missing values stay missing here)
    for col in irradiance_cols:
        if col in df_clean.columns:
            df_clean[col] = df_clean[col].clip(lower=0)

    # Impute missing numeric values with median
    for col in numeric_cols:
        if col in df_clean.columns and df_clean[col].isna().any():
            df_clean[col] = df_clean[col].fillna(df_clean[col].median())

    # Drop rows with missing Timestamp
    df_clean = df_clean.dropna(subset=['Timestamp'])

    # Flag outliers: a row is flagged when ANY numeric column exceeds the
    # threshold. NaN scores (e.g. a constant column) compare as False.
    flags = np.zeros(len(df_clean), dtype=bool)
    for col in numeric_cols:
        if col in df_clean.columns:
            z_scores = np.abs(stats.zscore(df_clean[col]))
            flags |= np.asarray(z_scores > z_threshold, dtype=bool)
    df_clean['outlier_flag'] = flags

    return df_clean

Perform EDA function

In [26]:
def perform_eda(df: pd.DataFrame, output_dir: str):
    """Render and save the exploratory plots for a dataset.

    Up to four PNG files are written into ``output_dir`` (created if it does
    not exist): a time series, a correlation heatmap, a wind rose and a bubble
    chart. The time series figure is always written; each of the other plots
    is skipped when any column it needs is absent from ``df``.

    Args:
        df (pd.DataFrame): Cleaned dataset.
        output_dir (str): Directory to save plots.
    """
    os.makedirs(output_dir, exist_ok=True)
    available = set(df.columns)

    # --- Time series: one line per available irradiance/temperature column ---
    ts_fig, ts_ax = plt.subplots(figsize=(12, 6))
    for series_name in ('GHI', 'DNI', 'DHI', 'Tamb'):
        if series_name not in available:
            continue
        ts_ax.plot(df['Timestamp'], df[series_name], label=series_name)
    ts_ax.set_title('Time Series of Irradiance and Temperature')
    ts_ax.set_xlabel('Timestamp')
    ts_ax.set_ylabel('Value')
    ts_ax.legend()
    ts_fig.savefig(f'{output_dir}/time_series.png')
    plt.close(ts_fig)

    # --- Correlation heatmap: requires every listed column ---
    heatmap_cols = ['GHI', 'DNI', 'DHI', 'TModA', 'TModB']
    if set(heatmap_cols) <= available:
        hm_fig, hm_ax = plt.subplots(figsize=(8, 6))
        corr_matrix = df[heatmap_cols].corr()
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=hm_ax)
        hm_ax.set_title('Correlation Matrix')
        hm_fig.savefig(f'{output_dir}/correlation_heatmap.png')
        plt.close(hm_fig)

    # --- Wind rose: wind speed binned by wind direction ---
    if {'WS', 'WD'} <= available:
        rose_fig = plt.figure(figsize=(8, 8))
        rose_ax = WindroseAxes.from_ax(fig=rose_fig)
        rose_ax.bar(df['WD'], df['WS'], normed=True, opening=0.8, edgecolor='white')
        rose_ax.set_legend()
        rose_ax.set_title('Wind Rose Plot')
        rose_fig.savefig(f'{output_dir}/wind_rose.png')
        plt.close(rose_fig)

    # --- Bubble chart: GHI against Tamb, marker size and colour from RH ---
    if {'GHI', 'Tamb', 'RH'} <= available:
        bubble_fig, bubble_ax = plt.subplots(figsize=(10, 6))
        humidity = df['RH']
        points = bubble_ax.scatter(
            df['Tamb'], df['GHI'],
            s=humidity * 10, c=humidity, cmap='viridis', alpha=0.5,
        )
        bubble_fig.colorbar(points, ax=bubble_ax, label='Relative Humidity (%)')
        bubble_ax.set_xlabel('Ambient Temperature (°C)')
        bubble_ax.set_ylabel('GHI (W/m²)')
        bubble_ax.set_title('GHI vs Temperature (Bubble Size/Color by RH)')
        bubble_fig.savefig(f'{output_dir}/bubble_chart.png')
        plt.close(bubble_fig)

Main pipeline: load, profile, clean, and run the EDA (including the time series analysis)

In [27]:
def main():
    """Execute the EDA pipeline for the configured dataset.

    Loads the raw file named in ``CONFIG``, prints the profiling results,
    cleans the data, writes the cleaned CSV, and saves the EDA plots under
    ``data/plots``. Any exception raised along the way is caught and reported
    as a single ``Error: ...`` line instead of propagating.
    """
    try:
        # Load data
        df = load_data(CONFIG['raw_data_file'])

        # Profile data
        profile = profile_data(df)
        print("Profiling Results:")
        print("Statistics:\n", profile['statistics'])
        print("Missing Values:\n", profile['missing_counts'])
        print("Negative Counts:\n", profile['negative_counts'])
        print("Outlier Counts:\n", profile['outlier_counts'])

        # Clean data
        df_clean = clean_data(df)

        # Save cleaned data. A bare file name has an empty dirname, and
        # os.makedirs('') raises, so only create the directory when there is one.
        cleaned_dir = os.path.dirname(CONFIG['cleaned_data_file'])
        if cleaned_dir:
            os.makedirs(cleaned_dir, exist_ok=True)
        df_clean.to_csv(CONFIG['cleaned_data_file'], index=False)

        # Perform EDA
        perform_eda(df_clean, 'data/plots')

        print("EDA completed. Outputs saved in data/ directory.")

    except Exception as e:
        # Best-effort notebook entry point: report the failure and carry on.
        print(f"Error: {str(e)}")

# Run the pipeline only when executed directly (notebook cell or script),
# not when this code is imported as a module.
if __name__ == "__main__":
    main()

Profiling Results:
Statistics:
                            Timestamp            GHI            DNI  \
count                         525600  525600.000000  525600.000000   
mean   2022-04-25 12:00:30.000000768     230.555040     151.258469   
min              2021-10-25 00:01:00     -12.700000       0.000000   
25%              2022-01-24 06:00:45      -2.200000       0.000000   
50%              2022-04-25 12:00:30       2.100000       0.000000   
75%              2022-07-25 18:00:15     442.400000     246.400000   
max              2022-10-25 00:00:00    1424.000000    1004.500000   
std                              NaN     322.532347     250.956962   

                 DHI           ModA           ModB           Tamb  \
count  525600.000000  525600.000000  525600.000000  525600.000000   
mean      116.444352     226.144375     219.568588      27.751788   
min         0.000000       0.000000       0.000000      14.900000   
25%         0.000000       0.000000       0.000000      24.20