In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os

In [None]:
# Function to clean data before analysis
def clean_data(data):
    """
    Cleans solar radiation data by handling missing values, duplicates, and invalid data.

    Steps, in order:
      1. Replace +/-inf with NaN and parse 'Timestamp'; values that cannot be
         parsed become NaT (those rows are kept, not dropped).
      2. Drop exact duplicate rows.
      3. Fill missing numeric values with the mean of their column.
      4. Drop columns that hold no data at all (e.g. 'Comments').

    Duplicates are removed *before* imputing so that repeated rows do not
    bias the column means used as fill values.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw measurements. Must contain a 'Timestamp' column.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy of the input. The original row index labels are kept
        (the index is not reset after duplicates are dropped).

    Raises
    ------
    KeyError
        If `data` has no 'Timestamp' column.
    """
    # replace() returns a new frame, so nothing below touches the caller's object.
    cleaned = (
        data
        .replace([np.inf, -np.inf], np.nan)
        .assign(Timestamp=lambda df: pd.to_datetime(df['Timestamp'], errors='coerce'))
        .drop_duplicates()
    )

    # Impute missing numeric values with the column mean. fillna() with a Series
    # fills each column from the entry carrying the same label; columns that are
    # entirely NaN have a NaN mean and therefore stay empty until the next step.
    column_means = cleaned.select_dtypes(include=[np.number]).mean()
    cleaned = cleaned.fillna(column_means)

    # Handle entirely null columns, like 'Comments'
    null_columns = cleaned.columns[cleaned.isnull().sum() == len(cleaned)]
    return cleaned.drop(columns=null_columns)

In [None]:
def _plot_time_series(frame, columns, title, ylabel="Value"):
    """
    Draws one line per column in `columns` against the frame's index and shows the figure.
    """
    plt.figure(figsize=(12, 6))
    sns.lineplot(data=frame[columns])
    plt.title(title)
    plt.xlabel("Time")
    plt.ylabel(ylabel)
    plt.show()


def perform_eda(data, location_name):
    """
    Performs exploratory data analysis on solar radiation data for a specific location.

    The frame is cleaned with `clean_data`, indexed by 'Timestamp', and then
    described through printed tables and a fixed sequence of figures:
    summary statistics, remaining missing values, irradiance/temperature
    time series, a correlation heatmap, wind and temperature time series,
    histograms, a box plot and two scatter plots.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw measurements. Besides 'Timestamp' the plots read the columns
        GHI, DNI, DHI, Tamb, WS, WSgust, WSstdev, WD, WDstdev, TModA, TModB
        and Precipitation.
    location_name : str
        Label used in the printed headings and in every plot title.

    Returns
    -------
    None
        All output is printed or plotted.

    Raises
    ------
    KeyError
        If one of the columns listed above is missing from the cleaned frame.
    """
    # Clean the data
    data = clean_data(data)

    # Must be checked before indexing: an empty frame cannot be analysed.
    if data.empty:
        print(f"No data available after cleaning for {location_name}.")
        return

    # Set 'Timestamp' as the index for time-series analysis
    data = data.set_index('Timestamp')

    # 1. Summary Statistics
    summary_stats = data.describe()
    print(f"\nSummary statistics for {location_name}:")
    print(summary_stats)

    # 2. Data Quality Check
    # Runs on the cleaned frame, so numeric gaps have already been imputed;
    # anything reported here is what cleaning could not fill.
    missing_values = data.isnull().sum()
    print(f"\nMissing values for {location_name}:")
    print(missing_values)

    # 3. Time Series Analysis
    variables = ['GHI', 'DNI', 'DHI', 'Tamb']
    _plot_time_series(data, variables, f"Time-series trends for {location_name}")

    # 4. Correlation Analysis
    # Restrict to numeric columns: corr() on a frame that still holds a
    # non-numeric column raises on pandas >= 2.0.
    correlation_matrix = data.select_dtypes(include='number').corr()
    plt.figure(figsize=(12, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
    plt.title(f"Correlation Matrix for {location_name}")
    plt.show()

    # 5. Wind Analysis
    wind_variables = ['WS', 'WSgust', 'WSstdev', 'WD', 'WDstdev']
    _plot_time_series(data, wind_variables, f"Wind Speed and Direction for {location_name}")

    # 6. Temperature Analysis
    temp_variables = ['TModA', 'TModB', 'Tamb']
    _plot_time_series(
        data,
        temp_variables,
        f"Module and Ambient Temperatures for {location_name}",
        ylabel="Temperature (°C)",
    )

    # 7. Histograms
    histogram_variables = ['GHI', 'DNI', 'DHI', 'WS', 'Tamb']
    for var in histogram_variables:
        plt.figure(figsize=(12, 6))
        sns.histplot(data[var], kde=True)
        plt.title(f"Histogram of {var} for {location_name}")
        plt.xlabel(var)
        plt.ylabel("Count")
        plt.show()

    # 8. Box Plots
    plt.figure(figsize=(12, 6))
    sns.boxplot(data=data[variables])
    plt.title(f"Box Plot for {location_name}")
    plt.show()

    # 9. Scatter Plots (points coloured by 'Precipitation')
    scatter_pairs = [('GHI', 'Tamb'), ('WS', 'WSgust')]
    for x, y in scatter_pairs:
        plt.figure(figsize=(12, 6))
        sns.scatterplot(data=data, x=x, y=y, hue='Precipitation', palette='viridis', s=20)
        plt.title(f"Scatter Plot of {x} vs {y} for {location_name}")
        plt.xlabel(x)
        plt.ylabel(y)
        plt.show()

In [None]:
# Load data from CSV files.
# If a file is missing, report the underlying error (which names the path) and
# re-raise so that "Run All" stops at this cell with a traceback. Calling exit()
# inside a notebook would terminate the kernel and discard all existing state.
try:
    benin_data = pd.read_csv("benin-malanville.csv")
    sierra_leone_data = pd.read_csv("sierraleone-bumbuna.csv")
    togo_data = pd.read_csv("togo-dapaong_qc.csv")
except FileNotFoundError as err:
    print(f"Error: One or more CSV files not found. ({err})")
    raise

In [None]:
# Run the exploratory analysis for the Benin dataset.
# perform_eda cleans the frame itself, so the raw frame is passed as loaded.
perform_eda(benin_data, "Benin - Malanville")