In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore

# Read the three site CSVs and discard each one's 'Comments' column.
# errors='ignore' turns the drop into a no-op when the column is absent.
data, data2, data3 = (
    pd.read_csv(csv_path).drop(columns=['Comments'], errors='ignore')
    for csv_path in (
        '../files/benin-malanville.csv',
        '../files/sierraleone-bumbuna.csv',
        '../files/togo-dapaong_qc.csv',
    )
)

# Columns of `data` to summarize; names missing from the frame are skipped.
columns = ['RH', 'WS', 'GHI', 'DNI', 'DHI']
statistics = {}
for col in columns:
    if col not in data.columns:
        continue
    series = data[col]
    statistics[col] = dict(
        Mean=series.mean(),
        Median=series.median(),
        Variance=series.var(),
        Skewness=series.skew(),
        Kurtosis=series.kurt(),
    )

# Flip the dict-of-dicts so each summarized column becomes one row and
# each statistic becomes one column.
stats_df = pd.DataFrame(statistics).T
print("Summary Statistics:\n", stats_df)

# Function to create a summary of missing values for a given dataset
def summarize_missing_values(dataset, file_name):
    """Return a per-column count of missing values for one dataset.

    Args:
        dataset: DataFrame whose columns are inspected for nulls.
        file_name: Label stored in the 'File' column of every returned row.

    Returns:
        A DataFrame with columns 'File', 'Column' and 'Missing Values Count'
        holding only the columns that have at least one missing value. Row
        labels are the positions of those columns in ``dataset``.
    """
    null_counts = dataset.isna().sum()
    summary = pd.DataFrame({
        'File': file_name,
        'Column': null_counts.index,
        'Missing Values Count': null_counts.to_numpy(),
    })
    # Keep only the columns that actually have gaps.
    has_missing = summary['Missing Values Count'].gt(0)
    return summary.loc[has_missing]

# Build one missing-value summary per source file.
missing_benin = summarize_missing_values(dataset=data, file_name="benin-malanville.csv")
missing_sierraleone = summarize_missing_values(dataset=data2, file_name="sierraleone-bumbuna.csv")
missing_togo = summarize_missing_values(dataset=data3, file_name="togo-dapaong_qc.csv")

# Stack the per-file summaries and renumber the rows from zero.
per_file_summaries = [missing_benin, missing_sierraleone, missing_togo]
combined_missing_data = pd.concat(per_file_summaries, ignore_index=True)

# Print the heading followed by the table, without the row index.
report_table = combined_missing_data.loc[:, ['File', 'Column', 'Missing Values Count']]
print(
    "\nColumns with Missing Values (File & Count):\n",
    report_table.to_string(index=False),
    sep="\n",
)

# ----------------------------------------
# Time Series Analysis
# ----------------------------------------

# Use a timestamp column as the index so the plots below get a real time axis.
# 'Time' is preferred; 'Timestamp' is accepted as the "similar" name so that
# the x-axis does not silently fall back to the row number while still being
# labelled "Time". If neither column exists the index is left untouched.
time_column = next(
    (name for name in ('Time', 'Timestamp') if name in data.columns),
    None,
)
if time_column is not None:
    data[time_column] = pd.to_datetime(data[time_column])
    data.set_index(time_column, inplace=True)

# Plot GHI, DNI, DHI, and Tamb against the index; columns that are not
# present in `data` are skipped.
time_series_columns = ['GHI', 'DNI', 'DHI', 'Tamb']
plt.figure(figsize=(12, 8))
for col in time_series_columns:
    if col in data.columns:
        plt.plot(data.index, data[col], label=col)

plt.title("Time Series of GHI, DNI, DHI, and Tamb")
plt.xlabel("Time")
plt.ylabel("Value")
plt.legend()
plt.grid()
plt.show()

# Compare ModA/ModB readings between rows flagged Cleaning == 1 and rows
# flagged Cleaning == 0. Skipped entirely when there is no 'Cleaning' column.
if 'Cleaning' in data.columns:
    cleaning_flag = data['Cleaning']
    cleaned_data = data.loc[cleaning_flag.eq(1)]
    uncleaned_data = data.loc[cleaning_flag.eq(0)]

    plt.figure(figsize=(12, 6))
    ax = plt.gca()
    for mod in ['ModA', 'ModB']:
        if mod not in data.columns:
            continue
        # Solid line for the cleaned rows, dashed line for the uncleaned rows.
        ax.plot(cleaned_data.index, cleaned_data[mod], label=f"{mod} (Cleaned)")
        ax.plot(uncleaned_data.index, uncleaned_data[mod], linestyle='--', label=f"{mod} (Uncleaned)")

    ax.set_title("Impact of Cleaning on Sensor Readings (ModA and ModB)")
    ax.set_xlabel("Time")
    ax.set_ylabel("Sensor Reading")
    ax.legend()
    ax.grid()
    plt.show()


FileNotFoundError: [Errno 2] No such file or directory: '../files/benin-malanville.csv'

# New Section