In [2]:
# Cell 1: Notebook Introduction and Setup
"""
EnergyNexus Data Quality Assessment Notebook
Aditya's MSc Project - Comprehensive Data Quality Analysis and Validation

NOTEBOOK PURPOSE:
This notebook performs comprehensive data quality assessment to ensure the reliability
and accuracy of my energy forecasting system. Data quality is fundamental because:

1. Poor data quality directly degrades forecasting model performance
2. Energy system reliability depends on accurate and consistent data
3. Anomalies and outliers can indicate equipment failures or data collection issues
4. Data gaps and inconsistencies affect model training and validation
5. Quality metrics inform data preprocessing and cleaning strategies

MY DATA QUALITY ASSESSMENT STRATEGY:
1. Comprehensive missing data analysis and pattern identification
2. Outlier detection using multiple statistical and domain-specific methods
3. Data consistency validation across temporal and logical constraints
4. Data integrity checks for energy balance and physical feasibility
5. Temporal continuity assessment for time series modeling requirements
6. Data completeness evaluation for different modeling scenarios

WHY DATA QUALITY IS CRITICAL FOR MY PROJECT:
- LSTM models require consistent, high-quality time series data
- Energy optimization depends on accurate generation and demand data
- Forecasting accuracy is directly proportional to input data quality
- System reliability analysis requires comprehensive historical data
- Operational decisions based on poor data can lead to grid instability

Author: Aditya Talekar (ec24018@qmul.ac.uk)
Supervisor: Saqib Iqbal
QMUL MSc Data Science and AI - 2024/25
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.signal import find_peaks
import warnings
from datetime import datetime, timedelta
import sys
import os

# Library warnings are silenced so the rendered notebook stays clean for the thesis
warnings.filterwarnings('ignore')

# Publication-quality plotting defaults: style sheet, colour palette, then the
# figure/font sizes applied in a single rcParams update
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("Set1")
plt.rcParams.update({
    'figure.figsize': (14, 8),
    'font.size': 11,
    'axes.titlesize': 14,
    'axes.labelsize': 12,
})

# The project's src directory (two levels up) is made importable for custom
# data quality modules
sys.path.append(os.path.join('..', '..', 'src'))

# Banner announcing the assessment run and its start time
print("EnergyNexus Comprehensive Data Quality Assessment")
print("=" * 55)
print(f"Assessment started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("Objective: Validate data quality for energy forecasting and optimization")


# Cell 2: Data Loading and Initial Quality Overview
"""
I load the energy dataset and perform initial quality assessment to understand
the scope and nature of potential data quality issues.

INITIAL QUALITY ASSESSMENT:
- Load complete dataset and validate structure
- Calculate basic quality metrics across all variables
- Identify critical quality issues requiring immediate attention
- Establish baseline quality scores for improvement tracking
"""

def load_energy_data():
    """Load hourly demand from the EIA API and join it with Open-Meteo weather.

    The function reads the EIA API key from ``../../config/config.yaml``,
    downloads hourly demand rows, downloads hourly weather for the same date
    span, inner-joins both on timestamp, and then adds simulated solar, wind
    and price columns plus calendar features.

    Returns:
        tuple: ``(combined_data, "Real API data")`` on success, where
        ``combined_data`` is a timestamp-indexed DataFrame. If anything fails
        (missing config, missing ``requests``/``yaml`` package, network or
        HTTP error, empty payload, empty join) the reason is printed and
        ``(None, None)`` is returned so the caller can fall back to generated
        sample data.
    """
    request_timeout_s = 30  # never let a stalled connection hang the notebook

    def _zscore(values):
        # NaN-aware standardisation. When the spread is zero or undefined the
        # factor collapses to 0 (NaN inputs stay NaN) instead of dividing by 0.
        spread = np.nanstd(values)
        if not spread > 0:
            return values * 0.0
        return (values - np.nanmean(values)) / spread

    try:
        import requests
        import yaml

        # Try to load API configuration
        config_path = '../../config/config.yaml'
        if not os.path.exists(config_path):
            raise FileNotFoundError("Config file not found")
        with open(config_path, encoding='utf-8') as f:
            config = yaml.safe_load(f)

        # 1. Load EIA energy data
        print("Fetching real energy data from EIA API...")
        eia_url = "https://api.eia.gov/v2/electricity/rto/region-data/data/"
        # EIA v2 expects array/object parameters in bracket notation. Passing
        # nested dicts/lists to requests flattens them to their keys
        # (e.g. "facets=type"), so the keys are spelled out explicitly here.
        params = {
            'api_key': config['eia']['api_key'],
            'frequency': 'hourly',
            'data[0]': 'value',
            'facets[type][]': 'D',  # Demand data
            'start': '2024-01-01',
            'end': '2024-03-01',
            'sort[0][column]': 'period',
            'sort[0][direction]': 'desc',
        }
        response = requests.get(eia_url, params=params, timeout=request_timeout_s)
        response.raise_for_status()  # Raise error for bad status
        eia_data = response.json()['response']['data']

        # Convert to DataFrame
        energy_df = pd.DataFrame(eia_data)
        if energy_df.empty:
            raise ValueError("EIA API returned no rows")
        energy_df['timestamp'] = pd.to_datetime(energy_df['period'])
        energy_df.set_index('timestamp', inplace=True)
        energy_df.rename(columns={'value': 'energy_demand'}, inplace=True)
        # JSON payloads may carry numbers as strings; force a numeric column so
        # the arithmetic below cannot fail or operate on text.
        energy_df['energy_demand'] = pd.to_numeric(energy_df['energy_demand'], errors='coerce')
        # NOTE(review): this endpoint presumably returns one row per reporting
        # region and hour, which would leave duplicate timestamps in the index
        # - confirm whether a region facet or an aggregation is wanted.

        # 2. Load Open-Meteo weather data
        print("Fetching weather data from Open-Meteo API...")
        weather_url = "https://archive-api.open-meteo.com/v1/archive"
        weather_params = {
            'latitude': 51.5074,  # London coordinates
            'longitude': -0.1278,
            'start_date': energy_df.index.min().strftime('%Y-%m-%d'),
            'end_date': energy_df.index.max().strftime('%Y-%m-%d'),
            'hourly': 'temperature_2m,relative_humidity_2m,wind_speed_10m'
        }
        weather_response = requests.get(weather_url, params=weather_params,
                                        timeout=request_timeout_s)
        weather_response.raise_for_status()
        weather_data = weather_response.json()['hourly']

        # Convert to DataFrame
        weather_df = pd.DataFrame(weather_data)
        weather_df['timestamp'] = pd.to_datetime(weather_df['time'])
        weather_df.set_index('timestamp', inplace=True)

        # 3. Merge datasets
        combined_data = pd.merge(energy_df, weather_df, left_index=True, right_index=True, how='inner')
        if combined_data.empty:
            # An empty join would otherwise be reported as usable "real" data
            raise ValueError("No overlapping timestamps between energy and weather data")

        # 4. Add simulated renewables and price
        print("Generating renewable and price features...")
        # Solar generation (simulated based on time and weather).
        # The sine is phased so output is positive between 06:00 and 18:00 and
        # peaks at noon. Both factors are clipped separately so two negative
        # terms can never multiply into spurious night-time output.
        hour_of_day = combined_data.index.hour.to_numpy()
        daylight = np.maximum(0, np.sin((hour_of_day - 6) * np.pi / 12))
        temperature_derating = np.maximum(0, 1 - combined_data['temperature_2m'] / 40)
        combined_data['solar_generation'] = daylight * 200 * temperature_derating

        # Wind generation (based on wind speed)
        combined_data['wind_generation'] = combined_data['wind_speed_10m'] * 15

        # Price simulation: base price, raised by above-average demand and
        # lowered by above-average renewable output, plus Gaussian noise
        demand = combined_data['energy_demand'].to_numpy(dtype=float)
        solar = combined_data['solar_generation'].to_numpy(dtype=float)
        wind = combined_data['wind_generation'].to_numpy(dtype=float)

        base_price = 50
        demand_factor = _zscore(demand) * 10
        renewable_factor = -_zscore(solar + wind) * 8
        combined_data['energy_price'] = (base_price + demand_factor + renewable_factor
                                         + np.random.normal(0, 5, len(demand)))

        # Add time features
        combined_data['hour'] = combined_data.index.hour
        combined_data['day_of_week'] = combined_data.index.dayofweek
        combined_data['month'] = combined_data.index.month
        combined_data['is_weekend'] = combined_data['day_of_week'] >= 5
        combined_data['is_business_hour'] = (combined_data['hour'] >= 8) & (combined_data['hour'] <= 18)
        combined_data['total_renewable'] = combined_data['solar_generation'] + combined_data['wind_generation']

        print("Successfully loaded real energy and weather data via APIs")
        return combined_data, "Real API data"

    except Exception as e:
        # Deliberately broad: any failure here only means "no API data", and
        # the caller then falls back to generated sample data.
        print(f"API loading failed: {str(e)}")
        return None, None

# I attempt to load processed energy data, creating comprehensive sample if needed.
# Resolution order: processed CSV -> live API data -> generated sample data.
try:
    # First try to load from the processed directory
    processed_file = '../../data/processed/test_cleaned_energy_data.csv'
    if os.path.exists(processed_file):
        energy_data = pd.read_csv(processed_file, parse_dates=[0], index_col=0)
        print("Successfully loaded processed energy dataset")
        data_source = "Processed pipeline data"
    else:
        raise FileNotFoundError("Processed file not found")
        
except FileNotFoundError:
    # Try loading from API
    api_result, api_source = load_energy_data()
    
    if api_result is not None:
        energy_data = api_result
        data_source = api_source
    else:
        print("Creating comprehensive sample energy data with quality issues for assessment...")
        
        # I generate realistic energy data with intentional quality issues for testing.
        # The global NumPy seed is fixed so the generated dataset is reproducible.
        np.random.seed(42)
        hours = 24 * 50  # 50 days for comprehensive quality assessment
        dates = pd.date_range(start='2024-01-01', periods=hours, freq='H')
        
        # I create base energy system data
        time_hours = np.arange(hours)
        
        # Base temperature pattern: seasonal sine + daily sine + Gaussian noise
        temperature = 15 + 10 * np.sin(2 * np.pi * dates.dayofyear / 365) + \
                      6 * np.sin((time_hours % 24 - 14) * 2 * np.pi / 24) + \
                      np.random.normal(0, 3, hours)
        
        # Solar generation with realistic patterns.
        # The half-sine is phased so elevation is positive from 06:00 to 18:00
        # and peaks at noon. (A phase of 12 would peak at 18:00 and keep
        # producing until 23:00, contradicting the night-time check below.)
        solar_elevation = np.maximum(0, np.sin((time_hours % 24 - 6) * np.pi / 12))
        cloud_factor = np.random.uniform(0.6, 1.0, hours)
        solar_generation = solar_elevation * cloud_factor * 180 + np.random.normal(0, 12, hours)
        solar_generation = np.maximum(0, solar_generation)
        
        # Wind generation: slow 8-day sine plus independent noise, floored at 10
        wind_base = 80 + 30 * np.sin(2 * np.pi * time_hours / (24 * 8))
        wind_noise = np.random.normal(0, 25, hours)
        wind_generation = wind_base + wind_noise
        wind_generation = np.maximum(10, wind_generation)
        
        # Energy demand with multiple components
        demand_base = 450
        daily_pattern = 200 * np.maximum(0, np.sin((time_hours % 24 - 6) * np.pi / 12))
        weekly_pattern = 80 * np.sin((time_hours % (24*7)) * 2 * np.pi / (24*7))
        heating_demand = np.maximum(0, (15 - temperature) * 12)
        cooling_demand = np.maximum(0, (temperature - 22) * 15)
        energy_demand = (demand_base + daily_pattern + weekly_pattern + 
                        heating_demand + cooling_demand + np.random.normal(0, 30, hours))
        energy_demand = np.maximum(250, energy_demand)
        
        # Natural gas generation following demand (fills the residual load)
        natural_gas_generation = np.maximum(0, energy_demand - solar_generation - wind_generation + 
                                           np.random.normal(0, 20, hours))
        
        # Grid frequency based on supply-demand balance
        supply_total = solar_generation + wind_generation + natural_gas_generation
        grid_frequency = 50.0 + (supply_total - energy_demand) * 0.0005 + np.random.normal(0, 0.02, hours)
        grid_frequency = np.clip(grid_frequency, 49.8, 50.2)
        
        # Energy price based on demand and supply: standardised demand raises
        # the price, standardised renewable output lowers it
        base_price = 50
        demand_factor = (energy_demand - np.mean(energy_demand)) / np.std(energy_demand) * 8
        renewable_factor = -((solar_generation + wind_generation) - np.mean(solar_generation + wind_generation)) / np.std(solar_generation + wind_generation) * 5
        energy_price = base_price + demand_factor + renewable_factor + np.random.normal(0, 3, hours)
        energy_price = np.maximum(0, energy_price)
        
        # I create the initial clean dataset
        energy_data = pd.DataFrame({
            'energy_demand': energy_demand,
            'solar_generation': solar_generation,
            'wind_generation': wind_generation,
            'natural_gas_generation': natural_gas_generation,
            'total_renewable': solar_generation + wind_generation,
            'total_generation': solar_generation + wind_generation + natural_gas_generation,
            'temperature': temperature,
            'grid_frequency': grid_frequency,
            'energy_price': energy_price,
            'hour': dates.hour,
            'day_of_week': dates.dayofweek,
            'month': dates.month,
            'is_weekend': dates.dayofweek >= 5
        }, index=dates)
        
        # I intentionally introduce various data quality issues for testing
        print("Introducing realistic data quality issues for assessment...")
        
        # 1. Missing data patterns (equipment failures, communication issues)
        # Random missing values (sensor failures)
        missing_indices_random = np.random.choice(energy_data.index, size=int(len(energy_data) * 0.02), replace=False)
        energy_data.loc[missing_indices_random, 'solar_generation'] = np.nan
        
        # Consecutive missing periods (maintenance windows).
        # Start points exclude the final 48 hours so each window fits in range.
        maintenance_start = np.random.choice(energy_data.index[:-48], size=3)
        for start_idx in maintenance_start:
            start_loc = energy_data.index.get_loc(start_idx)
            end_loc = min(start_loc + 24, len(energy_data) - 1)  # 24-hour maintenance
            energy_data.iloc[start_loc:end_loc, energy_data.columns.get_loc('wind_generation')] = np.nan
        
        # Weather station outages
        weather_outage_indices = np.random.choice(energy_data.index, size=int(len(energy_data) * 0.015), replace=False)
        energy_data.loc[weather_outage_indices, 'temperature'] = np.nan
        
        # 2. Outliers and anomalies
        # Extreme solar spikes (sensor malfunctions)
        spike_indices = np.random.choice(energy_data.index[energy_data['solar_generation'] > 0], size=5)
        energy_data.loc[spike_indices, 'solar_generation'] *= 3  # 300% spikes
        
        # Negative values (sensor calibration errors)
        negative_indices = np.random.choice(energy_data.index, size=8)
        energy_data.loc[negative_indices, 'wind_generation'] = -np.random.uniform(10, 50, len(negative_indices))
        
        # Extreme demand spikes (data transmission errors)
        demand_spike_indices = np.random.choice(energy_data.index, size=3)
        energy_data.loc[demand_spike_indices, 'energy_demand'] *= 2.5
        
        # Frequency outliers (grid events)
        freq_outlier_indices = np.random.choice(energy_data.index, size=6)
        energy_data.loc[freq_outlier_indices, 'grid_frequency'] = np.random.choice([49.5, 50.5], len(freq_outlier_indices))
        
        # 3. Data inconsistencies
        # Total generation not matching sum of components
        inconsistent_indices = np.random.choice(energy_data.index, size=12)
        energy_data.loc[inconsistent_indices, 'total_generation'] *= 0.7  # 30% underreporting
        
        # Impossible combinations (generation without solar irradiance)
        night_indices = energy_data[energy_data['hour'].isin([0, 1, 2, 3, 4, 5, 22, 23])].index
        impossible_solar_indices = np.random.choice(night_indices, size=4)
        energy_data.loc[impossible_solar_indices, 'solar_generation'] = np.random.uniform(50, 100, len(impossible_solar_indices))
        
        # 4. Timestamp issues
        # Duplicate timestamps (create a few duplicated rows).
        # The rows are selected as a DataFrame slice so every column keeps its
        # dtype. Building each duplicate via Series.to_frame().T would yield an
        # object-dtype frame and turn all columns into `object` on concat, and
        # would also fail if the same timestamp were drawn twice.
        duplicate_indices = np.random.choice(energy_data.index, size=3)
        duplicate_rows_frame = energy_data.loc[pd.DatetimeIndex(duplicate_indices)]
        energy_data = pd.concat([energy_data, duplicate_rows_frame])
        
        # I sort to handle duplicates properly
        energy_data = energy_data.sort_index()
        
        data_source = "Generated sample data with intentional quality issues"

print(f"Data source: {data_source}")
print(f"Dataset shape: {energy_data.shape}")
print(f"Assessment period: {energy_data.index.min()} to {energy_data.index.max()}")

# I identify energy variables for quality assessment: any column whose name
# contains one of these keywords (case-insensitive)
quality_keywords = ('generation', 'demand', 'renewable', 'frequency', 'price', 'temperature')
energy_variables = [col for col in energy_data.columns
                    if any(keyword in str(col).lower() for keyword in quality_keywords)]

print(f"Energy variables for quality assessment: {len(energy_variables)}")
print(f"Variables: {energy_variables}")

# I calculate initial quality overview
print(f"\nINITIAL DATA QUALITY OVERVIEW:")
print("=" * 40)

total_cells = energy_data.shape[0] * energy_data.shape[1]
missing_cells = energy_data.isnull().sum().sum()
duplicate_rows = energy_data.index.duplicated().sum()

print(f"Dataset size: {energy_data.shape[0]:,} rows × {energy_data.shape[1]} columns")
print(f"Total data cells: {total_cells:,}")
print(f"Missing cells: {missing_cells:,} ({missing_cells/total_cells*100:.1f}%)")
print(f"Duplicate timestamps: {duplicate_rows}")

# I calculate variable-specific quality metrics.
# One record is collected per variable and the table is built in a single step.
# Every summary column therefore always exists (NaN where a statistic could not
# be computed), so the later 'Outlier_Percent' lookup cannot raise a KeyError.
summary_columns = [
    'Missing_Count', 'Missing_Percent', 'Non_Missing_Count',
    'Min_Value', 'Max_Value', 'Mean_Value', 'Std_Value',
    'Outlier_Count', 'Outlier_Percent',
]
quality_records = []

for var in energy_variables:
    raw_series = energy_data[var]
    # Statistics are computed on a numeric view so object-dtype columns
    # (e.g. as parsed from CSV) still yield usable numbers; values that are
    # not numeric become NaN and are excluded from the statistics.
    numeric_series = pd.to_numeric(raw_series, errors='coerce')

    # Basic quality metrics (missingness is measured on the raw column)
    missing_count = raw_series.isnull().sum()
    record = {
        'Missing_Count': missing_count,
        'Missing_Percent': (missing_count / len(raw_series)) * 100,
        'Non_Missing_Count': raw_series.count(),
    }

    # Data range and distribution
    valid_count = numeric_series.count()
    if valid_count > 0:
        record['Min_Value'] = numeric_series.min()
        record['Max_Value'] = numeric_series.max()
        record['Mean_Value'] = numeric_series.mean()
        record['Std_Value'] = numeric_series.std()

        # Outlier detection using IQR method: values beyond 1.5 * IQR from
        # the quartiles are counted (NaN never compares as an outlier)
        Q1 = numeric_series.quantile(0.25)
        Q3 = numeric_series.quantile(0.75)
        IQR = Q3 - Q1
        outlier_count = ((numeric_series < (Q1 - 1.5 * IQR)) | (numeric_series > (Q3 + 1.5 * IQR))).sum()
        record['Outlier_Count'] = outlier_count
        record['Outlier_Percent'] = (outlier_count / valid_count) * 100

    quality_records.append(record)

variable_quality_summary = pd.DataFrame(
    quality_records, index=energy_variables, columns=summary_columns
).astype(float)

print(f"\nVariable Quality Summary:")
print(variable_quality_summary.round(2))

# I calculate overall quality score
overall_quality_metrics = {
    'completeness_score': (1 - missing_cells / total_cells) * 100,
    'consistency_score': (1 - duplicate_rows / len(energy_data)) * 100,
    'average_outlier_rate': variable_quality_summary['Outlier_Percent'].mean()
}

# Weighted blend: 40% completeness, 30% timestamp uniqueness, 30% non-outlier rate
overall_quality_score = (overall_quality_metrics['completeness_score'] * 0.4 + 
                         overall_quality_metrics['consistency_score'] * 0.3 + 
                         (100 - overall_quality_metrics['average_outlier_rate']) * 0.3)

print(f"\nOVERALL QUALITY ASSESSMENT:")
print(f"  Data Completeness: {overall_quality_metrics['completeness_score']:.1f}%")
print(f"  Data Consistency: {overall_quality_metrics['consistency_score']:.1f}%")
print(f"  Average Outlier Rate: {overall_quality_metrics['average_outlier_rate']:.1f}%")
print(f"  Overall Quality Score: {overall_quality_score:.1f}%")

if np.isnan(overall_quality_score):
    # A NaN score means a component could not be computed (e.g. no assessable
    # variables); report that explicitly rather than grading it as "Poor".
    quality_assessment = "Undetermined - no assessable data"
elif overall_quality_score >= 90:
    quality_assessment = "Excellent - Suitable for advanced modeling"
elif overall_quality_score >= 80:
    quality_assessment = "Good - Minor preprocessing needed"
elif overall_quality_score >= 70:
    quality_assessment = "Acceptable - Moderate preprocessing required"
else:
    quality_assessment = "Poor - Significant preprocessing required"

print(f"  Quality Assessment: {quality_assessment}")

# Cell 3: Create necessary directories
"""
I ensure required directories exist for saving plots and results
"""
import os

# Create directories if they don't exist
results_dir = '../../results'
plots_dir = '../../results/plots'

for directory in [results_dir, plots_dir]:
    if not os.path.exists(directory):
        # exist_ok=True closes the gap between the existence check and the
        # creation: if the directory appears in between (another process, a
        # re-run cell), makedirs no longer raises FileExistsError.
        os.makedirs(directory, exist_ok=True)
        print(f"Created directory: {directory}")

print("Directory structure verified and created if needed.")

EnergyNexus Comprehensive Data Quality Assessment
Assessment started: 2025-07-05 00:09:01
Objective: Validate data quality for energy forecasting and optimization
API loading failed: Config file not found
Creating comprehensive sample energy data with quality issues for assessment...
Introducing realistic data quality issues for assessment...
Data source: Generated sample data with intentional quality issues
Dataset shape: (1203, 13)
Assessment period: 2024-01-01 00:00:00 to 2024-02-19 23:00:00
Energy variables for quality assessment: 9
Variables: ['energy_demand', 'solar_generation', 'wind_generation', 'natural_gas_generation', 'total_renewable', 'total_generation', 'temperature', 'grid_frequency', 'energy_price']

INITIAL DATA QUALITY OVERVIEW:
Dataset size: 1,203 rows × 13 columns
Total data cells: 15,639
Missing cells: 115 (0.7%)
Duplicate timestamps: 3

Variable Quality Summary:
                        Missing_Count  Missing_Percent  Non_Missing_Count  \
energy_demand             