# Exploratory Data Analysis - Energy Datasets

**Project:** AI Final Year Project  
**Date:** December 2025  
**Objective:** Initial exploration of energy datasets to understand their characteristics and suitability for showcasing ML algorithms

## Datasets to Analyze:
1. `ENB2012_data.xlsx` - Energy Efficiency Dataset
2. `energydata_complete.csv` - Appliance Energy Prediction Dataset

## Goals:
- Understand dataset sizes and structure
- Check data quality (missing values, types)
- Identify suitable algorithms for each dataset
- Determine which datasets should be tracked in Git and which should be downloaded separately

## 1. Setup and Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path

# Pandas output: show every column and cap printed rows at 100
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)

# Plot appearance: seaborn 'whitegrid' style with a 12x6 default figure
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Confirm the environment by reporting the library versions in use
print("Libraries imported successfully!")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")

## 2. Check Dataset Sizes

First, let's check the file sizes to understand which datasets are large and should be handled separately from Git.

In [None]:
# Report the on-disk size of each dataset and whether it is small enough
# (under 10 MB) to be tracked in Git. Results are collected in `dataset_info`
# and summarised in the `df_sizes` table.
dataset_dir = Path('../datasets')
datasets = [
    'ENB2012_data.xlsx',
    'energydata_complete.csv',
]

# Check file sizes
print("=" * 70)
print("DATASET SIZE ANALYSIS")
print("=" * 70)

dataset_info = []
for dataset in datasets:
    filepath = dataset_dir / dataset
    # is_file() rather than exists(): a directory of the same name is not a dataset
    if filepath.is_file():
        size_bytes = filepath.stat().st_size
        size_mb = size_bytes / (1024 * 1024)
        git_track = "YES (< 10MB)" if size_mb < 10 else "NO (> 10MB) - Download separately"

        dataset_info.append({
            'Dataset': dataset,
            'Size (MB)': round(size_mb, 2),
            'Track in Git': git_track
        })

        print(f"\n📊 {dataset}")
        print(f"   Size: {size_mb:.2f} MB")
        print(f"   Git Tracking: {git_track}")
    else:
        # Missing files still get a row so the summary table lists every dataset
        print(f"\n❌ {dataset} - FILE NOT FOUND")
        dataset_info.append({
            'Dataset': dataset,
            'Size (MB)': 'N/A',
            'Track in Git': 'File not found'
        })

print("\n" + "=" * 70)

# Create summary DataFrame
df_sizes = pd.DataFrame(dataset_info)
print("\n📋 SUMMARY TABLE:")
print(df_sizes.to_string(index=False))

## 3. Load and Inspect Dataset 1: ENB2012_data.xlsx

Energy Efficiency Dataset - Building characteristics and energy consumption

In [None]:
# Load Dataset 1 into `df1` and print its shape, column names, first rows,
# dtypes and memory footprint. Later cells only run their analysis when `df1`
# exists, so a failed load is reported here instead of being raised.
try:
    df1 = pd.read_excel(dataset_dir / 'ENB2012_data.xlsx')

    print("=" * 70)
    print("DATASET 1: ENB2012_data.xlsx - Energy Efficiency")
    print("=" * 70)

    print(f"\n📐 Shape: {df1.shape[0]} rows × {df1.shape[1]} columns")
    print("\n📋 Columns:")
    for i, col in enumerate(df1.columns, 1):
        print(f"   {i}. {col}")

    print("\n🔍 First 5 rows:")
    display(df1.head())

    print("\n📊 Data Types:")
    display(df1.dtypes)

    print("\n💾 Memory Usage:")
    # Two decimals, matching how the Dataset 2 cell formats its memory figure
    print(f"{df1.memory_usage(deep=True).sum() / 1024:.2f} KB")

except FileNotFoundError:
    print("❌ ENB2012_data.xlsx not found in datasets folder")
except Exception as e:
    # Any other failure while loading or inspecting is printed, not raised
    print(f"❌ Error loading dataset: {e}")

In [None]:
# Dataset 1: descriptive statistics and a per-column missing-value report.
# Skipped entirely when `df1` was never created by the loading cell.
if 'df1' in locals():
    print("=" * 70)
    print("DATASET 1: BASIC STATISTICS")
    print("=" * 70)
    display(df1.describe())

    print("\n✅ Missing Values:")
    missing = df1.isnull().sum()
    if missing.sum() == 0:
        print("   No missing values found!")
    else:
        # Print the total first, as the Dataset 2 statistics cell does
        print(f"   Total missing values: {missing.sum()}")
        display(missing[missing > 0])

## 4. Load and Inspect Dataset 2: energydata_complete.csv

Appliance Energy Prediction Dataset - Time-series energy consumption data

In [None]:
# Load Dataset 2 into `df2` and print its shape, column names, first and last
# rows, dtypes and memory footprint. Later cells only run their analysis when
# `df2` exists, so a failed load is reported here instead of being raised.
try:
    df2 = pd.read_csv(dataset_dir / 'energydata_complete.csv')

    print("=" * 70)
    print("DATASET 2: energydata_complete.csv - Appliance Energy Prediction")
    print("=" * 70)

    print(f"\n📐 Shape: {df2.shape[0]} rows × {df2.shape[1]} columns")
    print("\n📋 Columns:")
    for i, col in enumerate(df2.columns, 1):
        print(f"   {i}. {col}")

    print("\n🔍 First 5 rows:")
    display(df2.head())

    print("\n🔍 Last 5 rows:")
    display(df2.tail())

    print("\n📊 Data Types:")
    display(df2.dtypes)

    print("\n💾 Memory Usage:")
    print(f"{df2.memory_usage(deep=True).sum() / (1024*1024):.2f} MB")

except FileNotFoundError:
    print("❌ energydata_complete.csv not found in datasets folder")
    print("   This file should be downloaded separately (see DATA_SOURCES.md)")
except Exception as e:
    # Any other failure while loading or inspecting is printed, not raised
    print(f"❌ Error loading dataset: {e}")

In [None]:
# Dataset 2: descriptive statistics and a per-column missing-value report.
# Skipped entirely when `df2` was never created by the loading cell.
if 'df2' in locals():
    print("=" * 70)
    print("DATASET 2: BASIC STATISTICS")
    print("=" * 70)
    display(df2.describe())

    print("\n✅ Missing Values:")
    missing = df2.isnull().sum()
    if missing.sum() == 0:
        print("   No missing values found!")
    else:
        print(f"   Total missing values: {missing.sum()}")
        # Show only the columns that actually contain gaps
        display(missing[missing > 0])

## 5. Compare Dataset Characteristics

In [None]:
# Side-by-side comparison of whichever datasets were loaded successfully:
# one summary row per dataset, shown as a table and as two bar charts.
def _profile_row(label, frame):
    """Build the comparison-table row for ``frame``, labelled ``label``."""
    numeric_part = frame.select_dtypes(include=[np.number])
    other_part = frame.select_dtypes(exclude=[np.number])
    footprint_mb = frame.memory_usage(deep=True).sum() / (1024*1024)
    return {
        'Dataset': label,
        'Rows': frame.shape[0],
        'Columns': frame.shape[1],
        'Numeric Cols': len(numeric_part.columns),
        'Categorical Cols': len(other_part.columns),
        'Missing Values': frame.isnull().sum().sum(),
        'Memory (MB)': round(footprint_mb, 2)
    }


comparison_data = []

# Only profile a dataset if its loading cell actually produced the frame
if 'df1' in locals():
    comparison_data.append(_profile_row('ENB2012_data.xlsx', df1))

if 'df2' in locals():
    comparison_data.append(_profile_row('energydata_complete.csv', df2))

if comparison_data:
    df_comparison = pd.DataFrame(comparison_data)
    banner = "=" * 90
    print(banner)
    print("DATASET COMPARISON")
    print(banner)
    display(df_comparison)

    # One panel per metric: (column, title, y-axis label, bar colours)
    panels = (
        ('Rows', 'Dataset Size (Number of Rows)', 'Number of Rows',
         ['#3498db', '#e74c3c']),
        ('Columns', 'Dataset Dimensions (Number of Columns)', 'Number of Columns',
         ['#2ecc71', '#f39c12']),
    )

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    for panel_axis, (metric, heading, y_label, palette) in zip(axes, panels):
        panel_axis.bar(df_comparison['Dataset'], df_comparison[metric], color=palette)
        panel_axis.set_title(heading, fontsize=14, fontweight='bold')
        panel_axis.set_ylabel(y_label)
        panel_axis.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

## 6. Correlation Analysis - Dataset 1

Understanding relationships between features helps us choose appropriate ML algorithms.

In [None]:
# Correlation analysis for Dataset 1: heatmap of the numeric columns plus a
# table of feature pairs whose absolute correlation exceeds 0.7.
# Skipped entirely when `df1` was never created by the loading cell.
if 'df1' in locals():
    print("CORRELATION ANALYSIS - Dataset 1")
    print("=" * 70)

    # Select only numeric columns
    numeric_cols = df1.select_dtypes(include=[np.number]).columns

    if len(numeric_cols) > 0:
        # Calculate correlation
        corr_matrix = df1[numeric_cols].corr()

        # Plot correlation heatmap
        plt.figure(figsize=(12, 10))
        sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm',
                    center=0, square=True, linewidths=1, cbar_kws={"shrink": 0.8})
        plt.title('Correlation Matrix - ENB2012 Energy Efficiency Dataset',
                  fontsize=16, fontweight='bold', pad=20)
        plt.tight_layout()
        plt.show()

        # Find highly correlated features (> 0.7 or < -0.7)
        print("\n🔥 Highly Correlated Feature Pairs (|correlation| > 0.7):")
        # Upper triangle only (k=1 skips the diagonal), so each unordered pair
        # is examined once and no column is paired with itself.
        upper_rows, upper_cols = np.triu_indices(len(corr_matrix.columns), k=1)
        high_corr = [
            {
                'Feature 1': corr_matrix.columns[i],
                'Feature 2': corr_matrix.columns[j],
                'Correlation': round(corr_matrix.iloc[i, j], 3)
            }
            for i, j in zip(upper_rows, upper_cols)
            if abs(corr_matrix.iloc[i, j]) > 0.7
        ]

        if high_corr:
            display(pd.DataFrame(high_corr))
        else:
            print("   No highly correlated pairs found.")
    else:
        print("No numeric columns found for correlation analysis.")

## 7. Correlation Analysis - Dataset 2

In [None]:
# Correlation analysis for Dataset 2: heatmap of the numeric columns, then the
# ten strongest absolute correlations for any of the first five numeric columns
# whose name suggests it is a target ('energy', 'appliance' or 'load').
# Skipped entirely when `df2` was never created by the loading cell.
if 'df2' in locals():
    print("CORRELATION ANALYSIS - Dataset 2")
    print("=" * 70)

    # Select only numeric columns
    numeric_cols = df2.select_dtypes(include=[np.number]).columns

    if len(numeric_cols) > 0:
        # Calculate correlation
        corr_matrix = df2[numeric_cols].corr()

        # With many features use a larger canvas and smaller annotation text
        if len(numeric_cols) > 15:
            fig_size = (16, 14)
            annot_size = 6
        else:
            fig_size = (12, 10)
            annot_size = 8

        plt.figure(figsize=fig_size)
        sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm',
                    center=0, square=True, linewidths=0.5, cbar_kws={"shrink": 0.8},
                    annot_kws={'size': annot_size})
        plt.title('Correlation Matrix - Appliance Energy Prediction Dataset',
                  fontsize=16, fontweight='bold', pad=20)
        plt.tight_layout()
        plt.show()

        # Show top correlations if there's a clear target variable
        print("\n📊 Correlation with potential target variables:")
        for col in numeric_cols[:5]:  # Check first 5 columns
            if 'energy' in col.lower() or 'appliance' in col.lower() or 'load' in col.lower():
                print(f"\nTop correlations with '{col}':")
                # Remove the column's own entry by label before ranking. Slicing
                # off position 0 after sorting is wrong when another feature
                # ties at 1.0 or when the self-correlation is NaN (sorted last).
                corr_with_target = (
                    corr_matrix[col]
                    .abs()
                    .drop(col)
                    .sort_values(ascending=False)
                    .head(10)
                )
                display(corr_with_target)
    else:
        print("No numeric columns found for correlation analysis.")

## 8. Initial Insights and ML Algorithm Suitability

Based on the exploratory analysis, let's determine which algorithms are suitable for each dataset.

In [None]:
# Print algorithm recommendations and headline facts for each dataset that was
# loaded, followed by the overall project strategy. Data-quality statements are
# printed only when the loaded data actually supports them.
print("=" * 90)
print("INITIAL INSIGHTS & ML ALGORITHM RECOMMENDATIONS")
print("=" * 90)

insights = []

if 'df1' in locals():
    print("\n📊 DATASET 1: ENB2012_data.xlsx")
    print("-" * 90)
    print("✅ Suitable for:")
    print("   • LINEAR REGRESSION - Predict heating/cooling loads from building features")
    print("   • POLYNOMIAL REGRESSION - Capture non-linear relationships")
    print("   • DECISION TREES - Handle feature interactions")
    print("   • RANDOM FOREST - Improve prediction accuracy")
    print("   • NEURAL NETWORKS (PyTorch) - Deep learning approach")
    print("   • CLUSTERING (K-means) - Group similar buildings")
    print("\n💡 Insights:")
    print(f"   • {df1.shape[0]} building samples")
    print(f"   • {df1.shape[1]} features")
    # Verify before claiming, as the Dataset 2 branch does
    if df1.isnull().sum().sum() == 0:
        print("   • Clean data - no missing values")
    if df1.select_dtypes(exclude=[np.number]).shape[1] == 0:
        print("   • All numeric features - ready for ML")
    print("   • Multiple targets possible (heating and cooling loads)")

if 'df2' in locals():
    print("\n\n📊 DATASET 2: energydata_complete.csv")
    print("-" * 90)
    print("✅ Suitable for:")
    print("   • LINEAR REGRESSION - Predict appliance energy consumption")
    print("   • LOGISTIC REGRESSION - Binary classification (high/low energy)")
    print("   • DECISION TREES - Capture temporal patterns")
    print("   • NEURAL NETWORKS (PyTorch) - Time-series modeling")
    print("   • CLUSTERING (K-means) - Identify energy consumption patterns")
    print("\n💡 Insights:")
    print(f"   • {df2.shape[0]} time-series observations")
    print(f"   • {df2.shape[1]} features (temperature, humidity, weather data)")
    if df2.isnull().sum().sum() == 0:
        print("   • Clean data - no missing values")
    print("   • Rich feature set for complex modeling")
    print("   • Can create classification problems from regression")

print("\n\n🎯 PROJECT STRATEGY:")
print("-" * 90)
print("To maximize marks, we can:")
print("1. Use BOTH datasets to showcase different problem types")
print("2. Dataset 1: Focus on regression algorithms and model comparison")
print("3. Dataset 2: Create both regression AND classification problems")
print("4. Apply clustering to both datasets for unsupervised learning")
print("5. Use PyTorch neural networks on at least one dataset")
print("6. Compare all algorithms using appropriate metrics (MSE, R², Accuracy, etc.)")
print("\n✅ This covers ALL algorithms learned in your course!")
print("=" * 90)

## 9. Save Summary for Project Documentation

In [None]:
# Build the `summary` dict for project documentation: one entry per dataset
# that was loaded, then print a completion message and the planned next steps.
summary = {
    'analysis_date': '2025-12-07',
    'datasets_analyzed': []
}

if 'df1' in locals():
    summary['datasets_analyzed'].append({
        'name': 'ENB2012_data.xlsx',
        'rows': df1.shape[0],
        'columns': df1.shape[1],
        # Plain int: the pandas sum is a NumPy integer, which json cannot serialise
        'missing_values': int(df1.isnull().sum().sum()),
        'suitable_algorithms': ['Linear Regression', 'Decision Trees', 'Neural Networks', 'K-means']
    })

if 'df2' in locals():
    summary['datasets_analyzed'].append({
        'name': 'energydata_complete.csv',
        'rows': df2.shape[0],
        'columns': df2.shape[1],
        # Plain int: the pandas sum is a NumPy integer, which json cannot serialise
        'missing_values': int(df2.isnull().sum().sum()),
        'suitable_algorithms': ['Linear Regression', 'Logistic Regression', 'Decision Trees', 'Neural Networks', 'K-means']
    })

print("✅ Analysis complete!")
print(f"   • {len(summary['datasets_analyzed'])} datasets analyzed")
print("   • Ready to proceed with ML model development")
print("\n📝 Next steps:")
print("   1. Create detailed preprocessing notebooks")
print("   2. Implement regression models")
print("   3. Implement classification models")
print("   4. Implement clustering analysis")
print("   5. Implement neural networks with PyTorch")
print("   6. Compare and evaluate all models")