# Exploratory Data Analysis (EDA) and CNN Implementation

This notebook demonstrates:
1. Basic Exploratory Data Analysis (EDA) techniques
2. Convolutional Neural Network (CNN) implementation
3. Practical example with plant emissions data

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

# Set style for better visualizations.
# 'seaborn-v0_8' only exists on matplotlib >= 3.6; older releases expose the
# same style as 'seaborn'. Pick whichever is installed instead of raising.
for style_name in ('seaborn-v0_8', 'seaborn'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
sns.set_palette("husl")

# Seed Python, NumPy and TensorFlow in one call so weight initialisation,
# dropout and batch shuffling are repeatable on Restart & Run All.
SEED = 42
keras.utils.set_random_seed(SEED)

# Display versions for reproducibility
print(f"NumPy version: {np.__version__}")
print(f"Pandas version: {pd.__version__}")
print(f"TensorFlow version: {tf.__version__}")

## Section 1: Data Loading and Initial Exploration

In [None]:
# Load the full emissions dataset; fall back to a synthetic sample only when
# the file itself cannot be opened.
file_path = 'Notebooks/Data/global_hybrid_plants_emissions_2000_2024.csv'

try:
    df = pd.read_csv(file_path)
    print("Dataset loaded successfully!")
except OSError as e:
    # Only a missing/unreadable file triggers the demo fallback. Parsing errors
    # in a real file are left to surface, so a corrupt dataset is never
    # silently replaced by fabricated numbers.
    print(f"Error loading file: {e}")
    # Create a sample dataset for demonstration
    np.random.seed(42)
    years = np.arange(2000, 2025)
    plants = ['Plant_A', 'Plant_B', 'Plant_C', 'Plant_D', 'Plant_E']

    # One record per (year, plant); the draw order below is kept fixed so the
    # seeded sample is identical from run to run.
    data = []
    for year in years:
        for plant in plants:
            data.append({
                'Year': year,
                'Plant_Name': plant,
                'CO2_Emissions': np.random.normal(1000, 200),
                'Methane_Emissions': np.random.normal(50, 10),
                'Nitrous_Oxide': np.random.normal(20, 5),
                'Energy_Production': np.random.normal(500, 100),
                'Efficiency_Rate': np.random.uniform(0.3, 0.9),
                'Category': np.random.choice(['High', 'Medium', 'Low'])
            })

    df = pd.DataFrame(data)
    print("Created sample dataset for demonstration")

# Display basic information about the dataset
print("\n=== Dataset Info ===")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\n=== First 5 rows ===")
display(df.head())

## Section 2: Basic Exploratory Data Analysis (EDA)

In [None]:
# Check for missing values: list only the affected columns, or say so
# explicitly when there are none (instead of printing an empty Series).
print("=== Missing Values ===")
missing_values = df.isnull().sum()
if missing_values.sum() == 0:
    print("No missing values found!")
else:
    print(missing_values[missing_values > 0])

# Basic statistics
print("\n=== Descriptive Statistics ===")
display(df.describe())

# Data types
print("\n=== Data Types ===")
display(df.dtypes)

In [None]:
# Distribution analysis for numerical columns
numerical_cols = df.select_dtypes(include=[np.number]).columns

print(f"=== Numerical columns: {list(numerical_cols)} ===")

# Size the grid from the number of columns so that no numeric feature is
# silently skipped (a fixed 2x3 grid only has room for six).
n_grid_cols = 3
n_grid_rows = max(1, int(np.ceil(len(numerical_cols) / n_grid_cols)))
fig, axes = plt.subplots(
    n_grid_rows, n_grid_cols,
    figsize=(15, 5 * n_grid_rows),
    squeeze=False
)
axes = axes.ravel()

# One histogram per numerical feature
for ax, col in zip(axes, numerical_cols):
    df[col].hist(bins=20, ax=ax, alpha=0.7)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

# Hide the leftover panels of the last row rather than showing empty axes
for ax in axes[len(numerical_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Categorical data analysis: bar charts of value counts for up to three
# object-typed columns.
categorical_cols = df.select_dtypes(include=['object']).columns
print(f"=== Categorical columns: {list(categorical_cols)} ===")

# Plot categorical distributions
if len(categorical_cols) > 0:
    n_panels = min(len(categorical_cols), 3)
    fig, axes = plt.subplots(1, n_panels, figsize=(15, 5))
    # A single panel comes back as a bare Axes; wrap it so it can be iterated
    if len(categorical_cols) == 1:
        axes = [axes]

    for ax, col in zip(axes, categorical_cols[:3]):
        counts = df[col].value_counts()
        counts.plot(kind='bar', ax=ax)
        ax.set_title(f'Distribution of {col}')
        ax.set_xlabel(col)
        ax.set_ylabel('Count')
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

In [None]:
# Correlation analysis
print("=== Correlation Matrix ===")
correlation_matrix = df[numerical_cols].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix of Numerical Features')
plt.tight_layout()
plt.show()

# Display strong correlations (|r| > 0.5).
# Keep only the strict upper triangle so each pair appears exactly once and
# the diagonal (a feature with itself) is excluded by position. Filtering on
# "value != 1" / drop_duplicates() would miss float noise on the diagonal and
# could discard genuinely distinct pairs that share the same coefficient.
upper_triangle = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
pairwise = correlation_matrix.where(upper_triangle).stack()
strong_correlations = pairwise[pairwise.abs() > 0.5]
if len(strong_correlations) > 0:
    print("\nStrong correlations found:")
    display(strong_correlations)
else:
    print("\nNo feature pairs with |r| > 0.5")

In [None]:
# Time series analysis (only runs when the data has a Year column)
if 'Year' in df.columns:
    print("=== Time Series Analysis ===")

    # Per-year mean of every numeric column
    yearly_data = df.groupby('Year').mean(numeric_only=True)

    # 2x2 panel of trend lines
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    axes = axes.ravel()

    # Core metrics, plus methane when the dataset provides it
    metrics = ['CO2_Emissions', 'Energy_Production', 'Efficiency_Rate']
    if 'Methane_Emissions' in yearly_data.columns:
        metrics.append('Methane_Emissions')

    # Each metric keeps its own panel position; a metric that is absent from
    # the data simply leaves its panel empty.
    for ax, metric in zip(axes, metrics[:4]):
        if metric not in yearly_data.columns:
            continue
        yearly_data[metric].plot(ax=ax, marker='o')
        ax.set_title(f'{metric} Trend Over Time')
        ax.set_xlabel('Year')
        ax.set_ylabel(metric)
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Year-over-year percentage change
    print("\nYear-over-Year Percentage Change:")
    yoy_change = yearly_data.pct_change() * 100
    display(yoy_change.round(2))

## Section 3: Data Preprocessing for CNN

In [None]:
# Prepare data for CNN implementation
print("=== Data Preprocessing for CNN ===")

# Create a classification target by binning the efficiency rate into three
# ordered classes. include_lowest=True keeps a rate of exactly 0 in the 'Low'
# bin instead of turning it into NaN.
df['Efficiency_Category'] = pd.cut(
    df['Efficiency_Rate'],
    bins=[0, 0.33, 0.66, 1],
    labels=['Low', 'Medium', 'High'],
    include_lowest=True
)

# Rows with a missing or out-of-range rate have no category. Encoding them
# would yield the label -1, which is not a valid class index for the loss,
# so they are excluded from modelling.
has_target = df['Efficiency_Category'].notna()
n_unlabelled = int((~has_target).sum())
if n_unlabelled > 0:
    print(f"Dropping {n_unlabelled} rows without a valid efficiency category")

# Encode categorical variables as integer codes
df_encoded = df.loc[has_target].copy()
for col in categorical_cols:
    if col != 'Efficiency_Category':
        df_encoded[col] = pd.Categorical(df_encoded[col]).codes

# Prepare features and target (the rate itself is dropped: the target is
# derived from it, so keeping it would leak the answer)
X = df_encoded.drop(['Efficiency_Category', 'Efficiency_Rate'], axis=1, errors='ignore')
y = df_encoded['Efficiency_Category']

# Encode target variable; category order is Low, Medium, High -> 0, 1, 2
y_encoded = pd.Categorical(y).codes

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Unique classes: {np.unique(y)}")

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Handle missing feature values using statistics from the training split only,
# so nothing about the test rows leaks into preprocessing.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

# Scale features (fit on train, apply to both)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"\nTraining set size: {X_train_scaled.shape[0]} samples")
print(f"Testing set size: {X_test_scaled.shape[0]} samples")

display(X_train_scaled[:5])

## Section 4: CNN Implementation

In [None]:
# Reshape data for CNN: each row of scaled features becomes a one-row,
# single-channel "image" of shape (1, n_features, 1).
n_features = X_train_scaled.shape[1]
height, width, channels = 1, n_features, 1

# Add the height and channel axes; -1 lets NumPy infer the sample count
cnn_shape = (-1, height, width, channels)
X_train_cnn = X_train_scaled.reshape(cnn_shape)
X_test_cnn = X_test_scaled.reshape(cnn_shape)

print(f"Reshaped training data: {X_train_cnn.shape}")
print(f"Reshaped testing data: {X_test_cnn.shape}")

# Define CNN model
def create_cnn_model(input_shape, num_classes):
    """Build the (uncompiled) convolutional classifier.

    Two Conv2D -> BatchNormalization -> MaxPooling2D blocks (32 then 64
    filters, 1x3 kernels, 1x2 pooling) are followed by Flatten, Dropout(0.5),
    two ReLU dense layers (64 and 32 units) and a softmax output layer.

    Args:
        input_shape: Shape of one sample, passed to ``layers.Input``.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A ``keras.Sequential`` model; compiling is left to the caller.
    """
    stack = [layers.Input(shape=input_shape)]

    # Convolutional blocks: same layout, only the filter count differs
    for n_filters in (32, 64):
        stack.extend([
            layers.Conv2D(n_filters, (1, 3), activation='relu', padding='same'),
            layers.BatchNormalization(),
            layers.MaxPooling2D((1, 2)),
        ])

    # Classification head
    stack.extend([
        layers.Flatten(),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu'),
        layers.Dense(32, activation='relu'),
        layers.Dense(num_classes, activation='softmax'),
    ])

    return keras.Sequential(stack)

# Create the model.
# Labels are integer codes, so the output layer needs (largest code + 1)
# units. Counting distinct labels undercounts whenever a class is absent from
# the data (e.g. only codes 1 and 2 present), which would leave an existing
# label outside the softmax range.
num_classes = int(np.max(y_train)) + 1
cnn_model = create_cnn_model((height, width, channels), num_classes)

# Compile the model; the sparse loss matches the integer-coded targets
cnn_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Display model architecture
print("=== CNN Model Architecture ===")
cnn_model.summary()

In [None]:
# Train the CNN model
print("=== Training CNN Model ===")

# Stop once validation loss has not improved for 10 epochs and roll back to
# the best weights seen.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

history = cnn_model.fit(
    X_train_cnn, y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
    callbacks=[early_stopping]
)

# Plot training history: accuracy on the left, loss on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

panels = (
    (ax1, 'accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy'),
    (ax2, 'loss', 'val_loss', 'Model Loss', 'Loss'),
)
for ax, train_key, val_key, title, y_label in panels:
    # Training curve first, validation second, matching the legend order
    ax.plot(history.history[train_key])
    ax.plot(history.history[val_key])
    ax.set_title(title)
    ax.set_ylabel(y_label)
    ax.set_xlabel('Epoch')
    ax.legend(['Train', 'Validation'], loc='upper left')

plt.tight_layout()
plt.show()

In [None]:
# Evaluate the model
print("=== Model Evaluation ===")

# Make predictions: take the most probable class per test sample
y_pred = cnn_model.predict(X_test_cnn)
y_pred_classes = np.argmax(y_pred, axis=1)

# Calculate accuracy
test_accuracy = np.mean(y_pred_classes == y_test)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Classification report.
# Passing labels explicitly ties each name to its integer code (0=Low,
# 1=Medium, 2=High). Without it, scikit-learn raises when fewer than three
# classes occur in y_test / predictions. zero_division=0 reports 0 for a
# class that is never predicted instead of emitting a warning.
print("\nClassification Report:")
class_names = ['Low', 'Medium', 'High']
print(classification_report(
    y_test,
    y_pred_classes,
    labels=list(range(len(class_names))),
    target_names=class_names,
    zero_division=0
))

In [None]:
# Confusion matrix
print("=== Confusion Matrix ===")
# Fix the label set so the matrix always has one row/column per class name,
# even when a class is missing from the test split or never predicted.
# Otherwise the tick labels would no longer line up with the cells.
cm = confusion_matrix(
    y_test,
    y_pred_classes,
    labels=list(range(len(class_names)))
)
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Feature importance analysis via permutation importance.
# The weights of a hidden dense layer have one row per hidden unit, not per
# input feature, so they cannot be paired with X.columns. Instead, shuffle one
# input feature at a time on the test set and measure how much accuracy drops:
# the larger the drop, the more the model relies on that feature.
print("=== Feature Analysis ===")

perm_rng = np.random.default_rng(42)
n_repeats = 5  # average over several shuffles to reduce noise

baseline_pred = np.argmax(cnn_model.predict(X_test_cnn, verbose=0), axis=1)
baseline_acc = np.mean(baseline_pred == y_test)

importance_scores = []
for feature_idx in range(X_test_cnn.shape[2]):
    accuracy_drops = []
    for _ in range(n_repeats):
        X_permuted = X_test_cnn.copy()
        # Axis 2 of the reshaped input indexes the original feature columns
        X_permuted[:, 0, feature_idx, 0] = perm_rng.permutation(
            X_permuted[:, 0, feature_idx, 0]
        )
        permuted_pred = np.argmax(cnn_model.predict(X_permuted, verbose=0), axis=1)
        accuracy_drops.append(baseline_acc - np.mean(permuted_pred == y_test))
    importance_scores.append(np.mean(accuracy_drops))

feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importance_scores
}).sort_values('Importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance.head(10), x='Importance', y='Feature')
plt.title('Top 10 Feature Importances')
plt.xlabel('Importance Score (mean accuracy drop when permuted)')
plt.tight_layout()
plt.show()

## Section 5: Advanced CNN Visualization

In [None]:
# Visualize intermediate layers
print("=== CNN Layer Visualization ===")

# Create a model that outputs the activations of every conv / pooling layer
layer_outputs = [layer.output for layer in cnn_model.layers if 'conv' in layer.name or 'pool' in layer.name]
activation_model = keras.models.Model(inputs=cnn_model.input, outputs=layer_outputs)

# Get activations for a test sample (slice keeps the batch dimension)
sample_idx = 0
sample = X_test_cnn[sample_idx:sample_idx+1]
activations = activation_model.predict(sample)

# Visualize first convolutional layer activations
first_conv_layer = activations[0]  # First conv layer output
print(f"First conv layer activation shape: {first_conv_layer.shape}")

# Plot some filters
n_filters_to_plot = min(16, first_conv_layer.shape[-1])
plt.figure(figsize=(12, 8))
for i in range(n_filters_to_plot):
    plt.subplot(4, 4, i+1)
    # Keep the height axis so each activation map stays 2-D (1 x width);
    # imshow rejects 1-D input. aspect='auto' stretches the one-row strip to
    # fill the panel so it is actually visible.
    plt.imshow(first_conv_layer[0, :, :, i], cmap='viridis', aspect='auto')
    plt.title(f'Filter {i+1}')
    plt.axis('off')
plt.tight_layout()
plt.suptitle('First Convolutional Layer Activations', fontsize=16, y=1.02)
plt.show()

## Section 6: Model Comparison and Analysis

In [None]:
# Compare with a simple Dense model
print("=== Model Comparison: Dense Network ===")

# Fully connected baseline working on the flat scaled features
dense_layers = [
    layers.Input(shape=(X_train_scaled.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(32, activation='relu'),
    layers.Dense(num_classes, activation='softmax'),
]
dense_model = keras.Sequential(dense_layers)

dense_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Train the Dense model with the same schedule and early stopping as the CNN
dense_early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)
dense_history = dense_model.fit(
    X_train_scaled, y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    verbose=0,
    callbacks=[dense_early_stopping]
)

# Evaluate both models on the held-out test set
cnn_test_loss, cnn_test_acc = cnn_model.evaluate(X_test_cnn, y_test, verbose=0)
dense_test_loss, dense_test_acc = dense_model.evaluate(X_test_scaled, y_test, verbose=0)

print(f"CNN Model - Test Accuracy: {cnn_test_acc:.4f}, Test Loss: {cnn_test_loss:.4f}")
print(f"Dense Model - Test Accuracy: {dense_test_acc:.4f}, Test Loss: {dense_test_loss:.4f}")

# Plot comparison: validation curves on the left, test accuracy on the right
fig, (curve_ax, bar_ax) = plt.subplots(1, 2, figsize=(10, 4))

curve_ax.plot(history.history['val_accuracy'], label='CNN')
curve_ax.plot(dense_history.history['val_accuracy'], label='Dense')
curve_ax.set_title('Validation Accuracy Comparison')
curve_ax.set_xlabel('Epoch')
curve_ax.set_ylabel('Validation Accuracy')
curve_ax.legend()

bar_ax.bar(['CNN', 'Dense'], [cnn_test_acc, dense_test_acc])
bar_ax.set_title('Test Accuracy Comparison')
bar_ax.set_ylabel('Test Accuracy')
bar_ax.set_ylim(0, 1)

plt.tight_layout()
plt.show()

## Section 7: Conclusions and Insights

In [None]:
# Summary and insights: a plain-text recap of the dataset, its quality and
# the two models' test performance.
print("=== EDA and CNN Implementation Summary ===")
print("\nKey Insights from EDA:")
print("1. Dataset Overview:")
print(f"   - Total samples: {len(df)}")
print(f"   - Number of features: {len(df.columns)}")
if 'Year' in df.columns:
    print(f"   - Time period: {df['Year'].min()} to {df['Year'].max()}")
else:
    print("   - No time period data available")

print("\n2. Data Quality:")
print(f"   - Missing values: {df.isnull().sum().sum()}")
print(f"   - Duplicate rows: {df.duplicated().sum()}")

print("\n3. Key Findings:")

# Headline statistics, reported only for columns the dataset actually has
if 'CO2_Emissions' in df.columns:
    co2_series = df['CO2_Emissions']
    avg_co2 = co2_series.mean()
    max_co2 = co2_series.max()
    min_co2 = co2_series.min()
    print(f"   - Average CO2 emissions: {avg_co2:.2f}")
    print(f"   - CO2 emissions range: {min_co2:.2f} - {max_co2:.2f}")

if 'Efficiency_Rate' in df.columns:
    avg_efficiency = df['Efficiency_Rate'].mean()
    print(f"   - Average efficiency rate: {avg_efficiency:.2%}")

# Absolute gap between the two test accuracies drives the closing remark
accuracy_gap = abs(cnn_test_acc - dense_test_acc)

print("\n4. Model Performance:")
print(f"   - CNN Test Accuracy: {cnn_test_acc:.4f}")
print(f"   - Dense Network Test Accuracy: {dense_test_acc:.4f}")
print(f"   - Performance difference: {accuracy_gap:.4f}")

if accuracy_gap >= 0.05:
    if cnn_test_acc > dense_test_acc:
        better_model = "CNN"
    else:
        better_model = "Dense Network"
    print(f"   - The {better_model} performed better for this classification task.")
else:
    print("   - Note: Both models performed similarly, suggesting the problem might")
    print("     be better suited for traditional machine learning approaches.")

print("\n5. Recommendations:")
recommendations = (
    "   - Consider feature engineering to improve model performance",
    "   - Experiment with different CNN architectures (deeper/wider)",
    "   - Try different activation functions and optimizers",
    "   - Implement cross-validation for more robust evaluation",
    "   - Consider ensemble methods to combine multiple models",
)
for recommendation in recommendations:
    print(recommendation)