## 1. Setup and Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Notebook-wide display configuration for pandas.
# Show every column, cap printed rows at 100, and render floats with 3 decimals.
display_options = {
    'display.max_columns': None,
    'display.max_rows': 100,
    'display.float_format': lambda value: '%.3f' % value,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Plot styling: seaborn grid theme first, then the matplotlib defaults for
# figure size and font size.
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (14, 6),
    'font.size': 10,
})

# Report library versions so the environment is visible in the output.
print("‚úÖ Libraries imported successfully!")
library_versions = [
    ("Pandas", pd.__version__),
    ("NumPy", np.__version__),
    ("Matplotlib", plt.matplotlib.__version__),
    ("Seaborn", sns.__version__),
]
for library_name, library_version in library_versions:
    print(f"{library_name} version: {library_version}")

## 2. Locate and Verify Solar Datasets

In [None]:
# Locate the solar datasets directory and print an inventory of the expected CSVs.


def count_data_rows(csv_path):
    """Return the number of data rows in a CSV file, i.e. physical lines minus the header.

    The file is opened in binary mode so the count neither depends on the
    platform's default text encoding nor fails on bytes that cannot be decoded.
    An empty file yields 0 instead of -1.

    This is a quick line count, not a CSV parse: a quoted field containing a
    line break would be counted as more than one row.

    Args:
        csv_path: Path (str or Path) of the file to count.

    Returns:
        int: line count minus one, never negative.
    """
    with open(csv_path, 'rb') as handle:
        line_count = sum(1 for _ in handle)
    return max(line_count - 1, 0)


# Candidate locations, checked in order; the first existing directory wins.
candidate_dirs = [
    Path('..') / 'datasets' / 'solar',
    Path('.') / 'datasets' / 'solar',
    Path('datasets') / 'solar',
    Path.cwd().parent / 'datasets' / 'solar',
]

# is_dir() is already False for paths that do not exist.
dataset_dir = next((d.resolve() for d in candidate_dirs if d.is_dir()), None)

if dataset_dir is None:
    # Fall back to the default location so later cells still have a path to
    # report in their error messages.
    dataset_dir = Path('../datasets/solar').resolve()
    print(f"‚ö†Ô∏è Solar datasets folder not found. Expected: {dataset_dir}")
else:
    print(f"‚úÖ Using solar datasets directory: {dataset_dir}")

# Expected files
expected_files = [
    'Plant_1_Generation_Data.csv',
    'Plant_1_Weather_Sensor_Data.csv',
    'Plant_2_Generation_Data.csv',
    'Plant_2_Weather_Sensor_Data.csv'
]

print("\n" + "="*80)
print("SOLAR DATASET INVENTORY")
print("="*80)

# One record per expected file; values are pre-formatted strings for display.
dataset_info = []
for fname in expected_files:
    fpath = dataset_dir / fname
    # is_file() rather than exists(): a directory with this name cannot be opened.
    if fpath.is_file():
        size_kb = fpath.stat().st_size / 1024
        row_count = count_data_rows(fpath)

        dataset_info.append({
            'Dataset': fname,
            'Rows': f"{row_count:,}",
            'Size (KB)': f"{size_kb:.2f}",
            'Status': '‚úÖ Found'
        })
        print(f"\n‚úÖ {fname}")
        print(f"   Rows: {row_count:,}")
        print(f"   Size: {size_kb:.2f} KB")
    else:
        dataset_info.append({
            'Dataset': fname,
            'Rows': 'N/A',
            'Size (KB)': 'N/A',
            'Status': '‚ùå Not Found'
        })
        print(f"\n‚ùå {fname} - NOT FOUND")

print("\n" + "="*80)
df_inventory = pd.DataFrame(dataset_info)
print("\nüìã SUMMARY TABLE:")
print(df_inventory.to_string(index=False))
print("\n" + "="*80)

## 3. Load Plant 1 Datasets

In [None]:
# Load Plant 1 Generation Data
# Reads the CSV, parses DATE_TIME, then prints a shape/dtype/statistics/missing-value
# overview. Any failure is reported as a message rather than raised.
banner = "=" * 80
print(banner)
print("LOADING PLANT 1 - GENERATION DATA")
print(banner)

try:
    df_p1_gen = pd.read_csv(dataset_dir / 'Plant_1_Generation_Data.csv')

    # DATE_TIME is parsed with an explicit day-first format (dd-mm-YYYY HH:MM)
    df_p1_gen['DATE_TIME'] = pd.to_datetime(df_p1_gen['DATE_TIME'], format='%d-%m-%Y %H:%M')

    n_rows, n_cols = df_p1_gen.shape
    print(f"\nüìê Shape: {n_rows:,} rows √ó {n_cols} columns")
    print(f"\nüìÖ Date Range: {df_p1_gen['DATE_TIME'].min()} to {df_p1_gen['DATE_TIME'].max()}")
    print(f"\nüìä Columns: {df_p1_gen.columns.tolist()}")

    # Each view is built lazily, right after its heading is printed
    overview_sections = (
        ("\nüîç First 5 rows:", lambda: df_p1_gen.head()),
        ("\nüìä Data Types:", lambda: df_p1_gen.dtypes),
        ("\nüìà Basic Statistics:", lambda: df_p1_gen.describe()),
    )
    for heading, build_view in overview_sections:
        print(heading)
        display(build_view())

    # Per-column null counts; only columns with at least one null are shown
    print("\n‚úÖ Missing Values Check:")
    missing_p1_gen = df_p1_gen.isna().sum()
    if missing_p1_gen.any():
        display(missing_p1_gen[missing_p1_gen > 0])
    else:
        print("   No missing values found!")

    print(f"\nüî¢ Unique Inverters (SOURCE_KEY): {df_p1_gen['SOURCE_KEY'].nunique()}")
    print(f"üî¢ Unique Plants: {df_p1_gen['PLANT_ID'].nunique()}")

except Exception as err:
    print(f"‚ùå Error loading Plant 1 Generation Data: {err}")

In [None]:
# Load Plant 1 Weather Data
# Reads the CSV, parses DATE_TIME, then prints a shape/dtype/statistics/missing-value
# overview. Any failure is reported as a message rather than raised.
print("="*80)
print("LOADING PLANT 1 - WEATHER SENSOR DATA")
print("="*80)

try:
    df_p1_weather = pd.read_csv(dataset_dir / 'Plant_1_Weather_Sensor_Data.csv')

    # Convert DATE_TIME to datetime (no explicit format: pandas infers it)
    df_p1_weather['DATE_TIME'] = pd.to_datetime(df_p1_weather['DATE_TIME'])

    print(f"\nüìê Shape: {df_p1_weather.shape[0]:,} rows √ó {df_p1_weather.shape[1]} columns")
    print(f"\nüìÖ Date Range: {df_p1_weather['DATE_TIME'].min()} to {df_p1_weather['DATE_TIME'].max()}")
    print(f"\nüìä Columns: {list(df_p1_weather.columns)}")

    print("\nüîç First 5 rows:")
    display(df_p1_weather.head())

    print("\nüìä Data Types:")
    display(df_p1_weather.dtypes)

    print("\nüìà Basic Statistics:")
    display(df_p1_weather.describe())

    # Per-column null counts; only columns with at least one null are shown
    print("\n‚úÖ Missing Values Check:")
    missing_p1_weather = df_p1_weather.isnull().sum()
    if missing_p1_weather.sum() == 0:
        print("   No missing values found!")
    else:
        display(missing_p1_weather[missing_p1_weather > 0])

    print(f"\nüî¢ Unique Weather Sensors: {df_p1_weather['SOURCE_KEY'].nunique()}")

    # Report the sampling interval measured from the data instead of asserting
    # a fixed value: the most common gap between consecutive distinct timestamps.
    timestamp_gaps = (
        df_p1_weather['DATE_TIME'].drop_duplicates().sort_values().diff().dropna()
    )
    if timestamp_gaps.empty:
        print("‚è±Ô∏è  Measurement Interval: N/A (fewer than two distinct timestamps)")
    else:
        interval_minutes = timestamp_gaps.mode().iloc[0].total_seconds() / 60
        print(f"‚è±Ô∏è  Measurement Interval: {interval_minutes:g} minutes")

except Exception as e:
    print(f"‚ùå Error loading Plant 1 Weather Data: {e}")

## 4. Load Plant 2 Datasets

In [None]:
# Load Plant 2 Generation Data
# Reads the CSV, parses DATE_TIME, then prints a shape/dtype/statistics/missing-value
# overview. Any failure is reported as a message rather than raised.
banner = "=" * 80
print(banner)
print("LOADING PLANT 2 - GENERATION DATA")
print(banner)

try:
    df_p2_gen = pd.read_csv(dataset_dir / 'Plant_2_Generation_Data.csv')

    # DATE_TIME is parsed without an explicit format (pandas infers it)
    df_p2_gen['DATE_TIME'] = pd.to_datetime(df_p2_gen['DATE_TIME'])

    n_rows, n_cols = df_p2_gen.shape
    print(f"\nüìê Shape: {n_rows:,} rows √ó {n_cols} columns")
    print(f"\nüìÖ Date Range: {df_p2_gen['DATE_TIME'].min()} to {df_p2_gen['DATE_TIME'].max()}")
    print(f"\nüìä Columns: {df_p2_gen.columns.tolist()}")

    # Each view is built lazily, right after its heading is printed
    overview_sections = (
        ("\nüîç First 5 rows:", lambda: df_p2_gen.head()),
        ("\nüìä Data Types:", lambda: df_p2_gen.dtypes),
        ("\nüìà Basic Statistics:", lambda: df_p2_gen.describe()),
    )
    for heading, build_view in overview_sections:
        print(heading)
        display(build_view())

    # Per-column null counts; only columns with at least one null are shown
    print("\n‚úÖ Missing Values Check:")
    missing_p2_gen = df_p2_gen.isna().sum()
    if missing_p2_gen.any():
        display(missing_p2_gen[missing_p2_gen > 0])
    else:
        print("   No missing values found!")

    print(f"\nüî¢ Unique Inverters (SOURCE_KEY): {df_p2_gen['SOURCE_KEY'].nunique()}")
    print(f"üî¢ Unique Plants: {df_p2_gen['PLANT_ID'].nunique()}")

except Exception as err:
    print(f"‚ùå Error loading Plant 2 Generation Data: {err}")

In [None]:
# Load Plant 2 Weather Data
# Reads the CSV, parses DATE_TIME, then prints a shape/dtype/statistics/missing-value
# overview. Any failure is reported as a message rather than raised.
print("="*80)
print("LOADING PLANT 2 - WEATHER SENSOR DATA")
print("="*80)

try:
    df_p2_weather = pd.read_csv(dataset_dir / 'Plant_2_Weather_Sensor_Data.csv')

    # Convert DATE_TIME to datetime (no explicit format: pandas infers it)
    df_p2_weather['DATE_TIME'] = pd.to_datetime(df_p2_weather['DATE_TIME'])

    print(f"\nüìê Shape: {df_p2_weather.shape[0]:,} rows √ó {df_p2_weather.shape[1]} columns")
    print(f"\nüìÖ Date Range: {df_p2_weather['DATE_TIME'].min()} to {df_p2_weather['DATE_TIME'].max()}")
    print(f"\nüìä Columns: {list(df_p2_weather.columns)}")

    print("\nüîç First 5 rows:")
    display(df_p2_weather.head())

    print("\nüìä Data Types:")
    display(df_p2_weather.dtypes)

    print("\nüìà Basic Statistics:")
    display(df_p2_weather.describe())

    # Per-column null counts; only columns with at least one null are shown
    print("\n‚úÖ Missing Values Check:")
    missing_p2_weather = df_p2_weather.isnull().sum()
    if missing_p2_weather.sum() == 0:
        print("   No missing values found!")
    else:
        display(missing_p2_weather[missing_p2_weather > 0])

    print(f"\nüî¢ Unique Weather Sensors: {df_p2_weather['SOURCE_KEY'].nunique()}")

    # Report the sampling interval measured from the data instead of asserting
    # a fixed value: the most common gap between consecutive distinct timestamps.
    timestamp_gaps = (
        df_p2_weather['DATE_TIME'].drop_duplicates().sort_values().diff().dropna()
    )
    if timestamp_gaps.empty:
        print("‚è±Ô∏è  Measurement Interval: N/A (fewer than two distinct timestamps)")
    else:
        interval_minutes = timestamp_gaps.mode().iloc[0].total_seconds() / 60
        print(f"‚è±Ô∏è  Measurement Interval: {interval_minutes:g} minutes")

except Exception as e:
    print(f"‚ùå Error loading Plant 2 Weather Data: {e}")

## 5. Merge Generation and Weather Data for Analysis

To perform steady-state estimation, we need to merge weather conditions with power output at matching timestamps.

In [None]:
# Merge Plant 1 datasets
# Sums the generation columns per timestamp, then inner-joins the weather
# readings on DATE_TIME. Any failure is reported as a message rather than raised.
banner = "=" * 80
print(banner)
print("MERGING PLANT 1 - GENERATION + WEATHER DATA")
print(banner)

try:
    # One row per DATE_TIME: every generation column is summed over the rows
    # sharing that timestamp.
    summed_columns = ['DC_POWER', 'AC_POWER', 'DAILY_YIELD', 'TOTAL_YIELD']
    df_p1_gen_agg = (
        df_p1_gen
        .groupby('DATE_TIME')
        .agg(dict.fromkeys(summed_columns, 'sum'))
        .reset_index()
    )

    # Inner join keeps only timestamps present in both frames
    weather_columns = ['DATE_TIME', 'AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE', 'IRRADIATION']
    df_p1_merged = df_p1_gen_agg.merge(
        df_p1_weather[weather_columns],
        on='DATE_TIME',
        how='inner',
    )

    merged_rows, merged_cols = df_p1_merged.shape
    print(f"\n‚úÖ Plant 1 Merged Dataset Created")
    print(f"   Shape: {merged_rows:,} rows √ó {merged_cols} columns")
    print(f"   Columns: {df_p1_merged.columns.tolist()}")

    print("\nüîç Sample of Merged Data:")
    display(df_p1_merged.head(10))

    print("\nüìà Statistics:")
    display(df_p1_merged.describe())

except Exception as err:
    print(f"‚ùå Error merging Plant 1 data: {err}")

In [None]:
# Merge Plant 2 datasets
# Sums the generation columns per timestamp, then inner-joins the weather
# readings on DATE_TIME. Any failure is reported as a message rather than raised.
banner = "=" * 80
print(banner)
print("MERGING PLANT 2 - GENERATION + WEATHER DATA")
print(banner)

try:
    # One row per DATE_TIME: every generation column is summed over the rows
    # sharing that timestamp.
    summed_columns = ['DC_POWER', 'AC_POWER', 'DAILY_YIELD', 'TOTAL_YIELD']
    df_p2_gen_agg = (
        df_p2_gen
        .groupby('DATE_TIME')
        .agg(dict.fromkeys(summed_columns, 'sum'))
        .reset_index()
    )

    # Inner join keeps only timestamps present in both frames
    weather_columns = ['DATE_TIME', 'AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE', 'IRRADIATION']
    df_p2_merged = df_p2_gen_agg.merge(
        df_p2_weather[weather_columns],
        on='DATE_TIME',
        how='inner',
    )

    merged_rows, merged_cols = df_p2_merged.shape
    print(f"\n‚úÖ Plant 2 Merged Dataset Created")
    print(f"   Shape: {merged_rows:,} rows √ó {merged_cols} columns")
    print(f"   Columns: {df_p2_merged.columns.tolist()}")

    print("\nüîç Sample of Merged Data:")
    display(df_p2_merged.head(10))

    print("\nüìà Statistics:")
    display(df_p2_merged.describe())

except Exception as err:
    print(f"‚ùå Error merging Plant 2 data: {err}")

## 6. Correlation Analysis - Plant 1

Analyze relationships between weather conditions and power output.

In [None]:
# Plant 1 Correlation Analysis
# Runs only when the merged frame exists (i.e. the merge cell succeeded).
if 'df_p1_merged' in locals():
    banner = "=" * 80
    print(banner)
    print("PLANT 1 - CORRELATION ANALYSIS")
    print(banner)

    # Pairwise correlations over the numeric columns only (DATE_TIME is left out)
    numeric_cols = df_p1_merged.select_dtypes(include=[np.number]).columns
    corr_matrix_p1 = df_p1_merged.loc[:, numeric_cols].corr()

    # Annotated heatmap, colour scale centred on zero
    heatmap_options = dict(
        annot=True,
        fmt='.3f',
        cmap='RdYlGn',
        center=0,
        square=True,
        linewidths=1,
        cbar_kws={"shrink": 0.8},
    )
    plt.figure(figsize=(12, 10))
    sns.heatmap(corr_matrix_p1, **heatmap_options)
    plt.title(
        'Correlation Matrix - Plant 1 (Weather vs Power Output)',
        fontsize=16,
        fontweight='bold',
        pad=20,
    )
    plt.tight_layout()
    plt.show()

    # AC_POWER column of the matrix, strongest positive correlation first
    print("\nüéØ Correlations with AC_POWER (Target Variable):")
    ac_power_corr = corr_matrix_p1['AC_POWER'].sort_values(ascending=False)
    display(ac_power_corr)

    print("\nüí° KEY INSIGHTS:")
    for driver in ('IRRADIATION', 'AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE', 'DC_POWER'):
        print(f"   ‚Ä¢ {driver} ‚Üí AC_POWER: {corr_matrix_p1.loc[driver, 'AC_POWER']:.3f}")

## 7. Correlation Analysis - Plant 2

In [None]:
# Plant 2 Correlation Analysis
# Runs only when the merged frame exists (i.e. the merge cell succeeded).
if 'df_p2_merged' in locals():
    banner = "=" * 80
    print(banner)
    print("PLANT 2 - CORRELATION ANALYSIS")
    print(banner)

    # Pairwise correlations over the numeric columns only (DATE_TIME is left out)
    numeric_cols = df_p2_merged.select_dtypes(include=[np.number]).columns
    corr_matrix_p2 = df_p2_merged.loc[:, numeric_cols].corr()

    # Annotated heatmap, colour scale centred on zero
    heatmap_options = dict(
        annot=True,
        fmt='.3f',
        cmap='RdYlGn',
        center=0,
        square=True,
        linewidths=1,
        cbar_kws={"shrink": 0.8},
    )
    plt.figure(figsize=(12, 10))
    sns.heatmap(corr_matrix_p2, **heatmap_options)
    plt.title(
        'Correlation Matrix - Plant 2 (Weather vs Power Output)',
        fontsize=16,
        fontweight='bold',
        pad=20,
    )
    plt.tight_layout()
    plt.show()

    # AC_POWER column of the matrix, strongest positive correlation first
    print("\nüéØ Correlations with AC_POWER (Target Variable):")
    ac_power_corr = corr_matrix_p2['AC_POWER'].sort_values(ascending=False)
    display(ac_power_corr)

    print("\nüí° KEY INSIGHTS:")
    for driver in ('IRRADIATION', 'AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE', 'DC_POWER'):
        print(f"   ‚Ä¢ {driver} ‚Üí AC_POWER: {corr_matrix_p2.loc[driver, 'AC_POWER']:.3f}")

## 8. Visualization - Power Output vs Weather Conditions

In [None]:
# Plant 1 - Power vs Irradiation
# 2x2 grid of scatter plots, each with AC_POWER on the y-axis.
if 'df_p1_merged' in locals():
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    # (x column, x-axis label, marker colour, title), listed in row-major panel order
    scatter_panels = [
        ('IRRADIATION', 'Irradiation (W/m¬≤)', 'blue',
         'Plant 1: AC Power vs Irradiation'),
        ('AMBIENT_TEMPERATURE', 'Ambient Temperature (¬∞C)', 'red',
         'Plant 1: AC Power vs Ambient Temperature'),
        ('MODULE_TEMPERATURE', 'Module Temperature (¬∞C)', 'green',
         'Plant 1: AC Power vs Module Temperature'),
        ('DC_POWER', 'DC Power (W)', 'purple',
         'Plant 1: DC Power vs AC Power'),
    ]

    for panel_ax, (x_column, x_label, colour, panel_title) in zip(axes.flat, scatter_panels):
        panel_ax.scatter(df_p1_merged[x_column], df_p1_merged['AC_POWER'],
                         alpha=0.5, s=10, c=colour)
        panel_ax.set_xlabel(x_label, fontsize=12)
        panel_ax.set_ylabel('AC Power (W)', fontsize=12)
        panel_ax.set_title(panel_title, fontsize=14, fontweight='bold')
        panel_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

In [None]:
# Plant 2 - Power vs Weather Conditions
# 2x2 grid of scatter plots, each with AC_POWER on the y-axis.
if 'df_p2_merged' in locals():
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    # (x column, x-axis label, marker colour, title), listed in row-major panel order
    scatter_panels = [
        ('IRRADIATION', 'Irradiation (W/m¬≤)', 'blue',
         'Plant 2: AC Power vs Irradiation'),
        ('AMBIENT_TEMPERATURE', 'Ambient Temperature (¬∞C)', 'red',
         'Plant 2: AC Power vs Ambient Temperature'),
        ('MODULE_TEMPERATURE', 'Module Temperature (¬∞C)', 'green',
         'Plant 2: AC Power vs Module Temperature'),
        ('DC_POWER', 'DC Power (W)', 'purple',
         'Plant 2: DC Power vs AC Power'),
    ]

    for panel_ax, (x_column, x_label, colour, panel_title) in zip(axes.flat, scatter_panels):
        panel_ax.scatter(df_p2_merged[x_column], df_p2_merged['AC_POWER'],
                         alpha=0.5, s=10, c=colour)
        panel_ax.set_xlabel(x_label, fontsize=12)
        panel_ax.set_ylabel('AC Power (W)', fontsize=12)
        panel_ax.set_title(panel_title, fontsize=14, fontweight='bold')
        panel_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

## 9. Time-Series Visualization (For Context Only)

While we treat this as steady-state estimation, visualizing temporal patterns helps understand data characteristics.

In [None]:
# Plant 1 - Time Series Overview (First 7 days)
# Three stacked line charts over the first week: AC power, irradiation, temperatures.
if 'df_p1_merged' in locals():
    # Keep rows strictly earlier than (earliest timestamp + 7 days)
    first_week_end = df_p1_merged['DATE_TIME'].min() + pd.Timedelta(days=7)
    df_p1_week = df_p1_merged[df_p1_merged['DATE_TIME'] < first_week_end]

    fig, axes = plt.subplots(3, 1, figsize=(16, 12))
    power_ax, irradiation_ax, temperature_ax = axes
    title_style = dict(fontsize=14, fontweight='bold')

    # Top panel: AC power
    power_ax.plot(df_p1_week['DATE_TIME'], df_p1_week['AC_POWER'], color='blue', linewidth=1)
    power_ax.set_ylabel('AC Power (W)', fontsize=12)
    power_ax.set_title('Plant 1: AC Power Output (First 7 Days)', **title_style)
    power_ax.grid(True, alpha=0.3)

    # Middle panel: irradiation
    irradiation_ax.plot(df_p1_week['DATE_TIME'], df_p1_week['IRRADIATION'], color='orange', linewidth=1)
    irradiation_ax.set_ylabel('Irradiation (W/m¬≤)', fontsize=12)
    irradiation_ax.set_title('Plant 1: Solar Irradiation (First 7 Days)', **title_style)
    irradiation_ax.grid(True, alpha=0.3)

    # Bottom panel: ambient and module temperature on shared axes
    temperature_series = (
        ('AMBIENT_TEMPERATURE', 'red', 'Ambient Temp'),
        ('MODULE_TEMPERATURE', 'green', 'Module Temp'),
    )
    for temperature_column, line_colour, legend_label in temperature_series:
        temperature_ax.plot(df_p1_week['DATE_TIME'], df_p1_week[temperature_column],
                            color=line_colour, linewidth=1, label=legend_label)
    temperature_ax.set_xlabel('Date & Time', fontsize=12)
    temperature_ax.set_ylabel('Temperature (¬∞C)', fontsize=12)
    temperature_ax.set_title('Plant 1: Temperature Readings (First 7 Days)', **title_style)
    temperature_ax.legend()
    temperature_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

## 10. Distribution Analysis

In [None]:
# Distribution plots for Plant 1
# 2x3 grid of 50-bin histograms, one per merged-data column.
if 'df_p1_merged' in locals():
    fig, axes = plt.subplots(2, 3, figsize=(18, 10))

    # (column, x-axis label, bar colour, title), listed in row-major panel order
    histogram_panels = [
        ('AC_POWER', 'AC Power (W)', 'blue', 'Distribution: AC Power'),
        ('DC_POWER', 'DC Power (W)', 'purple', 'Distribution: DC Power'),
        ('IRRADIATION', 'Irradiation (W/m¬≤)', 'orange', 'Distribution: Irradiation'),
        ('AMBIENT_TEMPERATURE', 'Ambient Temperature (¬∞C)', 'red', 'Distribution: Ambient Temperature'),
        ('MODULE_TEMPERATURE', 'Module Temperature (¬∞C)', 'green', 'Distribution: Module Temperature'),
        ('DAILY_YIELD', 'Daily Yield (Wh)', 'teal', 'Distribution: Daily Yield'),
    ]

    for panel_ax, (column, x_label, bar_colour, panel_title) in zip(axes.flat, histogram_panels):
        panel_ax.hist(df_p1_merged[column], bins=50, color=bar_colour, alpha=0.7, edgecolor='black')
        panel_ax.set_xlabel(x_label)
        panel_ax.set_ylabel('Frequency')
        panel_ax.set_title(panel_title)
        panel_ax.grid(True, alpha=0.3)

    plt.suptitle('Plant 1 - Feature Distributions', fontsize=16, fontweight='bold', y=1.00)
    plt.tight_layout()
    plt.show()

## 11. Summary and ML Algorithm Recommendations

In [None]:
# Print the EDA summary and the list of candidate ML algorithms.
# Needs both merged frames; otherwise only a "run previous cells" notice is shown.


def count_missing_values(*frames):
    """Return the total number of missing (null) cells across the given DataFrames.

    Args:
        *frames: Any number of pandas DataFrames.

    Returns:
        int: Sum of null cells over all frames; 0 when no frames are passed.
    """
    return int(sum(frame.isnull().sum().sum() for frame in frames))


print("="*90)
print("SUMMARY & MACHINE LEARNING ALGORITHM RECOMMENDATIONS")
print("="*90)

if 'df_p1_merged' in locals() and 'df_p2_merged' in locals():
    print("\nüìä DATASETS ANALYZED:")
    print("-"*90)
    print(f"‚úÖ Plant 1 Merged Data: {df_p1_merged.shape[0]:,} samples √ó {df_p1_merged.shape[1]} features")
    print(f"‚úÖ Plant 2 Merged Data: {df_p2_merged.shape[0]:,} samples √ó {df_p2_merged.shape[1]} features")
    
    print("\n\nüéØ PROBLEM FORMULATION: Steady-State Estimation")
    print("-"*90)
    print("Objective: Predict instantaneous AC power from instantaneous weather conditions")
    print("Mathematical Model: AC_POWER(t) = f(IRRADIATION(t), TEMP_AMBIENT(t), TEMP_MODULE(t))")
    print("\nThis is NOT time-series forecasting. We map current weather ‚Üí current power.")
    
    print("\n\nüß† SUITABLE ML ALGORITHMS:")
    print("-"*90)
    print("\n1Ô∏è‚É£  LINEAR REGRESSION")
    print("   Purpose: Baseline model for AC power prediction")
    print("   Features: IRRADIATION, AMBIENT_TEMPERATURE, MODULE_TEMPERATURE")
    print("   Target: AC_POWER")
    print("   Justification: Simple, interpretable, matches course material")
    
    print("\n2Ô∏è‚É£  POLYNOMIAL REGRESSION")
    print("   Purpose: Capture non-linear relationships (temperature effects)")
    print("   Features: Polynomial features (degree 2-3) of weather variables")
    print("   Target: AC_POWER")
    print("   Justification: Solar panels have non-linear efficiency curves")
    
    print("\n3Ô∏è‚É£  DECISION TREES / RANDOM FOREST")
    print("   Purpose: Handle feature interactions without manual engineering")
    print("   Features: All weather + DC_POWER")
    print("   Target: AC_POWER")
    print("   Justification: Can model complex decision boundaries")
    
    print("\n4Ô∏è‚É£  NEURAL NETWORKS (PyTorch)")
    print("   Purpose: Deep learning approach for power estimation")
    print("   Architecture: Input(3) ‚Üí Hidden(64) ‚Üí Hidden(32) ‚Üí Output(1)")
    print("   Features: IRRADIATION, AMBIENT_TEMPERATURE, MODULE_TEMPERATURE")
    print("   Target: AC_POWER")
    print("   Justification: Demonstrates modern AI techniques")
    
    print("\n5Ô∏è‚É£  K-MEANS CLUSTERING")
    print("   Purpose: Identify operational patterns (night/day/peak)")
    print("   Features: AC_POWER, IRRADIATION, TEMPERATURE")
    print("   Clusters: 3-5 (e.g., Night, Low-Sun, High-Sun, Peak)")
    print("   Justification: Unsupervised learning component")
    
    print("\n6Ô∏è‚É£  LOGISTIC REGRESSION (Classification)")
    print("   Purpose: Binary classification (High/Low power output)")
    print("   Features: Weather conditions")
    print("   Target: Power category (High = above median, Low = below median)")
    print("   Justification: Demonstrates classification capability")
    
    print("\n\nüìà EVALUATION METRICS:")
    print("-"*90)
    print("Regression Models: MSE, RMSE, MAE, R¬≤")
    print("Classification Models: Accuracy, Precision, Recall, F1-Score")
    print("Clustering: Silhouette Score, Inertia")
    
    print("\n\nüí° KEY INSIGHTS FROM EDA:")
    print("-"*90)
    # The Plant 1 correlation matrix only exists if the correlation cell ran.
    if 'corr_matrix_p1' in locals():
        print(f"‚Ä¢ IRRADIATION is strongly correlated with AC_POWER (r = {corr_matrix_p1.loc['IRRADIATION', 'AC_POWER']:.3f})")
    # NOTE(review): the next two bullets are static text, not derived from the data.
    print("‚Ä¢ Temperature has moderate correlation with power output")
    print("‚Ä¢ DC_POWER and AC_POWER are highly correlated (inverter efficiency)")
    # Only claim the data is clean when the merged frames really have no nulls.
    missing_cells = count_missing_values(df_p1_merged, df_p2_merged)
    if missing_cells == 0:
        print("‚Ä¢ Data is clean with no missing values")
    else:
        print(f"‚Ä¢ Merged data contains {missing_cells:,} missing values - handle them before modelling")
    print("‚Ä¢ Both plants have similar patterns, allowing model validation")
    
    print("\n\n‚úÖ NEXT STEPS:")
    print("-"*90)
    print("1. Create preprocessing pipeline (scaling, train-test split)")
    print("2. Implement Linear Regression baseline")
    print("3. Implement Polynomial Regression")
    print("4. Implement Decision Trees/Random Forest")
    print("5. Implement Neural Network (PyTorch)")
    print("6. Implement K-Means Clustering")
    print("7. Create classification labels and train Logistic Regression")
    print("8. Compare all models and generate final report")
    
    print("\n" + "="*90)
    print("EDA COMPLETE - Ready for Model Development!")
    print("="*90)

else:
    print("‚ö†Ô∏è Merged datasets not available. Please run previous cells.")

## 12. Save Processed Data for Model Training

In [None]:
# Save merged datasets for use in subsequent notebooks
# Each merged frame that exists is written as CSV (without the index) into the
# processed/ folder; the closing message reports what was actually written.
output_dir = Path('..') / 'datasets' / 'solar' / 'processed'
output_dir.mkdir(parents=True, exist_ok=True)

# Paths written in this run; drives the final status message.
saved_paths = []

if 'df_p1_merged' in locals():
    output_path_p1 = output_dir / 'plant1_merged.csv'
    df_p1_merged.to_csv(output_path_p1, index=False)
    saved_paths.append(output_path_p1)
    print(f"‚úÖ Plant 1 merged data saved: {output_path_p1}")

if 'df_p2_merged' in locals():
    output_path_p2 = output_dir / 'plant2_merged.csv'
    df_p2_merged.to_csv(output_path_p2, index=False)
    saved_paths.append(output_path_p2)
    print(f"‚úÖ Plant 2 merged data saved: {output_path_p2}")

# Announce success only when both frames were written.
if len(saved_paths) == 2:
    print("\nüéâ All processed datasets saved successfully!")
elif saved_paths:
    print(f"\n‚ö†Ô∏è Only {len(saved_paths)} of 2 merged datasets saved. Please run previous cells.")
else:
    print("\n‚ö†Ô∏è No merged datasets available - nothing was saved. Please run previous cells.")