Jupyter Notebook for exploratory scripts

In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Smart Grid Data Analysis\n",
    "\n",
    "This notebook analyzes the smart grid load monitoring data with simple visualizations.\n",
    "\n",
    "Dataset: [Smart Grid Real-Time Load Monitoring](https://www.kaggle.com/datasets/ziya07/smart-grid-real-time-load-monitoring-dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import os\n",
    "import sys\n",
    "\n",
    "# Add project root to path.\n",
    "# The membership check keeps this cell idempotent: running it again does not\n",
    "# stack duplicate '..' entries on sys.path.\n",
    "PROJECT_ROOT = '..'\n",
    "if PROJECT_ROOT not in sys.path:\n",
    "    sys.path.append(PROJECT_ROOT)\n",
    "\n",
    "# Plot appearance: matplotlib's default style with seaborn's 'husl' palette\n",
    "plt.style.use('default')\n",
    "sns.set_palette(\"husl\")\n",
    "\n",
    "print(\"📊 Smart Grid Data Analysis\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the smart grid data.\n",
    "# Stop here when the CSV is missing: every later cell reads `df`, so carrying on\n",
    "# would either raise a NameError or silently reuse a stale frame left in the kernel.\n",
    "data_path = \"../data/raw/smart_grid_data.csv\"\n",
    "\n",
    "if not os.path.exists(data_path):\n",
    "    print(\"❌ Data file not found. Please run the smart grid ingestion script first.\")\n",
    "    print(\"Run: poetry run python run.py smart_grid\")\n",
    "    raise FileNotFoundError(f\"Smart grid data not found at {data_path}\")\n",
    "\n",
    "df = pd.read_csv(data_path)\n",
    "# Parse timestamps so min()/max() and the time-axis plots work on datetimes\n",
    "df['timestamp'] = pd.to_datetime(df['timestamp'])\n",
    "print(f\"✅ Loaded {len(df)} records\")\n",
    "print(f\"📅 Date range: {df['timestamp'].min()} to {df['timestamp'].max()}\")\n",
    "print(f\"📊 Columns: {list(df.columns)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Load Profile Over Time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot 1: load (MW) against time, followed by its headline statistics\n",
    "load_series = df['load_mw']\n",
    "\n",
    "plt.figure(figsize=(12, 6))\n",
    "plt.plot(df['timestamp'], load_series, linewidth=1, alpha=0.7)\n",
    "plt.title('Smart Grid Load Profile Over Time', fontsize=14, fontweight='bold')\n",
    "plt.xlabel('Time')\n",
    "plt.ylabel('Load (MW)')\n",
    "plt.grid(True, alpha=0.3)\n",
    "plt.xticks(rotation=45)\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# Summary statistics, one line per aggregate\n",
    "load_stats = (\n",
    "    ('Average', load_series.mean()),\n",
    "    ('Maximum', load_series.max()),\n",
    "    ('Minimum', load_series.min()),\n",
    ")\n",
    "print(\"📈 Load Statistics:\")\n",
    "for stat_name, stat_value in load_stats:\n",
    "    print(f\"   • {stat_name}: {stat_value:.2f} MW\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Load Distribution by Category"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot 2: Load distribution by category\n",
    "# value_counts() leaves out rows with a missing category, and the pie normalises\n",
    "# by the sum of the counts it is given. The printed shares use that same total so\n",
    "# the text and the chart always agree.\n",
    "load_counts = df['load_category'].value_counts()\n",
    "counted_total = load_counts.sum()\n",
    "\n",
    "plt.figure(figsize=(10, 6))\n",
    "plt.pie(load_counts.values, labels=load_counts.index, autopct='%1.1f%%', startangle=90)\n",
    "plt.title('Load Distribution by Category', fontsize=14, fontweight='bold')\n",
    "plt.axis('equal')\n",
    "plt.show()\n",
    "\n",
    "print(\"📊 Load Category Distribution:\")\n",
    "for category, count in load_counts.items():\n",
    "    percentage = count / counted_total * 100\n",
    "    print(f\"   • {category}: {count} records ({percentage:.1f}%)\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Peak vs Off-Peak Hours Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot 3: Peak vs Off-peak hours comparison\n",
    "plt.figure(figsize=(10, 6))\n",
    "\n",
    "# Calculate average load by hour\n",
    "hourly_load = df.groupby('hour')['load_mw'].mean()\n",
    "\n",
    "# Create bar plot\n",
    "bars = plt.bar(hourly_load.index, hourly_load.values, alpha=0.7)\n",
    "\n",
    "# Highlight peak hours. Each bar is matched to its hour label rather than its\n",
    "# position, so the right bars turn red even when some hours are absent from the data.\n",
    "# NOTE(review): this list is independent of the `is_peak_hour` column used in the\n",
    "# summary below -- confirm both describe the same hours.\n",
    "peak_hours = [8, 9, 17, 18, 19]\n",
    "off_peak_bar = None\n",
    "peak_bar = None\n",
    "for hour, bar in zip(hourly_load.index, bars):\n",
    "    if hour in peak_hours:\n",
    "        bar.set_color('red')\n",
    "        bar.set_alpha(0.8)\n",
    "        if peak_bar is None:\n",
    "            peak_bar = bar\n",
    "    elif off_peak_bar is None:\n",
    "        off_peak_bar = bar\n",
    "\n",
    "plt.title('Average Load by Hour of Day', fontsize=14, fontweight='bold')\n",
    "plt.xlabel('Hour of Day')\n",
    "plt.ylabel('Average Load (MW)')\n",
    "plt.xticks(range(0, 24))\n",
    "plt.grid(True, alpha=0.3)\n",
    "\n",
    "# One legend entry per colour actually drawn, using a representative bar as the\n",
    "# handle. A bare list of labels would be paired with the single bar container,\n",
    "# so only the first label would ever be shown.\n",
    "legend_items = [\n",
    "    (bar, label)\n",
    "    for bar, label in ((off_peak_bar, 'Off-Peak'), (peak_bar, 'Peak Hours'))\n",
    "    if bar is not None\n",
    "]\n",
    "if legend_items:\n",
    "    legend_handles, legend_labels = zip(*legend_items)\n",
    "    plt.legend(legend_handles, legend_labels, loc='upper right')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# Summary\n",
    "# assumes `is_peak_hour` is a boolean column usable as a row mask -- TODO confirm\n",
    "peak_avg = df[df['is_peak_hour']]['load_mw'].mean()\n",
    "off_peak_avg = df[~df['is_peak_hour']]['load_mw'].mean()\n",
    "print(\"⚡ Peak vs Off-Peak Analysis:\")\n",
    "print(f\"   • Peak Hours Average: {peak_avg:.2f} MW\")\n",
    "print(f\"   • Off-Peak Hours Average: {off_peak_avg:.2f} MW\")\n",
    "print(f\"   • Difference: {peak_avg - off_peak_avg:.2f} MW\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Grid Performance Metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot 4: Grid performance metrics over time\n",
    "# One entry per panel, in row-major order: (column, panel title, y-axis label, line colour).\n",
    "# A colour of None leaves that line on the default colour cycle.\n",
    "panels = [\n",
    "    ('voltage_kv', 'Voltage (kV)', 'Voltage (kV)', None),\n",
    "    ('frequency_hz', 'Frequency (Hz)', 'Frequency (Hz)', 'orange'),\n",
    "    ('power_factor', 'Power Factor', 'Power Factor', 'green'),\n",
    "    ('grid_stability', 'Grid Stability', 'Stability Score', 'red'),\n",
    "]\n",
    "\n",
    "fig, axes = plt.subplots(2, 2, figsize=(15, 10))\n",
    "fig.suptitle('Grid Performance Metrics Over Time', fontsize=16, fontweight='bold')\n",
    "\n",
    "for ax, (column, title, ylabel, colour) in zip(axes.flat, panels):\n",
    "    line_style = {'alpha': 0.7}\n",
    "    if colour is not None:\n",
    "        line_style['color'] = colour\n",
    "    ax.plot(df['timestamp'], df[column], **line_style)\n",
    "    ax.set_title(title)\n",
    "    ax.set_ylabel(ylabel)\n",
    "    ax.grid(True, alpha=0.3)\n",
    "    # Rotate x-axis labels\n",
    "    ax.tick_params(axis='x', rotation=45)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# Performance summary: (label, column, number format, unit suffix)\n",
    "summary_rows = [\n",
    "    ('Average Voltage', 'voltage_kv', '.2f', ' kV'),\n",
    "    ('Average Frequency', 'frequency_hz', '.2f', ' Hz'),\n",
    "    ('Average Power Factor', 'power_factor', '.3f', ''),\n",
    "    ('Average Stability', 'grid_stability', '.3f', ''),\n",
    "]\n",
    "print(\"🔧 Grid Performance Summary:\")\n",
    "for label, column, spec, unit in summary_rows:\n",
    "    print(f\"   • {label}: {df[column].mean():{spec}}{unit}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Efficiency Score Distribution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot 5: Efficiency score distribution\n",
    "scores = df['efficiency_score']\n",
    "mean_score = scores.mean()\n",
    "\n",
    "plt.figure(figsize=(10, 6))\n",
    "plt.hist(scores, bins=30, alpha=0.7, edgecolor='black')\n",
    "plt.title('Distribution of Efficiency Scores', fontsize=14, fontweight='bold')\n",
    "plt.xlabel('Efficiency Score')\n",
    "plt.ylabel('Frequency')\n",
    "plt.grid(True, alpha=0.3)\n",
    "# Dashed marker at the mean, labelled with its value in the legend\n",
    "plt.axvline(mean_score, color='red', linestyle='--', label=f'Mean: {mean_score:.2f}')\n",
    "plt.legend()\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# Index labels of the rows holding the highest and lowest score;\n",
    "# their 'hour' values are reported below.\n",
    "best_row = scores.idxmax()\n",
    "worst_row = scores.idxmin()\n",
    "\n",
    "print(\"📊 Efficiency Score Analysis:\")\n",
    "print(f\"   • Average: {mean_score:.2f}\")\n",
    "print(f\"   • Best Score: {scores.max():.2f}\")\n",
    "print(f\"   • Worst Score: {scores.min():.2f}\")\n",
    "print(f\"   • Best Hour: {df.loc[best_row, 'hour']}:00\")\n",
    "print(f\"   • Worst Hour: {df.loc[worst_row, 'hour']}:00\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Summary\n",
    "\n",
    "This analysis shows:\n",
    "- **Load patterns** over the recorded period\n",
    "- **Load distribution** across load categories\n",
    "- **Peak vs off-peak** hour differences\n",
    "- **Grid performance** metrics over time\n",
    "- **Efficiency distribution** across the dataset\n",
    "\n",
    "The visualizations help identify optimal times for grid operations and areas for improvement."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.22"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

NameError: name 'null' is not defined