In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# CO2 Emissions Prediction for Climate Action (SDG 13)\n",
    "## Machine Learning Project for Sustainable Development Goals\n",
    "\n",
    "**Objective**: Predict CO2 emissions using economic and demographic factors to support climate policy decisions."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Setup and Data Loading"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import required libraries\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.linear_model import LinearRegression, Ridge\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "from sklearn.metrics import mean_absolute_error, r2_score\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "import ipywidgets as widgets\n",
    "from IPython.display import display\n",
    "import joblib\n",
    "\n",
    "# Configure visualizations\n",
    "%matplotlib inline\n",
    "plt.style.use('ggplot')\n",
    "sns.set_palette('viridis')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the dataset: try the online copy first, then fall back to a local CSV\n",
    "url = \"https://raw.githubusercontent.com/owid/co2-data/master/owid-co2-data.csv\"\n",
    "local_path = \"owid-co2-data.csv\"\n",
    "\n",
    "try:\n",
    "    data = pd.read_csv(url)\n",
    "    print(\"Dataset loaded successfully from online source\")\n",
    "except Exception as e:\n",
    "    # Network failures surface as several exception types, so this stays broad\n",
    "    print(f\"Online loading failed: {e}\")\n",
    "    try:\n",
    "        data = pd.read_csv(local_path)\n",
    "        print(\"Loaded local dataset instead\")\n",
    "    except Exception:\n",
    "        # `except Exception` rather than a bare `except`, so a kernel interrupt\n",
    "        # is not reported as a data-loading problem; the error is still re-raised\n",
    "        print(\"Could not load dataset. Please check file path or internet connection.\")\n",
    "        raise"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Interactive Data Exploration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create interactive country selector\n",
    "DEFAULT_COUNTRY = 'Kenya'\n",
    "\n",
    "# dropna(): a missing country name is not a meaningful option\n",
    "country_list = data['country'].dropna().unique().tolist()\n",
    "\n",
    "# Dropdown rejects a `value` that is not one of `options`, so only preselect\n",
    "# the default when the loaded dataset actually contains it\n",
    "if DEFAULT_COUNTRY in country_list:\n",
    "    initial_country = DEFAULT_COUNTRY\n",
    "else:\n",
    "    initial_country = country_list[0] if country_list else None\n",
    "\n",
    "country_dropdown = widgets.Dropdown(\n",
    "    options=country_list,\n",
    "    value=initial_country,\n",
    "    description='Select Country:',\n",
    "    disabled=False\n",
    ")\n",
    "\n",
    "display(country_dropdown)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Filter data for the selected country; .copy() gives an independent frame\n",
    "selected_country = country_dropdown.value\n",
    "country_data = data[data['country'] == selected_country].copy()\n",
    "\n",
    "# Show basic stats\n",
    "first_year = country_data['year'].min()\n",
    "last_year = country_data['year'].max()\n",
    "print(f\"\\nData Overview for {selected_country}:\")\n",
    "print(f\"Time Range: {first_year} to {last_year}\")\n",
    "print(f\"Available Years: {len(country_data)}\")\n",
    "\n",
    "# Plot historical emissions on an explicit Axes\n",
    "fig, ax = plt.subplots(figsize=(12, 6))\n",
    "sns.lineplot(x='year', y='co2', data=country_data, marker='o', ax=ax)\n",
    "ax.set_title(f'Historical CO2 Emissions in {selected_country}')\n",
    "ax.set_xlabel('Year')\n",
    "# The OWID `co2` column is published in million tonnes, not kilotonnes\n",
    "ax.set_ylabel('CO2 Emissions (million tonnes)')\n",
    "ax.grid(True)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Feature Engineering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select features and target.\n",
    "# The OWID CO2 dataset has no 'energy_use' column; total energy use is\n",
    "# published as 'primary_energy_consumption'.\n",
    "ENERGY_COLUMN = 'primary_energy_consumption'\n",
    "base_features = ['year', 'gdp', 'population', ENERGY_COLUMN, 'cement_co2', 'coal_co2']\n",
    "target = 'co2'\n",
    "\n",
    "# Fail early with a readable message if the loaded file has a different schema\n",
    "missing_columns = [col for col in base_features + [target] if col not in country_data.columns]\n",
    "if missing_columns:\n",
    "    raise KeyError(f\"Dataset is missing required columns: {missing_columns}\")\n",
    "\n",
    "# Create cleaned dataset and add engineered features\n",
    "model_data = (\n",
    "    country_data[base_features + [target]]\n",
    "    .dropna()\n",
    "    .assign(\n",
    "        gdp_per_capita=lambda df: df['gdp'] / df['population'],\n",
    "        energy_intensity=lambda df: df[ENERGY_COLUMN] / df['gdp'],\n",
    "    )\n",
    "    # A zero GDP or population turns the ratios into inf/NaN, which the\n",
    "    # scaler and the models cannot handle, so those rows are dropped\n",
    "    .replace([np.inf, -np.inf], np.nan)\n",
    "    .dropna()\n",
    ")\n",
    "\n",
    "# Full feature list = raw columns + engineered ratios (built fresh, not appended in place)\n",
    "features = base_features + ['gdp_per_capita', 'energy_intensity']\n",
    "\n",
    "# Show correlation matrix\n",
    "fig, ax = plt.subplots(figsize=(10, 8))\n",
    "sns.heatmap(model_data.corr(), annot=True, cmap='coolwarm', center=0, ax=ax)\n",
    "ax.set_title('Feature Correlation Matrix')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Model Training & Evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prepare data\n",
    "X = model_data[features]\n",
    "y = model_data[target]\n",
    "\n",
    "# R² needs at least two held-out samples; ask for a few more rows than that\n",
    "# bare minimum so the metrics are not driven by one or two points\n",
    "MIN_ROWS = 10\n",
    "if len(model_data) < MIN_ROWS:\n",
    "    raise ValueError(\n",
    "        f\"Only {len(model_data)} complete rows available for {selected_country}; \"\n",
    "        f\"at least {MIN_ROWS} are needed to train and evaluate the models.\"\n",
    "    )\n",
    "\n",
    "# Train-test split\n",
    "# NOTE(review): this is a shuffled split over yearly data, so test years are\n",
    "# interleaved with training years - consider a chronological split\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X, y, test_size=0.2, random_state=42\n",
    ")\n",
    "\n",
    "# Scale features: fit on the training rows only, then apply to both sets\n",
    "scaler = StandardScaler()\n",
    "X_train_scaled = scaler.fit_transform(X_train)\n",
    "X_test_scaled = scaler.transform(X_test)\n",
    "\n",
    "# Initialize models\n",
    "models = {\n",
    "    \"Linear Regression\": LinearRegression(),\n",
    "    \"Ridge Regression\": Ridge(alpha=1.0),\n",
    "    \"Random Forest\": RandomForestRegressor(n_estimators=100, random_state=42)\n",
    "}\n",
    "\n",
    "# Train and evaluate each model on the same split\n",
    "results = {}\n",
    "for name, model in models.items():\n",
    "    model.fit(X_train_scaled, y_train)\n",
    "    y_pred = model.predict(X_test_scaled)\n",
    "    \n",
    "    results[name] = {\n",
    "        'model': model,\n",
    "        'mae': mean_absolute_error(y_test, y_pred),\n",
    "        'r2': r2_score(y_test, y_pred),\n",
    "        'predictions': y_pred\n",
    "    }\n",
    "    \n",
    "    print(f\"{name}:\")\n",
    "    print(f\"- MAE: {results[name]['mae']:.2f}\")\n",
    "    print(f\"- R²: {results[name]['r2']:.2f}\")\n",
    "    print(\"-\" * 30)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Results Visualization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Determine best model (highest R² on the held-out rows)\n",
    "best_model_name = max(results, key=lambda x: results[x]['r2'])\n",
    "best_result = results[best_model_name]\n",
    "best_r2 = best_result['r2']\n",
    "\n",
    "# Plot predictions vs actual; the dashed diagonal marks a perfect prediction\n",
    "lowest, highest = y_test.min(), y_test.max()\n",
    "fig, ax = plt.subplots(figsize=(12, 6))\n",
    "sns.scatterplot(x=y_test, y=best_result['predictions'], ax=ax)\n",
    "ax.plot([lowest, highest], [lowest, highest], 'k--')\n",
    "ax.set_title(f'Actual vs Predicted CO2 Emissions\\n({best_model_name}, R²={best_r2:.2f})')\n",
    "# The OWID `co2` column is published in million tonnes, not kilotonnes\n",
    "ax.set_xlabel('Actual Emissions (million tonnes)')\n",
    "ax.set_ylabel('Predicted Emissions (million tonnes)')\n",
    "ax.grid(True)\n",
    "plt.show()\n",
    "\n",
    "# Feature importance for tree-based models (linear models lack this attribute)\n",
    "if hasattr(best_result['model'], 'feature_importances_'):\n",
    "    importance = best_result['model'].feature_importances_\n",
    "    feat_importance = pd.DataFrame({\n",
    "        'Feature': features,\n",
    "        'Importance': importance\n",
    "    }).sort_values('Importance', ascending=False)\n",
    "    \n",
    "    fig, ax = plt.subplots(figsize=(10, 5))\n",
    "    sns.barplot(x='Importance', y='Feature', data=feat_importance, ax=ax)\n",
    "    ax.set_title('Feature Importance for CO2 Emissions Prediction')\n",
    "    fig.tight_layout()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Model Deployment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the winning estimator together with the scaler fitted for it;\n",
    "# both are needed to reproduce predictions later\n",
    "model_path = f'co2_model_{selected_country}.pkl'\n",
    "scaler_path = f'scaler_{selected_country}.pkl'\n",
    "\n",
    "for artifact, path in ((best_result['model'], model_path), (scaler, scaler_path)):\n",
    "    joblib.dump(artifact, path)\n",
    "\n",
    "print(f\"\\nSaved best model ({best_model_name}) and scaler for {selected_country}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Ethical Considerations & Conclusion"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Key Ethical Considerations:\n",
    "\n",
    "1. **Data Bias**: \n",
    "   - Developing countries often have less complete data\n",
    "   - Solution: Cross-validate with multiple data sources\n",
    "\n",
    "2. **Model Fairness**:\n",
    "   - Predictions should be tested across different economic contexts\n",
    "   - Solution: Build separate models for country groupings\n",
    "\n",
    "3. **Policy Impact**:\n",
    "   - Models should inform equitable climate policies\n",
    "   - Solution: Include social impact assessments\n",
    "\n",
    "4. **Transparency**:\n",
    "   - Decision-makers need to understand model limitations\n",
    "   - Solution: Provide clear documentation and uncertainty estimates"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Future Enhancements:\n",
    "\n",
    "- Add real-time data integration through APIs\n",
    "- Build interactive dashboard for policymakers\n",
    "- Include climate justice metrics in predictions\n",
    "- Expand to predict emission reduction scenarios"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}