### Data Importation 

In [1]:
import pandas as pd
import numpy as np
import requests
from io import BytesIO

# Define the analysis period for filtering
START_YEAR = 2015
END_YEAR = 2025

# Attempt to load a static GHG dataset robustly
# (Using the stable alternative since the signed URL can expire)
alt_url = "https://raw.githubusercontent.com/datasets/co2-fossil-global/master/global.csv"

try:
    # Use the stable public dataset (global.csv)
    ghg_data = pd.read_csv(alt_url)
    print(f"Loaded GHG dataset from {alt_url}")
except Exception as e_alt:
    print(f"Fallback source failed: {e_alt}. Creating a minimal sample dataset.")
    ghg_data = pd.DataFrame({
        'Country': ['USA', 'China', 'India', 'USA'],
        'Year': [2018, 2019, 2019, 2019],
        'Annual CO₂ emissions': [5000, 10000, 2500, 5100] # Use the actual column name from the fallback
    })

# Standardize columns to simplify downstream code
rename_dict = {}
for col in ghg_data.columns:
    if 'Country' in col or 'Entity' in col:
        rename_dict[col] = 'Country'
    elif 'Year' in col:
        rename_dict[col] = 'Year'
    elif 'emissions' in col or 'Emissions' in col:
        rename_dict[col] = 'Emissions_kt'

ghg_data.rename(columns=rename_dict, inplace=True)

# Keep only essential columns
if all(col in ghg_data.columns for col in ['Country', 'Year', 'Emissions_kt']):
    ghg_data = ghg_data[['Country', 'Year', 'Emissions_kt']].copy()
    
print("Initial GHG data columns:", ghg_data.columns.tolist())
print(f"Initial GHG data shape: {ghg_data.shape}")


Loaded GHG dataset from https://raw.githubusercontent.com/datasets/co2-fossil-global/master/global.csv
Initial GHG data columns: ['Year', 'Total', 'Gas Fuel', 'Liquid Fuel', 'Solid Fuel', 'Cement', 'Gas Flaring', 'Per Capita']
Initial GHG data shape: (260, 8)


# Data Cleaning and Filtering 

In [2]:
# --- Clean Data Types and Filter Years ---

# Ensure there is a usable emissions column: try to locate candidate columns and normalize to 'Emissions_kt'
if 'Emissions_kt' not in ghg_data.columns:
	# Prefer columns that contain 'annual' + 'co2' / 'emiss' or direct emission-like names
	candidates = []
	for col in ghg_data.columns:
		lc = str(col).lower()
		if 'annual' in lc and ('co2' in lc or 'co₂' in lc or 'emiss' in lc):
			candidates.append(col)
		elif 'emiss' in lc and 'per' not in lc:
			candidates.append(col)
		elif lc in ('co2', 'co2_kt', 'emissions', 'total_co2'):
			candidates.append(col)
	# fallback: any column containing 'co2' (may pick the most relevant available)
	if not candidates:
		any_co2 = [col for col in ghg_data.columns if 'co2' in str(col).lower() or 'co₂' in str(col).lower()]
		candidates = any_co2

	if candidates:
		chosen = candidates[0]
		ghg_data.rename(columns={chosen: 'Emissions_kt'}, inplace=True)
		print(f"Renamed '{chosen}' -> 'Emissions_kt' for cleaning.")
	else:
		# If nothing sensible was found, create the column to avoid KeyError (it will be NaN)
		ghg_data['Emissions_kt'] = np.nan
		print("No emissions-like column found; created 'Emissions_kt' with NaN values.")

# 1. Clean 'Emissions_kt' column
# Clean out common non-numeric characters before conversion
ghg_data['Emissions_kt'] = ghg_data['Emissions_kt'].astype(str).str.replace(',', '', regex=False).str.replace(' ', '', regex=False)
ghg_data['Emissions_kt'] = ghg_data['Emissions_kt'].replace(['-', '..', 'NA', 'None', ''], np.nan)
ghg_data['Emissions_kt'] = pd.to_numeric(ghg_data['Emissions_kt'], errors='coerce')

# 2. Clean 'Year' column
if 'Year' in ghg_data.columns:
	ghg_data['Year'] = pd.to_numeric(ghg_data['Year'], errors='coerce').astype('Int64')
else:
	# try common alternatives
	year_candidates = [c for c in ghg_data.columns if 'year' in str(c).lower()]
	if year_candidates:
		ghg_data.rename(columns={year_candidates[0]: 'Year'}, inplace=True)
		ghg_data['Year'] = pd.to_numeric(ghg_data['Year'], errors='coerce').astype('Int64')
		print(f"Renamed '{year_candidates[0]}' -> 'Year' for cleaning.")
	else:
		ghg_data['Year'] = pd.NA
		print("No 'Year' column found; created 'Year' with NA values.")

# 3. Drop genuine missing values and filter years
ghg_data.dropna(subset=['Year', 'Emissions_kt'], inplace=True)
if not ghg_data.empty:
	ghg_data = ghg_data[(ghg_data['Year'] >= START_YEAR) & (ghg_data['Year'] <= END_YEAR)].copy()
else:
	print("Warning: ghg_data is empty after dropping NaNs for 'Year' and 'Emissions_kt'.")

print("\n--- Step 2: Cleaned and Filtered GHG Data ---")
print(f"Remaining rows after cleanup: {len(ghg_data)}")
# to_markdown can fail if not available; guard with fallback
try:
	print(ghg_data.head().to_markdown(index=False))
except Exception:
	print(ghg_data.head())


No emissions-like column found; created 'Emissions_kt' with NaN values.

--- Step 2: Cleaned and Filtered GHG Data ---
Remaining rows after cleanup: 0
| Year   | Total   | Gas Fuel   | Liquid Fuel   | Solid Fuel   | Cement   | Gas Flaring   | Per Capita   | Emissions_kt   |
|--------|---------|------------|---------------|--------------|----------|---------------|--------------|----------------|


# Transformation to a Macro Carbon Factor 

In [6]:
# 1. Find the total global emissions for each year
global_emissions_annual = ghg_data.groupby('Year')['Emissions_kt'].sum().reset_index()

# 2. Calculate the annual growth rate (percentage change)
# This represents the yearly shock/change in global carbon output.
global_emissions_annual['GHG_Growth_Factor'] = global_emissions_annual['Emissions_kt'].pct_change()

# 3. Clean up the resulting factor series
macro_ghg_factor_annual = global_emissions_annual.set_index('Year')['GHG_Growth_Factor'].dropna()

print("\n--- Step 3: Macro Carbon Factor (Annual Growth Rate) ---")
print(macro_ghg_factor_annual.to_markdown(numalign="left", stralign="left"))



--- Step 3: Macro Carbon Factor (Annual Growth Rate) ---
| Year   | GHG_Growth_Factor   |
|--------|---------------------|


# Daily Alignment 

In [4]:
# --- Prepare to align with daily data ---

# 1. Convert the annual factor index (Year) to the last day of the year
# We use December 31st for the corresponding year.
ghg_factor_daily_index = pd.to_datetime(macro_ghg_factor_annual.index.astype(str) + '-12-31')

# 2. Create a daily series indexed by the year-end date
ghg_factor_daily = pd.Series(macro_ghg_factor_annual.values, index=ghg_factor_daily_index)

# 3. Reindex the series to cover the full range of your analysis, and use FFILL.
# FFILL (Forward Fill) carries the annual change forward until the next year's change is available.
FULL_DATE_RANGE = pd.date_range(start='2015-01-01', end='2025-10-31', freq='D')
ghg_factor_daily_aligned = ghg_factor_daily.reindex(FULL_DATE_RANGE).ffill()

# We need to trim off any NaNs remaining at the start (if the first year is missing a growth rate)
ghg_factor_daily_aligned.dropna(inplace=True)

print("\n--- Step 4: Daily Aligned GHG Factor (First 10 Days of 2016) ---")
print("Note how the value is constant until the next annual figure.")
print(ghg_factor_daily_aligned.head(10).to_markdown(numalign="left", stralign="left"))



--- Step 4: Daily Aligned GHG Factor (First 10 Days of 2016) ---
Note how the value is constant until the next annual figure.
| 0   |
|-----|


# Saved GHG_macro csv

In [5]:
macro_ghg_factor_annual.to_csv("ghg_macro.csv")
print("Saved ghg_macro.csv")

Saved ghg_macro.csv
