In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# --- Configuration ---
# Input dataset and the artifact files this notebook writes to disk.
DATA_FILE = 'owid-co2-data.csv'
PREPROCESSED_DATA_FILE = 'preprocessed_data.csv'
MODEL_FILE = 'ghg_model.pkl'
SCALER_FILE = 'ghg_scaler.pkl'

# --- Step 1: Load the dataset ---
# Prefer the local copy of the CSV. If it is absent, download it once and cache
# it under DATA_FILE so that later runs read it from disk.
try:
    df = pd.read_csv(DATA_FILE)
    print(f"✅ Dataset '{DATA_FILE}' loaded successfully.")
except FileNotFoundError:
    print(f"❌ Error: '{DATA_FILE}' not found. Please ensure the file is in the correct directory.")
    # Fallback to downloading if not found. A failure here (e.g. no network)
    # propagates as a normal exception and stops the cell.
    url = "https://github.com/owid/co2-data/raw/master/owid-co2-data.csv"
    df = pd.read_csv(url)
    df.to_csv(DATA_FILE, index=False)
    print(f"✅ File downloaded and saved as '{DATA_FILE}'")
except Exception as e:
    print(f"❌ An error occurred while loading the data: {e}")
    # Re-raise instead of calling exit(): exit() is intended for interactive
    # shells, and inside a notebook it requests a kernel shutdown rather than
    # cleanly aborting the cell. Re-raising stops execution here and keeps the
    # original traceback visible.
    raise

# --- Step 2: Select relevant columns and sort data ---
# Keep the identifiers (country, year), population, gdp and the emission
# series used for total_ghg prediction.
columns_to_keep = [
    'country', 'year',
    'population', 'gdp',
    'total_ghg', 'co2', 'methane', 'nitrous_oxide',
]
# Work on an independent copy, ordered by year within each country and
# re-indexed from 0 so positional order matches the sort order.
df_processed = (
    df[columns_to_keep]
    .copy()
    .sort_values(by=['country', 'year'])
    .reset_index(drop=True)
)
print("✅ Relevant columns selected and data sorted by country and year.")

# --- Step 3: Handle missing values ---
def _fill_gaps(series):
    """Linearly interpolate one group's series, filling in both directions."""
    return series.interpolate(method='linear', limit_direction='both')


# Gaps are filled separately for every country, one column at a time, so a
# value is only ever derived from the same country's own series.
columns_to_interpolate = ['total_ghg', 'co2', 'methane', 'nitrous_oxide', 'population', 'gdp']
for column_name in columns_to_interpolate:
    per_country = df_processed.groupby('country')[column_name]
    df_processed[column_name] = per_country.transform(_fill_gaps)
print("✅ Missing values interpolated within each country group.")

# --- Step 4: Feature Engineering ---
# For every emission series add three lagged copies (<series>_lag1..3), each
# shifted by 1, 2 and 3 rows within its own country. Rows are expected to be
# sorted by country and year at this point, so a shift of k rows corresponds
# to k earlier records of the same country.
lag_features = ['total_ghg', 'co2', 'methane', 'nitrous_oxide']
max_lag = 3
for series_name in lag_features:
    # Group once per series; the three shifts below reuse the same grouping.
    series_by_country = df_processed.groupby('country')[series_name]
    for lag in range(1, max_lag + 1):
        df_processed[f'{series_name}_lag{lag}'] = series_by_country.shift(lag)
print("✅ Lagged features created for GHG, CO2, Methane, and Nitrous Oxide.")

# Shifting leaves NaN in the first rows of each country; discard every row that
# still contains a NaN in any column and renumber the index.
initial_rows_count = len(df_processed)
df_processed = df_processed.dropna().reset_index(drop=True)
print(f"✅ Dropped {initial_rows_count - len(df_processed)} rows with NaN values after lagging.")

# --- Step 5: Define features (X) and target (y) ---
# The target variable is 'total_ghg'
target_column = 'total_ghg'

# Input features for the model: current population and gdp, the three lags of
# the target, and the one-step lag of each component gas.
# Notes carried over from the original author:
#   * the list is meant to mirror the 'input_features' list of the companion
#     app.py (not visible here -- verify against that file);
#   * 'year' could be added as a feature but is left out to match app.py;
#   * the current-year 'total_ghg' is deliberately NOT a feature: it is the
#     target, so including it would be data leakage.
input_features = [
    'population', 'gdp',
    'total_ghg_lag1', 'total_ghg_lag2', 'total_ghg_lag3',
    'co2_lag1', 'methane_lag1', 'nitrous_oxide_lag1'
]

# Ensure all defined input features exist in the DataFrame
missing_features = [name for name in input_features if name not in df_processed.columns]
if missing_features:
    print(f"❌ Error: Missing features in preprocessed data: {missing_features}. Please check feature engineering steps.")
    # Raise instead of calling exit(): exit() is intended for interactive
    # shells and, in a notebook, requests a kernel shutdown rather than cleanly
    # aborting the cell. An exception stops execution at this point.
    raise ValueError(f"Missing features in preprocessed data: {missing_features}")

X = df_processed[input_features]
y = df_processed[target_column]

print("✅ Features (X) and target (y) defined.")

# --- Step 6: Split data into training and testing sets ---
# A random 80/20 split is used to align with the original app's logic.
# NOTE: for time-series data a time-based split (e.g. TimeSeriesSplit) is the
# stricter way to evaluate forecasting ability; with lagged features a random
# split lets neighbouring years of one country land on both sides.
#
# The split is stratified by country so each country is proportionally
# represented in both sets. 'country' is taken from df_processed, not from X
# (it is not a model input). Stratification needs at least two rows per
# country -- train_test_split raises ValueError otherwise -- so fall back to an
# unstratified split when that does not hold or the column is unavailable.
stratify_labels = None
if 'country' in df_processed.columns:
    country_sizes = df_processed['country'].value_counts()
    if len(country_sizes) > 0 and country_sizes.min() >= 2:
        stratify_labels = df_processed['country']
    else:
        print("⚠️ At least one country has fewer than 2 rows; splitting without stratification.")

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=stratify_labels,
)
print("✅ Data split into training and testing sets.")

# --- Step 7: Scale the features ---
# The scaler is fitted on the training split only; the same fitted scaler is
# then applied to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("✅ Features scaled using StandardScaler.")

# --- Step 8: Initialize and train the XGBoost Regressor model ---
# Hyperparameters are collected in one place so they are easy to read and tune.
# random_state is fixed so repeated runs use the same seed.
xgb_params = {
    'objective': 'reg:squarederror',
    'n_estimators': 1000,
    'learning_rate': 0.05,
    'max_depth': 5,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'random_state': 42,
    'n_jobs': -1,
}
model = XGBRegressor(**xgb_params)

# Train on the scaled training features against the total_ghg target.
model.fit(X_train_scaled, y_train)
print("✅ XGBoost Regressor model trained.")

# --- Step 9: Evaluate the model ---
# Score the held-out test split: R², mean absolute error, and root mean
# squared error (square root of the MSE).
y_pred = model.predict(X_test_scaled)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Assemble the report first, then emit it with a single print call.
evaluation_report = "\n".join([
    "\n--- Model Evaluation ---",
    f"✅ R² Score: {r2:.4f}",
    f"✅ MAE: {mae:.2f}",
    f"✅ RMSE: {rmse:.2f}",
])
print(evaluation_report)

# --- Step 10: Save the preprocessed data, model, and scaler ---
# Writes three files: the preprocessed frame as CSV, and the trained model and
# fitted scaler via joblib. Saving is best-effort: a failure is reported but
# does not raise. The completion message lives in the `else` branch so it is
# only shown when every file was written successfully.
try:
    df_processed.to_csv(PREPROCESSED_DATA_FILE, index=False)
    print(f"📁 Preprocessed data saved as: {PREPROCESSED_DATA_FILE}")

    joblib.dump(model, MODEL_FILE)
    print(f"📁 Model saved as: {MODEL_FILE}")

    joblib.dump(scaler, SCALER_FILE)
    print(f"📁 Scaler saved as: {SCALER_FILE}")

except Exception as e:
    print(f"❌ An error occurred while saving assets: {e}")

else:
    print("\n--- Setup Complete ---")
    print("You can now use these files with your Streamlit app.")


❌ Error: 'owid-co2-data.csv' not found. Please ensure the file is in the correct directory.
✅ File downloaded and saved as 'owid-co2-data.csv'
✅ Relevant columns selected and data sorted by country and year.
✅ Missing values interpolated within each country group.
✅ Lagged features created for GHG, CO2, Methane, and Nitrous Oxide.
✅ Dropped 18606 rows with NaN values after lagging.
✅ Features (X) and target (y) defined.
✅ Data split into training and testing sets.
✅ Features scaled using StandardScaler.
✅ XGBoost Regressor model trained.

--- Model Evaluation ---
✅ R² Score: 0.9338
✅ MAE: 30.91
✅ RMSE: 533.95
📁 Preprocessed data saved as: preprocessed_data.csv
📁 Model saved as: ghg_model.pkl
📁 Scaler saved as: ghg_scaler.pkl

--- Setup Complete ---
You can now use these files with your Streamlit app.


In [4]:
# Persist the trained model and the fitted scaler again. This cell relies on
# `model`, `scaler`, MODEL_FILE and SCALER_FILE from the training cell and
# writes to the same paths that cell already saved to.
artifacts_to_save = (
    (model, MODEL_FILE, f"📁 Model saved as: {MODEL_FILE}"),
    (scaler, SCALER_FILE, f"📁 Scaler saved as: {SCALER_FILE}"),
)
for artifact, destination, confirmation in artifacts_to_save:
    joblib.dump(artifact, destination)
    print(confirmation)

📁 Model saved as: ghg_model.pkl
📁 Scaler saved as: ghg_scaler.pkl
